[Beignet] [PATCH V3] Backend: Refine block read/write instruction selection

Xiuli Pan xiuli.pan at intel.com
Fri Dec 16 09:49:56 UTC 2016


From: Pan Xiuli <xiuli.pan at intel.com>

Move the block pack/unpack into instruction selection so that it can be
optimized. Also rename some variables to avoid misleading names,
and add some new helper functions to the GenEncoder class.
V2: Use ud8grf instead of f8grf to save a retype.
V3: Merge the name change patch and fix some comments.

Signed-off-by: Pan Xiuli <xiuli.pan at intel.com>
---
 backend/src/backend/gen8_encoder.cpp       |  40 ++-
 backend/src/backend/gen_context.cpp        | 459 ++---------------------------
 backend/src/backend/gen_encoder.cpp        | 105 ++++---
 backend/src/backend/gen_encoder.hpp        |  18 +-
 backend/src/backend/gen_insn_selection.cpp | 439 ++++++++++++++++++++-------
 5 files changed, 432 insertions(+), 629 deletions(-)

diff --git a/backend/src/backend/gen8_encoder.cpp b/backend/src/backend/gen8_encoder.cpp
index 8f73346..39dcfd3 100644
--- a/backend/src/backend/gen8_encoder.cpp
+++ b/backend/src/backend/gen8_encoder.cpp
@@ -840,20 +840,15 @@ namespace gbe
     gen8_insn->bits3.gen8_block_rw_a64.header_present = 1;
   }
 
-  void Gen8Encoder::OBREADA64(GenRegister dst, GenRegister header, uint32_t bti, uint32_t size) {
-   GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
+  void Gen8Encoder::OBREADA64(GenRegister dst, GenRegister header, uint32_t bti, uint32_t ow_size) {
+    GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
     const uint32_t msg_length = 1;
-    uint32_t rsize = size / 2;
-    uint32_t msgsize = size;
-    // When size is 1 OWord, which means half a reg, we need to know which half to use
-    if (size == 1) {
-      if (dst.subnr == 0)
-        msgsize = 0;
-      else
-        msgsize = 1;
-    }
-    rsize = rsize == 0 ? 1 : rsize;
-    const uint32_t response_length = rsize; // Size is in regs
+    uint32_t sizeinreg = ow_size / 2;
+    // half reg should also have size 1
+    sizeinreg = sizeinreg == 0 ? 1 : sizeinreg;
+    const uint32_t block_size = getOBlockSize(ow_size, dst.subnr == 0);
+    const uint32_t response_length = sizeinreg; // Size is in reg
+
     this->setHeader(insn);
     this->setDst(insn, GenRegister::uw16grf(dst.nr, 0));
     this->setSrc0(insn, GenRegister::ud8grf(header.nr, 0));
@@ -861,21 +856,22 @@ namespace gbe
     setOBlockRWA64(this,
                    insn,
                    bti,
-                   msgsize,
+                   block_size,
                    GEN8_P1_BLOCK_READ_A64,
                    msg_length,
                    response_length);
 
   }
 
-  void Gen8Encoder::OBWRITEA64(GenRegister header, uint32_t bti, uint32_t size) {
-   GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
-    uint32_t rsize = size / 2;
-    rsize = rsize == 0 ? 1 : rsize;
-    const uint32_t msg_length = 1 + rsize; // Size is in owords
+  void Gen8Encoder::OBWRITEA64(GenRegister header, uint32_t bti, uint32_t ow_size) {
+    GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
+    uint32_t sizeinreg = ow_size / 2;
+    // half reg should also have size 1
+    sizeinreg = sizeinreg == 0 ? 1 : sizeinreg;
+    const uint32_t msg_length = 1 + sizeinreg; // Size is in reg and header
     const uint32_t response_length = 0;
-    uint32_t msgsize = size;
-    msgsize = msgsize == 1 ? 0 : msgsize;
+    const uint32_t block_size = getOBlockSize(ow_size);
+
     this->setHeader(insn);
     this->setSrc0(insn, GenRegister::ud8grf(header.nr, 0));
     this->setSrc1(insn, GenRegister::immud(0));
@@ -883,7 +879,7 @@ namespace gbe
     setOBlockRWA64(this,
                    insn,
                    bti,
-                   msgsize,
+                   block_size,
                    GEN8_P1_BLOCK_WRITE_A64,
                    msg_length,
                    response_length);
diff --git a/backend/src/backend/gen_context.cpp b/backend/src/backend/gen_context.cpp
index 8288fa5..791e607 100644
--- a/backend/src/backend/gen_context.cpp
+++ b/backend/src/backend/gen_context.cpp
@@ -3551,458 +3551,39 @@ namespace gbe
   }
 
   void GenContext::emitOBReadInstruction(const SelectionInstruction &insn) {
-    const GenRegister dst= ra->genReg(insn.dst(1));
-    const GenRegister addrreg = ra->genReg(insn.src(0));
-    uint32_t type = dst.type;
-    uint32_t typesize = typeSize(type);
-    const uint32_t vec_size = insn.extra.elem;
-    const GenRegister tmp = GenRegister::retype(ra->genReg(insn.dst(1 + vec_size)), type);
-    const uint32_t simdWidth = p->curr.execWidth;
-    const GenRegister header = GenRegister::retype(ra->genReg(insn.dst(0)), GEN_TYPE_UD);
-    const GenRegister addr = GenRegister::toUniform(addrreg, addrreg.type);
-    GenRegister headeraddr;
-    bool isA64 = insn.getbti() == 255;
+    const GenRegister header = ra->genReg(insn.src(0));
+    const GenRegister tmp = ra->genReg(insn.dst(0));
+    const uint32_t bti = insn.getbti();
+    const uint32_t ow_size = insn.extra.elem;
+    bool isA64 = bti == 255;
     if (isA64)
-      headeraddr = GenRegister::retype(GenRegister::offset(header, 0, 0), GEN_TYPE_UL);
+       p->OBREADA64(tmp, header, bti, ow_size);
     else
-      headeraddr = GenRegister::offset(header, 0, 2*4);
-
-    // Make header
-    p->push();
-    {
-      // Copy r0 into the header first
-      p->curr.execWidth = 8;
-      p->curr.predicate = GEN_PREDICATE_NONE;
-      p->curr.noMask = 1;
-      p->MOV(header, GenRegister::ud8grf(0, 0));
-
-      // Update the header with the current address
-      p->curr.execWidth = 1;
-      p->MOV(headeraddr, addr);
-
-      // Put zero in the general state base address
-      if (!isA64)
-        p->MOV(GenRegister::offset(header, 0, 5 * 4), GenRegister::immud(0));
-
-    }
-    p->pop();
-    // Now read the data, oword block read can only work with simd16 and no mask
-    if (vec_size == 1) {
-      p->push();
-      {
-        p->curr.execWidth = 16;
-        p->curr.noMask = 1;
-        if (isA64) {
-          //p->curr.execWidth = 8;
-          p->OBREADA64(dst, header, insn.getbti(), simdWidth * typesize / 16);
-        }
-        else
-          p->OBREAD(dst, header, insn.getbti(), simdWidth * typesize / 16);
-      }
-      p->pop();
-    } else if (vec_size == 2) {
-      p->push();
-      {
-        p->curr.execWidth = 16;
-        p->curr.noMask = 1;
-        if (isA64)
-          p->OBREADA64(tmp, header, insn.getbti(), simdWidth * typesize / 8);
-        else
-          p->OBREAD(tmp, header, insn.getbti(), simdWidth * typesize / 8);
-      }
-      p->pop();
-      p->MOV(ra->genReg(insn.dst(1)), GenRegister::offset(tmp, 0));
-      p->MOV(ra->genReg(insn.dst(2)), GenRegister::offset(tmp, 0, simdWidth * typesize ));
-    } else if (vec_size == 4) {
-      if (simdWidth == 8) {
-        p->push();
-        {
-          p->curr.execWidth = 16;
-          p->curr.noMask = 1;
-          if (isA64)
-            p->OBREADA64(tmp, header, insn.getbti(), 2 * typesize);
-          else
-            p->OBREAD(tmp, header, insn.getbti(), 2 * typesize);
-        }
-        p->pop();
-        for (uint32_t j = 0; j < 4; j++)
-          p->MOV(ra->genReg(insn.dst(1 + j)), GenRegister::offset(tmp, 0, j * simdWidth * typesize ));
-      } else {
-        for (uint32_t i = 0; i < typesize / 2; i++) {
-          if (i > 0) {
-            p->push();
-            {
-              // Update the address in header
-              p->curr.execWidth = 1;
-              p->ADD(headeraddr, headeraddr, GenRegister::immud(128));
-            }
-            p->pop();
-          }
-          if (isA64)
-            p->OBREADA64(tmp, header, insn.getbti(), 8);
-          else
-            p->OBREAD(tmp, header, insn.getbti(), 8);
-          for (uint32_t j = 0; j < 8 / typesize ; j++)
-            p->MOV(ra->genReg(insn.dst(1 + j + i * 2)), GenRegister::offset(tmp, 0 ,j * simdWidth * typesize ));
-        }
-      }
-    } else if (vec_size == 8) {
-      if (simdWidth == 8) {
-        for (uint32_t i = 0; i < typesize / 2; i++) {
-          if (i > 0) {
-            p->push();
-            {
-              // Update the address in header
-              p->curr.execWidth = 1;
-              p->ADD(headeraddr, headeraddr, GenRegister::immud(128));
-            }
-            p->pop();
-          }
-          p->push();
-          {
-            p->curr.execWidth = 16;
-            p->curr.noMask = 1;
-            if (isA64)
-              p->OBREADA64(tmp, header, insn.getbti(), 8);
-            else
-              p->OBREAD(tmp, header, insn.getbti(), 8);
-          }
-          p->pop();
-          for (uint32_t j = 0; j < 16 / typesize; j++)
-            p->MOV(ra->genReg(insn.dst(1 + j + i * 4)), GenRegister::offset(tmp, 0, j * simdWidth * typesize ));
-        }
-      } else {
-        for (uint32_t i = 0; i < typesize ; i++) {
-          if (i > 0) {
-            p->push();
-            {
-              // Update the address in header
-              p->curr.execWidth = 1;
-              p->ADD(headeraddr, headeraddr, GenRegister::immud(128));
-            }
-            p->pop();
-          }
-          if (isA64)
-            p->OBREADA64(tmp, header, insn.getbti(), 8);
-          else
-            p->OBREAD(tmp, header, insn.getbti(), 8);
-          for (uint32_t j = 0; j < 8 / typesize; j++)
-            p->MOV(ra->genReg(insn.dst(1 + j + i * 8 / typesize)), GenRegister::offset(tmp, 0 ,j * simdWidth * typesize ));
-        }
-      }
-    } else NOT_SUPPORTED;
+       p->OBREAD(tmp, header, bti, ow_size);
   }
 
   void GenContext::emitOBWriteInstruction(const SelectionInstruction &insn) {
-    const GenRegister addrreg = ra->genReg(insn.src(0));
-    const GenRegister header = GenRegister::retype(ra->genReg(insn.dst(0)), GEN_TYPE_UD);
-    uint32_t type = ra->genReg(insn.src(1)).type;
-    uint32_t typesize = typeSize(type);
-    const uint32_t vec_size = insn.extra.elem;
-    const GenRegister tmp = GenRegister::offset(header, 1);
-    const GenRegister addr = GenRegister::toUniform(addrreg, addrreg.type);
-    GenRegister headeraddr;
-    bool isA64 = insn.getbti() == 255;
+    const GenRegister header = ra->genReg(insn.src(0));
+    const uint32_t bti = insn.getbti();
+    const uint32_t ow_size = insn.extra.elem;
+    bool isA64 = bti == 255;
     if (isA64)
-      headeraddr = GenRegister::retype(GenRegister::offset(header, 0, 0), GEN_TYPE_UL);
+       p->OBWRITEA64(header, bti, ow_size);
     else
-      headeraddr = GenRegister::offset(header, 0, 2*4);
-    const uint32_t simdWidth = p->curr.execWidth;
-    uint32_t tmp_size = simdWidth * vec_size / 8;
-    tmp_size = tmp_size > 4 ? 4 : tmp_size;
-    uint32_t offset_size = isA64 ? 128 : 8;
-
-    p->push();
-      // Copy r0 into the header first
-      p->curr.execWidth = 8;
-      p->curr.predicate = GEN_PREDICATE_NONE;
-      p->curr.noMask = 1;
-      p->MOV(header, GenRegister::ud8grf(0,0));
-
-      // Update the header with the current address
-      p->curr.execWidth = 1;
-      if (isA64)
-        p->MOV(headeraddr, addr);
-      else
-        p->SHR(headeraddr, addr, GenRegister::immud(4));
-
-      // Put zero in the general state base address
-      if (!isA64)
-        p->MOV(GenRegister::offset(header, 0, 5*4), GenRegister::immud(0));
-
-    p->pop();
-    // Now write the data, oword block write can only work with simd16 and no mask
-    if (vec_size == 1) {
-      p->MOV(GenRegister::retype(tmp, type), ra->genReg(insn.src(1)));
-      p->push();
-      {
-        p->curr.execWidth = 16;
-        p->curr.noMask = 1;
-        if (isA64)
-          p->OBWRITEA64(header, insn.getbti(), simdWidth * typesize / 16);
-        else
-          p->OBWRITE(header, insn.getbti(), simdWidth * typesize / 16);
-      }
-      p->pop();
-    } else if (vec_size == 2) {
-      p->MOV(GenRegister::retype(GenRegister::offset(tmp, 0, 0), type), ra->genReg(insn.src(1)));
-      p->MOV(GenRegister::retype(GenRegister::offset(tmp, 0, simdWidth * typesize), type), ra->genReg(insn.src(2)));
-      p->push();
-      {
-        p->curr.execWidth = 16;
-        p->curr.noMask = 1;
-        if (isA64)
-          p->OBWRITEA64(header, insn.getbti(), simdWidth * typesize / 8);
-        else
-          p->OBWRITE(header, insn.getbti(), simdWidth * typesize / 8);
-      }
-      p->pop();
-    } else if (vec_size == 4) {
-      if (simdWidth == 8) {
-        for (uint32_t i = 0; i < 4; i++)
-          p->MOV(GenRegister::retype(GenRegister::offset(tmp, 0, i * simdWidth * typesize), type), ra->genReg(insn.src(1 + i)));
-        p->push();
-        {
-          p->curr.execWidth = 16;
-          p->curr.noMask = 1;
-          if (isA64)
-            p->OBWRITEA64(header, insn.getbti(), 2 * typesize);
-          else
-            p->OBWRITE(header, insn.getbti(), 2 * typesize);
-        }
-        p->pop();
-      } else {
-        for (uint32_t i = 0; i < typesize / 2; i++) {
-          for (uint32_t j = 0; j < 8 / typesize; j++)
-            p->MOV(GenRegister::retype(GenRegister::offset(tmp, 0, j * simdWidth * typesize), type), ra->genReg(insn.src(1 + j + i * 8 / typesize)));
-          if (i > 0) {
-            p->push();
-            {
-              // Update the address in header
-              p->curr.execWidth = 1;
-              p->ADD(headeraddr, headeraddr, GenRegister::immud(offset_size));
-            }
-            p->pop();
-          }
-          if (isA64)
-            p->OBWRITEA64(header, insn.getbti(), 8);
-          else
-            p->OBWRITE(header, insn.getbti(), 8);
-        }
-      }
-    } else if (vec_size == 8) {
-      if (simdWidth == 8) {
-        for (uint32_t i = 0; i < typesize / 2; i++) {
-          for (uint32_t j = 0; j < 16 / typesize; j++)
-            p->MOV(GenRegister::retype(GenRegister::offset(tmp, 0, j * simdWidth * typesize), type), ra->genReg(insn.src(1 + j + i * 16 / typesize)));
-          if (i > 0) {
-            p->push();
-            {
-              // Update the address in header
-              p->curr.execWidth = 1;
-              p->ADD(headeraddr, headeraddr, GenRegister::immud(offset_size));
-            }
-            p->pop();
-          }
-          p->push();
-          {
-            p->curr.execWidth = 16;
-            p->curr.noMask = 1;
-            if (isA64)
-              p->OBWRITEA64(header, insn.getbti(), 8);
-            else
-              p->OBWRITE(header, insn.getbti(), 8);
-          }
-          p->pop();
-        }
-      } else {
-        for (uint32_t i = 0; i < typesize; i++) {
-          for (uint32_t j = 0; j < 8 / typesize; j++)
-            p->MOV(GenRegister::retype(GenRegister::offset(tmp, 0, j * simdWidth * typesize), type), ra->genReg(insn.src(1 + j + i * 8 / typesize)));
-          if (i > 0) {
-            p->push();
-            {
-              // Update the address in header
-              p->curr.execWidth = 1;
-              p->ADD(headeraddr, headeraddr, GenRegister::immud(offset_size));
-            }
-            p->pop();
-          }
-          if (isA64)
-            p->OBWRITEA64(header, insn.getbti(), 8);
-          else
-            p->OBWRITE(header, insn.getbti(), 8);
-        }
-      }
-    } else NOT_SUPPORTED;
-
+       p->OBWRITE(header, bti, ow_size);
   }
 
   void GenContext::emitMBReadInstruction(const SelectionInstruction &insn) {
-    const GenRegister dst = ra->genReg(insn.dst(1));
-    const GenRegister coordx = GenRegister::toUniform(ra->genReg(insn.src(0)),GEN_TYPE_D);
-    const GenRegister coordy = GenRegister::toUniform(ra->genReg(insn.src(1)),GEN_TYPE_D);
-    const GenRegister header = GenRegister::retype(ra->genReg(insn.dst(0)), GEN_TYPE_UD);
-    const GenRegister offsetx = GenRegister::offset(header, 0, 0*4);
-    const GenRegister offsety = GenRegister::offset(header, 0, 1*4);
-    const GenRegister blocksizereg = GenRegister::offset(header, 0, 2*4);
-    size_t vec_size = insn.extra.elem;
-    uint32_t type = dst.type;
-    uint32_t typesize = typeSize(type);
-    uint32_t block_width = typesize * simdWidth;
-    uint32_t blocksize = (block_width - 1) % 32 | (vec_size - 1) << 16;
-
-    if (simdWidth == 8)
-    {
-      p->push();
-        // Copy r0 into the header first
-        p->curr.execWidth = 8;
-        p->curr.predicate = GEN_PREDICATE_NONE;
-        p->curr.noMask = 1;
-        p->MOV(header, GenRegister::ud8grf(0,0));
-
-        // Update the header with the coord
-        p->curr.execWidth = 1;
-        p->MOV(offsetx, coordx);
-        p->MOV(offsety, coordy);
-        // Update block width and height
-        p->MOV(blocksizereg, GenRegister::immud(blocksize));
-        p->curr.execWidth = 8;
-        // ushort in simd8 will have half reg, but response lenght is still 1
-        uint32_t rsize = vec_size * typesize / 4;
-        rsize = rsize ? rsize : 1;
-        // Now read the data
-        p->MBREAD(dst, header, insn.getbti(), rsize);
-      p->pop();
-
-    }
-    else if (simdWidth == 16)
-    {
-      const GenRegister tmp = GenRegister::retype(ra->genReg(insn.dst(vec_size + 1)), GEN_TYPE_UD);
-      p->push();
-        // Copy r0 into the header first
-        p->curr.execWidth = 8;
-        p->curr.predicate = GEN_PREDICATE_NONE;
-        p->curr.noMask = 1;
-        p->MOV(header, GenRegister::ud8grf(0,0));
-
-        // First half
-        // Update the header with the coord
-        p->curr.execWidth = 1;
-        p->MOV(offsetx, coordx);
-        p->MOV(offsety, coordy);
-        // Update block width and height
-        p->MOV(blocksizereg, GenRegister::immud(blocksize));
-        // Now read the data
-        p->curr.execWidth = 8;
-        p->MBREAD(tmp, header, insn.getbti(), vec_size);
-        for (uint32_t i = 0; i < vec_size; i++)
-          p->MOV(GenRegister::retype(ra->genReg(insn.dst(i + 1)),GEN_TYPE_UD), GenRegister::offset(tmp, i));
-
-        if (typesize == 4)
-        {
-          // Second half
-          // Update the header with the coord
-          p->curr.execWidth = 1;
-          p->ADD(offsetx, offsetx, GenRegister::immud(32));
-
-          // Now read the data
-          p->curr.execWidth = 8;
-          p->MBREAD(tmp, header, insn.getbti(), vec_size);
-
-          // Move the reg to fit vector rule.
-          for (uint32_t i = 0; i < vec_size; i++)
-            p->MOV(GenRegister::offset(ra->genReg(insn.dst(i + 1)), 1),
-                   GenRegister::offset(tmp, i));
-        }
-      p->pop();
-    } else NOT_IMPLEMENTED;
+    const GenRegister dst = ra->genReg(insn.dst(0));
+    const GenRegister header = ra->genReg(insn.src(0));
+    const size_t response_size = insn.extra.elem;
+    p->MBREAD(dst, header, insn.getbti(), response_size);
   }
 
   void GenContext::emitMBWriteInstruction(const SelectionInstruction &insn) {
-    const GenRegister coordx = GenRegister::toUniform(ra->genReg(insn.src(0)), GEN_TYPE_D);
-    const GenRegister coordy = GenRegister::toUniform(ra->genReg(insn.src(1)), GEN_TYPE_D);
-    const GenRegister header = GenRegister::retype(ra->genReg(insn.dst(0)), GEN_TYPE_UD);
-    const GenRegister tmp = GenRegister::offset(header, 1);
-    GenRegister offsetx, offsety, blocksizereg;
-    size_t vec_size = insn.extra.elem;
-    uint32_t type = ra->genReg(insn.src(2)).type;
-    uint32_t typesize = typeSize(type);
-    uint32_t block_width = typesize * simdWidth;
-    uint32_t blocksize = (block_width - 1) % 32 | (vec_size-1) << 16;
-
-    offsetx = GenRegister::offset(header, 0, 0*4);
-    offsety = GenRegister::offset(header, 0, 1*4);
-    blocksizereg = GenRegister::offset(header, 0, 2*4);
-
-    if (simdWidth == 8)
-    {
-      p->push();
-        // Copy r0 into the header first
-        p->curr.execWidth = 8;
-        p->curr.predicate = GEN_PREDICATE_NONE;
-        p->curr.noMask = 1;
-        p->MOV(header, GenRegister::ud8grf(0,0));
-
-        // Update the header with the coord
-        p->curr.execWidth = 1;
-        p->MOV(offsetx, coordx);
-        p->MOV(offsety, coordy);
-        // Update block width and height
-        p->MOV(blocksizereg, GenRegister::immud(blocksize));
-        p->curr.execWidth = 8;
-        // Mov what we need into msgs
-        for(uint32_t i = 0; i < vec_size; i++)
-          p->MOV(GenRegister::retype(GenRegister::offset(tmp, 0, i * block_width), type),
-                 ra->genReg(insn.src(2 + i)));
-        // ushort in simd8 will have half reg, but reponse lenght is still 1
-        uint32_t rsize = vec_size * typesize / 4;
-        rsize = rsize ? rsize : 1;
-        // Now read the data
-        p->MBWRITE(header, insn.getbti(), rsize);
-      p->pop();
-
-    }
-    else
-    {
-      p->push();
-        // Copy r0 into the header first
-        p->curr.execWidth = 8;
-        p->curr.predicate = GEN_PREDICATE_NONE;
-        p->curr.noMask = 1;
-        p->MOV(header, GenRegister::ud8grf(0,0));
-
-        // First half
-        // Update the header with the coord
-        p->curr.execWidth = 1;
-        p->MOV(offsetx, coordx);
-        p->MOV(offsety, coordy);
-        // Update block width and height
-        p->MOV(blocksizereg, GenRegister::immud(blocksize));
-        // Now read the data
-        p->curr.execWidth = 8;
-        // Mov what we need into msgs
-        for(uint32_t i = 0; i < vec_size; i++)
-          p->MOV(GenRegister::offset(tmp, i), GenRegister::retype(ra->genReg(insn.src(2 + i)), GEN_TYPE_UD));
-        p->MBWRITE(header, insn.getbti(), vec_size);
-
-        if (typesize == 4)
-        {
-          // Second half
-          // Update the header with the coord
-          p->curr.execWidth = 1;
-          p->ADD(offsetx, offsetx, GenRegister::immud(32));
-
-          p->curr.execWidth = 8;
-          // Mov what we need into msgs
-          for(uint32_t i = 0; i < vec_size; i++)
-            p->MOV(GenRegister::offset(header, 1 + i), GenRegister::Qn(ra->genReg(insn.src(2 + i)), 1));
-          // Now write the data
-          p->MBWRITE(header, insn.getbti(), vec_size);
-        }
-
-      p->pop();
-    }
+    const GenRegister header = ra->genReg(insn.dst(0));
+    const size_t data_size = insn.extra.elem;
+    p->MBWRITE(header, insn.getbti(), data_size);
   }
 
   BVAR(OCL_OUTPUT_REG_ALLOC, false);
diff --git a/backend/src/backend/gen_encoder.cpp b/backend/src/backend/gen_encoder.cpp
index 49d93e8..1bca668 100644
--- a/backend/src/backend/gen_encoder.cpp
+++ b/backend/src/backend/gen_encoder.cpp
@@ -257,32 +257,47 @@ namespace gbe
       NOT_SUPPORTED;
   }
 
-  static void setOBlockRW(GenEncoder *p,
-                          GenNativeInstruction *insn,
-                          uint32_t bti,
-                          uint32_t size,
-                          uint32_t msg_type,
-                          uint32_t msg_length,
-                          uint32_t response_length)
+  void GenEncoder::setOBlockRW(GenNativeInstruction *insn,
+                               uint32_t bti,
+                               uint32_t block_size,
+                               uint32_t msg_type,
+                               uint32_t msg_length,
+                               uint32_t response_length)
   {
     const GenMessageTarget sfid = GEN_SFID_DATAPORT_DATA;
-    p->setMessageDescriptor(insn, sfid, msg_length, response_length);
-    assert(size == 0 || size == 1 || size == 2 || size == 4 || size == 8);
+    setMessageDescriptor(insn, sfid, msg_length, response_length);
     insn->bits3.gen7_oblock_rw.msg_type = msg_type;
     insn->bits3.gen7_oblock_rw.bti = bti;
-    insn->bits3.gen7_oblock_rw.block_size = size <=  2 ? size : (size == 4 ? 3 : 4);
+    insn->bits3.gen7_oblock_rw.block_size = block_size;
     insn->bits3.gen7_oblock_rw.header_present = 1;
   }
 
-  static void setMBlockRW(GenEncoder *p,
-                          GenNativeInstruction *insn,
-                          uint32_t bti,
-                          uint32_t msg_type,
-                          uint32_t msg_length,
-                          uint32_t response_length)
+  uint32_t GenEncoder::getOBlockSize(uint32_t oword_size, bool low_half)
+  {
+    /* 000: 1 OWord, read into or written from the low 128 bits of the destination register.
+     * 001: 1 OWord, read into or written from the high 128 bits of the destination register.
+     * 010: 2 OWords
+     * 011: 4 OWords
+     * 100: 8 OWords */
+    switch(oword_size)
+    {
+      case 1: return low_half ? 0 : 1;
+      case 2: return 2;
+      case 4: return 3;
+      case 8: return 4;
+      default: NOT_SUPPORTED;
+    }
+    return 0;
+  }
+
+  void GenEncoder::setMBlockRW(GenNativeInstruction *insn,
+                               uint32_t bti,
+                               uint32_t msg_type,
+                               uint32_t msg_length,
+                               uint32_t response_length)
   {
     const GenMessageTarget sfid = GEN_SFID_DATAPORT1_DATA;
-    p->setMessageDescriptor(insn, sfid, msg_length, response_length);
+    setMessageDescriptor(insn, sfid, msg_length, response_length);
     insn->bits3.gen7_mblock_rw.msg_type = msg_type;
     insn->bits3.gen7_mblock_rw.bti = bti;
     insn->bits3.gen7_mblock_rw.header_present = 1;
@@ -1312,80 +1327,72 @@ namespace gbe
      setScratchMessage(this, insn, offset, block_size, channel_mode, GEN_SCRATCH_READ, 1, dst_num);
   }
 
-  void GenEncoder::OBREAD(GenRegister dst, GenRegister header, uint32_t bti, uint32_t size) {
+  void GenEncoder::OBREAD(GenRegister dst, GenRegister header, uint32_t bti, uint32_t ow_size) {
     GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
     const uint32_t msg_length = 1;
-    uint32_t rsize = size / 2;
-    uint32_t msgsize = size;
-    // When size is 1 OWord, which means half a reg, we need to know which half to use
-    if (size == 1) {
-      if (dst.subnr == 0)
-        msgsize = 0;
-      else
-        msgsize = 1;
-    }
-    rsize = rsize == 0 ? 1 : rsize;
-    const uint32_t response_length = rsize; // Size is in regs
+    uint32_t sizeinreg = ow_size / 2;
+    // half reg should also have size 1
+    sizeinreg = sizeinreg == 0 ? 1 : sizeinreg;
+    const uint32_t block_size = getOBlockSize(ow_size, dst.subnr == 0);
+    const uint32_t response_length = sizeinreg; // Size is in reg
+
     this->setHeader(insn);
     this->setDst(insn, GenRegister::uw16grf(dst.nr, 0));
     this->setSrc0(insn, GenRegister::ud8grf(header.nr, 0));
     this->setSrc1(insn, GenRegister::immud(0));
-    setOBlockRW(this,
-                insn,
+    setOBlockRW(insn,
                 bti,
-                msgsize,
+                block_size,
                 GEN7_UNALIGNED_OBLOCK_READ,
                 msg_length,
                 response_length);
   }
 
-  void GenEncoder::OBWRITE(GenRegister header, uint32_t bti, uint32_t size) {
+  void GenEncoder::OBWRITE(GenRegister header, uint32_t bti, uint32_t ow_size) {
     GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
-    uint32_t rsize = size / 2;
-    rsize = rsize == 0 ? 1 : rsize;
-    const uint32_t msg_length = 1 + rsize; // Size is in owords
+    uint32_t sizeinreg = ow_size / 2;
+    // half reg should also have size 1
+    sizeinreg = sizeinreg == 0 ? 1 : sizeinreg;
+    const uint32_t msg_length = 1 + sizeinreg; // Size is in reg and header
     const uint32_t response_length = 0;
-    uint32_t msgsize = size;
-    msgsize = msgsize == 1 ? 0 : msgsize;
+    const uint32_t block_size = getOBlockSize(ow_size);
+
     this->setHeader(insn);
     this->setSrc0(insn, GenRegister::ud8grf(header.nr, 0));
     this->setSrc1(insn, GenRegister::immud(0));
     this->setDst(insn, GenRegister::retype(GenRegister::null(), GEN_TYPE_UW));
-    setOBlockRW(this,
-                insn,
+    setOBlockRW(insn,
                 bti,
-                msgsize,
+                block_size,
                 GEN7_OBLOCK_WRITE,
                 msg_length,
                 response_length);
   }
 
-  void GenEncoder::MBREAD(GenRegister dst, GenRegister header, uint32_t bti, uint32_t size) {
+  void GenEncoder::MBREAD(GenRegister dst, GenRegister header, uint32_t bti, uint32_t response_size) {
     GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
     const uint32_t msg_length = 1;
-    const uint32_t response_length = size; // Size of registers
+    const uint32_t response_length = response_size; // Size of registers
     this->setHeader(insn);
     this->setDst(insn, GenRegister::ud8grf(dst.nr, 0));
     this->setSrc0(insn, GenRegister::ud8grf(header.nr, 0));
     this->setSrc1(insn, GenRegister::immud(0));
-    setMBlockRW(this,
-                insn,
+    setMBlockRW(insn,
                 bti,
                 GEN75_P1_MEDIA_BREAD,
                 msg_length,
                 response_length);
   }
 
-  void GenEncoder::MBWRITE(GenRegister header, uint32_t bti, uint32_t size) {
+  void GenEncoder::MBWRITE(GenRegister header, uint32_t bti, uint32_t data_size) {
     GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
-    const uint32_t msg_length = 1 + size;
+    const uint32_t msg_length = 1 + data_size;
     const uint32_t response_length = 0; // Size of registers
     this->setHeader(insn);
     this->setDst(insn, GenRegister::retype(GenRegister::null(), GEN_TYPE_UW));
     this->setSrc0(insn, GenRegister::ud8grf(header.nr, 0));
     this->setSrc1(insn, GenRegister::immud(0));
-    setMBlockRW(this,
-                insn,
+    setMBlockRW(insn,
                 bti,
                 GEN75_P1_MEDIA_TYPED_BWRITE,
                 msg_length,
diff --git a/backend/src/backend/gen_encoder.hpp b/backend/src/backend/gen_encoder.hpp
index e5eb2e2..5c4b210 100644
--- a/backend/src/backend/gen_encoder.hpp
+++ b/backend/src/backend/gen_encoder.hpp
@@ -286,18 +286,24 @@ namespace gbe
     virtual bool canHandleLong(uint32_t opcode, GenRegister dst, GenRegister src0,
                             GenRegister src1 = GenRegister::null());
     virtual void handleDouble(GenEncoder *p, uint32_t opcode, GenRegister dst, GenRegister src0, GenRegister src1 = GenRegister::null());
+
+    /*! OBlock helper function */
+    uint32_t getOBlockSize(uint32_t oword_size, bool low_half = false);
+    void setMBlockRW(GenNativeInstruction *insn, uint32_t bti, uint32_t msg_type, uint32_t msg_length, uint32_t response_length);
+    void setOBlockRW(GenNativeInstruction *insn, uint32_t bti, uint32_t block_size, uint32_t msg_type, uint32_t msg_length, uint32_t response_lengtha);
+
     /*! OBlock read */
-    void OBREAD(GenRegister dst, GenRegister header, uint32_t bti, uint32_t elemSize);
+    void OBREAD(GenRegister dst, GenRegister header, uint32_t bti, uint32_t ow_size);
     /*! OBlock write */
-    void OBWRITE(GenRegister header, uint32_t bti, uint32_t elemSize);
+    void OBWRITE(GenRegister header, uint32_t bti, uint32_t ow_size);
     /*! MBlock read */
-    virtual void MBREAD(GenRegister dst, GenRegister header, uint32_t bti, uint32_t elemSize);
+    virtual void MBREAD(GenRegister dst, GenRegister header, uint32_t bti, uint32_t response_size);
     /*! MBlock write */
-    virtual void MBWRITE(GenRegister header, uint32_t bti, uint32_t elemSize);
+    virtual void MBWRITE(GenRegister header, uint32_t bti, uint32_t data_size);
     /*! A64 OBlock read */
-    virtual void OBREADA64(GenRegister dst, GenRegister header, uint32_t bti, uint32_t elemSize);
+    virtual void OBREADA64(GenRegister dst, GenRegister header, uint32_t bti, uint32_t ow_size);
     /*! A64 OBlock write */
-    virtual void OBWRITEA64(GenRegister header, uint32_t bti, uint32_t elemSize);
+    virtual void OBWRITEA64(GenRegister header, uint32_t bti, uint32_t ow_size);
 
     GBE_CLASS(GenEncoder); //!< Use custom allocators
     virtual void alu3(uint32_t opcode, GenRegister dst,
diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp
index 1cd6137..da52f6a 100644
--- a/backend/src/backend/gen_insn_selection.cpp
+++ b/backend/src/backend/gen_insn_selection.cpp
@@ -759,13 +759,13 @@ namespace gbe
     void SUBGROUP_OP(uint32_t wg_op, Reg dst, GenRegister src,
                       GenRegister tmpData1, GenRegister tmpData2);
     /*! Oblock read */
-    void OBREAD(GenRegister* dsts, uint32_t vec_size, GenRegister addr, GenRegister header, uint32_t bti, GenRegister* tmp, uint32_t tmp_size);
+    void OBREAD(GenRegister* dsts, uint32_t tmp_size, GenRegister header, uint32_t bti, uint32_t ow_size);
     /*! Oblock write */
-    void OBWRITE(GenRegister addr, GenRegister* values, uint32_t vec_size, GenRegister header, uint32_t bti, GenRegister* tmp, uint32_t tmp_size);
+    void OBWRITE(GenRegister header, GenRegister* values, uint32_t tmp_size, uint32_t bti, uint32_t ow_size);
     /*! Media block read */
-    void MBREAD(GenRegister* dsts, GenRegister coordx, GenRegister coordy, GenRegister header, GenRegister* tmp, uint32_t bti, uint32_t vec_size);
+    void MBREAD(GenRegister* dsts, uint32_t tmp_size, GenRegister header, uint32_t bti, uint32_t response_size);
     /*! Media block write */
-    void MBWRITE(GenRegister coordx, GenRegister coordy, GenRegister* values, GenRegister header, GenRegister* tmp, uint32_t bti, uint32_t vec_size);
+    void MBWRITE(GenRegister header, GenRegister* values, uint32_t tmp_size, uint32_t bti, uint32_t data_size);
 
     /* common functions for both binary instruction and sel_cmp and compare instruction.
        It will handle the IMM or normal register assignment, and will try to avoid LOADI
@@ -2267,118 +2267,84 @@ namespace gbe
   }
   void Selection::Opaque::OBREAD(GenRegister* dsts,
                                  uint32_t vec_size,
-                                 GenRegister addr,
                                  GenRegister header,
                                  uint32_t bti,
-                                 GenRegister* tmp,
-                                 uint32_t tmp_size) {
-    SelectionInstruction *insn = this->appendInsn(SEL_OP_OBREAD, 1 + vec_size + tmp_size, 1);
+                                 uint32_t ow_size) {
+    SelectionInstruction *insn = this->appendInsn(SEL_OP_OBREAD, vec_size, 1);
     SelectionVector *vector = this->appendVector();
-    insn->dst(0) = header;
+    insn->src(0) = header;
     for (uint32_t i = 0; i < vec_size; ++i)
-      insn->dst(1 + i) = dsts[i];
-    for (uint32_t i = 0; i < tmp_size; ++i)
-      insn->dst(1 + i + vec_size) = tmp[i];
-    insn->src(0) = addr;
+      insn->dst(i) = dsts[i];
     insn->setbti(bti);
-    insn->extra.elem = vec_size; // number of vector size
+    insn->extra.elem = ow_size; // block size in OWords
 
     // tmp regs for OWORD read dst
-    vector->regNum = tmp_size;
-    vector->reg = &insn->dst(1 + vec_size);
-    vector->offsetID = 1 + vec_size;
+    vector->regNum = vec_size;
+    vector->reg = &insn->dst(0);
+    vector->offsetID = 0;
     vector->isSrc = 0;
   }
 
-  void Selection::Opaque::OBWRITE(GenRegister addr,
+  void Selection::Opaque::OBWRITE(GenRegister header,
                                   GenRegister* values,
                                   uint32_t vec_size,
-                                  GenRegister header,
                                   uint32_t bti,
-                                  GenRegister* tmp,
-                                  uint32_t tmp_size) {
-    SelectionInstruction *insn = this->appendInsn(SEL_OP_OBWRITE, tmp_size + 1, vec_size + 1);
+                                  uint32_t ow_size) {
+    SelectionInstruction *insn = this->appendInsn(SEL_OP_OBWRITE, 0, vec_size + 1);
     SelectionVector *vector = this->appendVector();
-    insn->src(0) = addr;
+    insn->src(0) = header;
     for (uint32_t i = 0; i < vec_size; ++i)
       insn->src(i + 1) = values[i];
-    insn->dst(0) = header;
-    for (uint32_t i = 0; i < tmp_size; ++i)
-      insn->dst(i + 1) = tmp[i];
     insn->setbti(bti);
-    insn->extra.elem = vec_size; // number of vector_size
+    insn->extra.elem = ow_size; // block size in OWords
 
-    // tmp regs for OWORD read dst
-    vector->regNum = tmp_size + 1;
-    vector->reg = &insn->dst(0);
+    // tmp regs for OWORD write header and values
+    vector->regNum = vec_size + 1;
+    vector->reg = &insn->src(0);
     vector->offsetID = 0;
-    vector->isSrc = 0;
+    vector->isSrc = 1;
+
   }
 
   void Selection::Opaque::MBREAD(GenRegister* dsts,
-                                 GenRegister coordx,
-                                 GenRegister coordy,
+                                 uint32_t tmp_size,
                                  GenRegister header,
-                                 GenRegister* tmp,
                                  uint32_t bti,
-                                 uint32_t vec_size) {
-
-    uint32_t simdWidth = curr.execWidth;
-    SelectionInstruction *insn = this->appendInsn(SEL_OP_MBREAD, vec_size * simdWidth / 8 + 1, 2);
-    insn->dst(0) = header;
-    for (uint32_t i = 0; i < vec_size; ++i) {
-      insn->dst(i + 1) = dsts[i];
-      if(simdWidth == 16)
-        insn->dst(i + vec_size + 1) = tmp[i];
-    }
-    insn->src(0) = coordx;
-    insn->src(1) = coordy;
+                                 uint32_t response_size) {
+
+    SelectionInstruction *insn = this->appendInsn(SEL_OP_MBREAD, tmp_size, 1);
+    insn->src(0) = header;
     insn->setbti(bti);
-    insn->extra.elem = vec_size; // vector size
+    insn->extra.elem = response_size; // send response length
 
-    // Only in simd 8 the data is in vector form
-    if(simdWidth == 8) {
-      SelectionVector *vector = this->appendVector();
-      vector->regNum = vec_size;
-      vector->reg = &insn->dst(1);
-      vector->offsetID = 1;
-      vector->isSrc = 0;
-    }
-    if(simdWidth == 16)
-    {
-      SelectionVector *vectortmp = this->appendVector();
-      vectortmp->regNum = vec_size;
-      vectortmp->reg = &insn->dst(vec_size + 1);
-      vectortmp->offsetID = vec_size + 1;
-      vectortmp->isSrc = 0;
+    for (uint32_t i = 0; i < tmp_size; ++i) {
+      insn->dst(i) = dsts[i];
     }
+    SelectionVector *vector = this->appendVector();
+    vector->regNum = tmp_size;
+    vector->reg = &insn->dst(0);
+    vector->offsetID = 0;
+    vector->isSrc = 0;
   }
 
-  void Selection::Opaque::MBWRITE(GenRegister coordx,
-                                  GenRegister coordy,
+  void Selection::Opaque::MBWRITE(GenRegister header,
                                   GenRegister* values,
-                                  GenRegister header,
-                                  GenRegister* tmp,
+                                  uint32_t tmp_size,
                                   uint32_t bti,
-                                  uint32_t vec_size) {
-    SelectionInstruction *insn = this->appendInsn(SEL_OP_MBWRITE, 1 + vec_size, 2 + vec_size);
+                                  uint32_t data_size) {
+    SelectionInstruction *insn = this->appendInsn(SEL_OP_MBWRITE, 0, 1 + tmp_size);
     SelectionVector *vector = this->appendVector();
-    insn->src(0) = coordx;
-    insn->src(1) = coordy;
-    for (uint32_t i = 0; i < vec_size; ++i)
-      insn->src(2 + i) = values[i];
-    insn->dst(0) = header;
-    for (uint32_t i = 0; i < vec_size; ++i)
-      insn->dst(1 + i) = tmp[i];
-    insn->state = this->curr;
+    insn->src(0) = header;
+    for (uint32_t i = 0; i < tmp_size; ++i)
+      insn->src(1 + i) = values[i];
     insn->setbti(bti);
-    insn->extra.elem = vec_size; // vector size
+    insn->extra.elem = data_size; // msg data part size
 
-    // We need to put the header and the data together
+    // Header and data are all sources now (no dst); keep them together in one source vector
-    vector->regNum = 1 + vec_size;
+    vector->regNum = 1 + tmp_size;
-    vector->reg = &insn->dst(0);
+    vector->reg = &insn->src(0);
     vector->offsetID = 0;
-    vector->isSrc = 0;
+    vector->isSrc = 1;
   }
 
   // Boiler plate to initialize the selection library at c++ pre-main
@@ -4715,18 +4681,79 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp
       const uint32_t simdWidth = sel.ctx.getSimdWidth();
       const Type type = insn.getValueType();
       const uint32_t typeSize = type == TYPE_U32 ? 4 : 2;
-      const GenRegister header = GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY_DWORD)), GEN_TYPE_UD);
+      const uint32_t genType = type == TYPE_U32 ? GEN_TYPE_UD : GEN_TYPE_UW;
+      const RegisterFamily family = getFamily(type);
+      bool isA64 = SI == 255;
+
+      const GenRegister header = GenRegister::ud8grf(sel.reg(FAMILY_REG));
       vector<GenRegister> valuesVec;
+      vector<GenRegister> tmpVec;
       for(uint32_t i = 0; i < vec_size; i++)
         valuesVec.push_back(sel.selReg(insn.getValue(i), type));
-      // check tmp_size for OWORD read need, max 8 OWROD thus 4 regs
-      uint32_t tmp_size = simdWidth * vec_size * typeSize / 32;
-      tmp_size = tmp_size == 0 ? 1 : tmp_size;
-      tmp_size = tmp_size > 4 ? 4 : tmp_size;
-      vector<GenRegister> tmpVec;
+
+      GenRegister headeraddr;
+      if (isA64)
+        headeraddr = GenRegister::retype(sel.getOffsetReg(header, 0, 0), GEN_TYPE_UL);
+      else
+        headeraddr = sel.getOffsetReg(header, 0, 2 * 4);
+      // Make header
+      sel.push();
+      {
+        // Copy r0 into the header first
+        sel.curr.execWidth = 8;
+        sel.curr.predicate = GEN_PREDICATE_NONE;
+        sel.curr.noMask = 1;
+        sel.MOV(header, GenRegister::ud8grf(0, 0));
+
+        // Update the header with the current address
+        sel.curr.execWidth = 1;
+
+        // Put zero in the general state base address
+        if (isA64)
+          sel.MOV(headeraddr, GenRegister::toUniform(address, GEN_TYPE_UL));
+        else {
+          sel.MOV(headeraddr, GenRegister::toUniform(address, GEN_TYPE_UD));
+          sel.MOV(sel.getOffsetReg(header, 0, 5 * 4), GenRegister::immud(0));
+        }
+      }
+      sel.pop();
+
+      /* For block read we need to unpack the block data into values, and for different
+       * SIMD widths, vector sizes and type sizes we may need to split the block read
+       * into several send messages.
+       * A send message can be at most 5 registers long (1 header + 4 data),
+       * so each combination has its own message length and tmp vector size:
+       *              |  simd8  | simd16 |  simd8 | simd16
+       *  r0  |header |         |        |        |
+       *  r1  |data   |  w0,w1  |   w0   |   dw0  |  dw0
+       *  r2  |data   |  w2,w3  |   w1   |   dw1  |  dw0
+       *  r3  |data   | ......  | ...... | ...... |  dw1
+       *  r4  |data   | ....... | ...... | ...... |  dw1
+       */
+
+      uint32_t totalSize = simdWidth * typeSize * vec_size;
+      uint32_t valueSize = simdWidth * typeSize;
+      uint32_t tmp_size = totalSize > 128 ? (128 / valueSize) : vec_size;
+      uint32_t msg_num = vec_size / tmp_size;
+      uint32_t ow_size = msg_num > 1 ? 8 : (totalSize / 16);
+
       for(uint32_t i = 0; i < tmp_size; i++)
-        tmpVec.push_back(GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY_DWORD)), GEN_TYPE_UD));
-      sel.OBREAD(&valuesVec[0], vec_size, address, header, SI, &tmpVec[0], tmp_size);
+        tmpVec.push_back(GenRegister::retype(GenRegister::f8grf(sel.reg(family)), genType));
+      for (uint32_t i = 0; i < msg_num; i++) {
+          if (i > 0) {
+            sel.push();
+            {
+              // Update the address in header
+              sel.curr.execWidth = 1;
+              sel.ADD(headeraddr, headeraddr, GenRegister::immud(128));
+            }
+            sel.pop();
+          }
+          sel.OBREAD(&tmpVec[0], tmp_size, header, SI, ow_size);
+          for (uint32_t j = 0; j < tmp_size; j++)
+            sel.MOV(valuesVec[j + i * tmp_size], tmpVec[j]);
+      }
+
     }
 
     // check whether all binded table index point to constant memory
@@ -5161,18 +5188,80 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp
       const uint32_t simdWidth = sel.ctx.getSimdWidth();
       const Type type = insn.getValueType();
       const uint32_t typeSize = type == TYPE_U32 ? 4 : 2;
-      const GenRegister header = GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY_DWORD)), GEN_TYPE_UD);
+      const uint32_t genType = type == TYPE_U32 ? GEN_TYPE_UD : GEN_TYPE_UW;
+      const RegisterFamily family = getFamily(type);
+      bool isA64 = SI == 255;
+
+      const GenRegister header = GenRegister::ud8grf(sel.reg(FAMILY_REG));
       vector<GenRegister> valuesVec;
+      vector<GenRegister> tmpVec;
       for(uint32_t i = 0; i < vec_size; i++)
         valuesVec.push_back(sel.selReg(insn.getValue(i), type));
-      // check tmp_size for OWORD write need, max 8 OWROD thus 4 regs
-      uint32_t tmp_size = simdWidth * vec_size * typeSize / 32;
-      tmp_size = tmp_size == 0 ? 1 : tmp_size;
-      tmp_size = tmp_size > 4 ? 4 : tmp_size;
-      vector<GenRegister> tmpVec;
+
+      GenRegister headeraddr;
+      if (isA64)
+        headeraddr = GenRegister::retype(sel.getOffsetReg(header, 0, 0), GEN_TYPE_UL);
+      else
+        headeraddr = sel.getOffsetReg(header, 0, 2 * 4);
+      // Make header
+      sel.push();
+      {
+        // Copy r0 into the header first
+        sel.curr.execWidth = 8;
+        sel.curr.predicate = GEN_PREDICATE_NONE;
+        sel.curr.noMask = 1;
+        sel.MOV(header, GenRegister::ud8grf(0, 0));
+
+        // Update the header with the current address
+        sel.curr.execWidth = 1;
+
+        // Put zero in the general state base address
+        if (isA64)
+          sel.MOV(headeraddr, GenRegister::toUniform(address, GEN_TYPE_UL));
+        else {
+          sel.SHR(headeraddr, GenRegister::toUniform(address, GEN_TYPE_UD), GenRegister::immud(4));
+          sel.MOV(sel.getOffsetReg(header, 0, 5 * 4), GenRegister::immud(0));
+        }
+      }
+      sel.pop();
+
+      /* For block write we need to pack the block data into the tmp, and for different
+       * SIMD widths, vector sizes and type sizes we may need to split the block write
+       * into several send messages.
+       * A send message can be at most 5 registers long (1 header + 4 data),
+       * so each combination has its own message length and tmp vector size:
+       *              |  simd8  | simd16 |  simd8 | simd16
+       *  r0  |header |         |        |        |
+       *  r1  |data   |  w0,w1  |   w0   |   dw0  |  dw0
+       *  r2  |data   |  w2,w3  |   w1   |   dw1  |  dw0
+       *  r3  |data   | ......  | ...... | ...... |  dw1
+       *  r4  |data   | ....... | ...... | ...... |  dw1
+       */
+
+      uint32_t totalSize = simdWidth * typeSize * vec_size;
+      uint32_t valueSize = simdWidth * typeSize;
+      uint32_t tmp_size = totalSize > 128 ? (128 / valueSize) : vec_size;
+      uint32_t msg_num = vec_size / tmp_size;
+      uint32_t ow_size = msg_num > 1 ? 8 : (totalSize / 16);
+
       for(uint32_t i = 0; i < tmp_size; i++)
-        tmpVec.push_back(GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY_DWORD)), GEN_TYPE_UD));
-      sel.OBWRITE(address, &valuesVec[0], vec_size, header, SI, &tmpVec[0], tmp_size);
+        tmpVec.push_back(GenRegister::retype(GenRegister::f8grf(sel.reg(family)), genType));
+      for (uint32_t i = 0; i < msg_num; i++) {
+          for (uint32_t j = 0; j < tmp_size; j++)
+            sel.MOV(tmpVec[j], valuesVec[j + i * tmp_size]);
+          if (i > 0) {
+            sel.push();
+            {
+              // Update the address in header
+              sel.curr.execWidth = 1;
+              sel.ADD(headeraddr, headeraddr, GenRegister::immud(8));
+            }
+            sel.pop();
+          }
+          sel.OBWRITE(header, &tmpVec[0], tmp_size, SI, ow_size);
+      }
+
+
     }
 
     virtual bool emit(Selection::Opaque  &sel, SelectionDAG &dag) const
@@ -7662,20 +7751,77 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp
       uint32_t vec_size = insn.getVectorSize();
       uint32_t simdWidth = sel.curr.execWidth;
       const Type type = insn.getType();
+      const uint32_t typeSize = type == TYPE_U32 ? 4 : 2;
+      uint32_t response_size = simdWidth * vec_size * typeSize / 32;
+      // ushort in simd8 fills only half a register (0.5 reg), but the response length is still 1
+      response_size = response_size ? response_size : 1;
+      uint32_t block_width = typeSize * simdWidth;
+      uint32_t blocksize = (block_width - 1) % 32 | (vec_size - 1) << 16;
+
+
       vector<GenRegister> valuesVec;
       vector<GenRegister> tmpVec;
       for (uint32_t i = 0; i < vec_size; ++i) {
         valuesVec.push_back(sel.selReg(insn.getDst(i), type));
-        if(simdWidth == 16)
-          tmpVec.push_back(GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY_DWORD)), GEN_TYPE_UD));
+        if(simdWidth == 16 && typeSize == 4)
+          tmpVec.push_back(GenRegister::ud8grf(sel.reg(FAMILY_REG)));
       }
-      const GenRegister coordx = sel.selReg(insn.getSrc(0), TYPE_U32);
-      const GenRegister coordy = sel.selReg(insn.getSrc(1), TYPE_U32);
-      const GenRegister header = GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY_DWORD)), GEN_TYPE_UD);
-      GenRegister *tmp = NULL;
-      if(simdWidth == 16)
-        tmp = &tmpVec[0];
-      sel.MBREAD(&valuesVec[0], coordx, coordy, header, tmp, insn.getImageIndex(), insn.getVectorSize());
+      const GenRegister coordx = GenRegister::toUniform(sel.selReg(insn.getSrc(0), TYPE_U32), GEN_TYPE_UD);
+      const GenRegister coordy = GenRegister::toUniform(sel.selReg(insn.getSrc(1), TYPE_U32), GEN_TYPE_UD);
+      const GenRegister header = GenRegister::ud8grf(sel.reg(FAMILY_REG));
+      const GenRegister offsetx = GenRegister::toUniform(sel.getOffsetReg(header, 0, 0 * 4), GEN_TYPE_UD);
+      const GenRegister offsety = GenRegister::toUniform(sel.getOffsetReg(header, 0, 1 * 4), GEN_TYPE_UD);
+      const GenRegister blocksizereg = sel.getOffsetReg(header, 0, 2 * 4);
+
+      // Make header
+      sel.push();
+        // Copy r0 into the header first
+        sel.curr.execWidth = 8;
+        sel.curr.predicate = GEN_PREDICATE_NONE;
+        sel.curr.noMask = 1;
+        sel.MOV(header, GenRegister::ud8grf(0, 0));
+
+        // Update the header with the coord
+        sel.curr.execWidth = 1;
+        sel.MOV(offsetx, coordx);
+        sel.MOV(offsety, coordy);
+        // Update block width and height
+        sel.MOV(blocksizereg, GenRegister::immud(blocksize));
+      sel.pop();
+
+      if (simdWidth * typeSize < 64) {
+        sel.push();
+          sel.curr.execWidth = 8;
+          sel.curr.predicate = GEN_PREDICATE_NONE;
+          sel.curr.noMask = 1;
+          // Now read the data
+          sel.MBREAD(&valuesVec[0], vec_size, header, insn.getImageIndex(), response_size);
+        sel.pop();
+      } else if (simdWidth * typeSize == 64) {
+        sel.push();
+          sel.curr.execWidth = 8;
+          sel.curr.predicate = GEN_PREDICATE_NONE;
+          sel.curr.noMask = 1;
+          sel.MBREAD(&tmpVec[0], vec_size ,header, insn.getImageIndex(), vec_size);
+          for (uint32_t i = 0; i < vec_size; i++)
+            sel.MOV(valuesVec[i], tmpVec[i]);
+
+          // Second half
+          // Update the header with the coord
+          sel.curr.execWidth = 1;
+          sel.ADD(offsetx, offsetx, GenRegister::immud(32));
+
+          // Now read the data
+          sel.curr.execWidth = 8;
+          sel.MBREAD(&tmpVec[0], vec_size, header, insn.getImageIndex(), vec_size);
+
+          // Move the reg to fit vector rule.
+          for (uint32_t i = 0; i < vec_size; i++)
+            sel.MOV(sel.getOffsetReg(valuesVec[i], 0, 32) , tmpVec[i]);
+        sel.pop();
+      } else NOT_IMPLEMENTED;
+
+
       return true;
     }
     DECL_CTOR(MediaBlockReadInstruction, 1, 1);
@@ -7689,17 +7835,84 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp
       using namespace ir;
       uint32_t vec_size = insn.getVectorSize();
       const Type type = insn.getType();
-      const GenRegister coordx = sel.selReg(insn.getSrc(0), TYPE_U32);
-      const GenRegister coordy = sel.selReg(insn.getSrc(1), TYPE_U32);
+      uint32_t simdWidth = sel.curr.execWidth;
+      const uint32_t genType = type == TYPE_U32 ? GEN_TYPE_UD : GEN_TYPE_UW;
+      const RegisterFamily family = getFamily(type);
+      const uint32_t typeSize = type == TYPE_U32 ? 4 : 2;
+      // ushort in simd8 fills only half a register, but the data length is still 1
+      uint32_t data_size = simdWidth * vec_size * typeSize / 32;
+      data_size = data_size? data_size : 1;
+      uint32_t block_width = typeSize * simdWidth;
+      uint32_t blocksize = (block_width - 1) % 32 | (vec_size - 1) << 16;
+
+
       vector<GenRegister> valuesVec;
       vector<GenRegister> tmpVec;
-      for(uint32_t i = 0; i < vec_size; i++)
-      {
-        valuesVec.push_back(sel.selReg(insn.getSrc(2 + i), type));
-        tmpVec.push_back(GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY_DWORD)), GEN_TYPE_UD));
-      }
-      const GenRegister header = GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY_DWORD)), GEN_TYPE_UD);
-      sel.MBWRITE(coordx, coordy, &valuesVec[0], header, &tmpVec[0], insn.getImageIndex(), vec_size);
+      for (uint32_t i = 0; i < vec_size; ++i) {
+         valuesVec.push_back(sel.selReg(insn.getSrc(2 + i), type));
+        if(simdWidth == 16 && typeSize == 4)
+          tmpVec.push_back(GenRegister::ud8grf(sel.reg(FAMILY_REG)));
+        else
+          tmpVec.push_back(GenRegister::retype(GenRegister::f8grf(sel.reg(family)), genType));
+       }
+      const GenRegister coordx = GenRegister::toUniform(sel.selReg(insn.getSrc(0), TYPE_U32), GEN_TYPE_UD);
+      const GenRegister coordy = GenRegister::toUniform(sel.selReg(insn.getSrc(1), TYPE_U32), GEN_TYPE_UD);
+      const GenRegister header = GenRegister::ud8grf(sel.reg(FAMILY_REG));
+      const GenRegister offsetx = GenRegister::toUniform(sel.getOffsetReg(header, 0, 0*4), GEN_TYPE_UD);
+      const GenRegister offsety = GenRegister::toUniform(sel.getOffsetReg(header, 0, 1*4), GEN_TYPE_UD);
+      const GenRegister blocksizereg = sel.getOffsetReg(header, 0, 2*4);
+
+      // Make header
+      sel.push();
+        // Copy r0 into the header first
+        sel.curr.execWidth = 8;
+        sel.curr.predicate = GEN_PREDICATE_NONE;
+        sel.curr.noMask = 1;
+        sel.MOV(header, GenRegister::ud8grf(0, 0));
+
+        // Update the header with the coord
+        sel.curr.execWidth = 1;
+        sel.MOV(offsetx, coordx);
+        sel.MOV(offsety, coordy);
+        // Update block width and height
+        sel.MOV(blocksizereg, GenRegister::immud(blocksize));
+      sel.pop();
+
+      if (simdWidth * typeSize < 64) {
+        for (uint32_t i = 0; i < vec_size; ++i) {
+            sel.MOV(tmpVec[i], valuesVec[i]);
+        }
+        sel.push();
+          sel.curr.execWidth = 8;
+          sel.curr.predicate = GEN_PREDICATE_NONE;
+          sel.curr.noMask = 1;
+          // Now write the data
+          sel.MBWRITE(header, &tmpVec[0], vec_size, insn.getImageIndex(), data_size);
+        sel.pop();
+      } else if (simdWidth * typeSize == 64) {
+        sel.push();
+          sel.curr.execWidth = 8;
+          sel.curr.predicate = GEN_PREDICATE_NONE;
+          sel.curr.noMask = 1;
+          for (uint32_t i = 0; i < vec_size; i++)
+            sel.MOV(tmpVec[i], valuesVec[i]);
+          sel.MBWRITE(header, &tmpVec[0], vec_size, insn.getImageIndex(), vec_size);
+
+          // Second half
+          // Update the header with the coord
+          sel.curr.execWidth = 1;
+          sel.ADD(offsetx, offsetx, GenRegister::immud(32));
+
+          // Now write the data
+          sel.curr.execWidth = 8;
+          for (uint32_t i = 0; i < vec_size; i++)
+            sel.MOV(tmpVec[i], sel.getOffsetReg(valuesVec[i], 0, 32));
+          sel.MBWRITE(header, &tmpVec[0], vec_size, insn.getImageIndex(), vec_size);
+
+          // Move the reg to fit vector rule.
+        sel.pop();
+      } else NOT_IMPLEMENTED;
+
       return true;
     }
     DECL_CTOR(MediaBlockWriteInstruction, 1, 1);
-- 
2.7.4



More information about the Beignet mailing list