[Beignet] [PATCH] Backend: Refine block read/write instruction selection

Pan, Xiuli xiuli.pan at intel.com
Wed Dec 14 08:41:31 UTC 2016


1. I will refine them with ud8grf

2. The MOVs are used for the vector. %65 %66 %67 %68 %69 are in a vector. If we replace it with %65 %56 %57 %58 %59, then the vector will have a longer liveness. This may cause some register pressure. These MOVs are used to minimize the liveness of vectors.

-----Original Message-----
From: Guo, Yejun 
Sent: Wednesday, December 14, 2016 3:43 PM
To: Pan, Xiuli <xiuli.pan at intel.com>; beignet at lists.freedesktop.org
Cc: Pan, Xiuli <xiuli.pan at intel.com>
Subject: RE: [Beignet] [PATCH] Backend: Refine block read/write instruction selection

two comments, thanks.

1.  for header register, we can call:
 const GenRegister header = GenRegister::ud8grf(sel.reg(ir::FAMILY_REG));
instead of:
const GenRegister header = GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY_REG)), GEN_TYPE_UD);

2. How about separating the logic for SIMD8 and SIMD16?
Two considerations:
a) In the current patch, I see you have implemented an elaborate algorithm to handle all the cases. If we separate it, the logic can be simpler and easier to understand.

b) At SIMD8, it helps to decrease the register pressure. For example, the following instructions:
[46]            MOV(8)              %66<1>:UD   :       %56<8,8,1>:UD
[48]            MOV(8)              %67<1>:UD   :       %57<8,8,1>:UD
[50]            MOV(8)              %68<1>:UD   :       %58<8,8,1>:UD
[52]            MOV(8)              %69<1>:UD   :       %59<8,8,1>:UD
[54]            OBWRITE(8)          :   %65<8,8,1>:UD   %66<8,8,1>:UD   %67<8,8,1>:UD   %68<8,8,1>:UD   %69<8,8,1>:UD
can be replaced with:
OBWRITE(8)          :   %65<8,8,1>:UD   %56<8,8,1>:UD   %57<8,8,1>:UD   %58<8,8,1>:UD   %59<8,8,1>:UD

Due to the register vector requirement, the previous instructions cannot be optimized afterwards at the selection IR level, but it is feasible for us to generate the optimized IR in the first place.

thanks
yejun

-----Original Message-----
From: Beignet [mailto:beignet-bounces at lists.freedesktop.org] On Behalf Of Xiuli Pan
Sent: Friday, December 09, 2016 3:01 PM
To: beignet at lists.freedesktop.org
Cc: Pan, Xiuli
Subject: [Beignet] [PATCH] Backend: Refine block read/write instruction selection

From: Pan Xiuli <xiuli.pan at intel.com>

Move the block pack/unpack into instruction selection in order to get optimization.

Signed-off-by: Pan Xiuli <xiuli.pan at intel.com>
---
 backend/src/backend/gen_context.cpp        | 459 ++---------------------------
 backend/src/backend/gen_insn_selection.cpp | 439 ++++++++++++++++++++-------
 2 files changed, 346 insertions(+), 552 deletions(-)

diff --git a/backend/src/backend/gen_context.cpp b/backend/src/backend/gen_context.cpp
index 798fac8..4e971a2 100644
--- a/backend/src/backend/gen_context.cpp
+++ b/backend/src/backend/gen_context.cpp
@@ -3551,458 +3551,39 @@ namespace gbe
   }
 
   void GenContext::emitOBReadInstruction(const SelectionInstruction &insn) {
-    const GenRegister dst= ra->genReg(insn.dst(1));
-    const GenRegister addrreg = ra->genReg(insn.src(0));
-    uint32_t type = dst.type;
-    uint32_t typesize = typeSize(type);
-    const uint32_t vec_size = insn.extra.elem;
-    const GenRegister tmp = GenRegister::retype(ra->genReg(insn.dst(1 + vec_size)), type);
-    const uint32_t simdWidth = p->curr.execWidth;
-    const GenRegister header = GenRegister::retype(ra->genReg(insn.dst(0)), GEN_TYPE_UD);
-    const GenRegister addr = GenRegister::toUniform(addrreg, addrreg.type);
-    GenRegister headeraddr;
-    bool isA64 = insn.getbti() == 255;
+    const GenRegister header = ra->genReg(insn.src(0));
+    const GenRegister tmp = ra->genReg(insn.dst(0));
+    const uint32_t bti = insn.getbti();
+    const uint32_t ow_size = insn.extra.elem;
+    bool isA64 = bti == 255;
     if (isA64)
-      headeraddr = GenRegister::retype(GenRegister::offset(header, 0, 0), GEN_TYPE_UL);
+       p->OBREADA64(tmp, header, bti, ow_size);
     else
-      headeraddr = GenRegister::offset(header, 0, 2*4);
-
-    // Make header
-    p->push();
-    {
-      // Copy r0 into the header first
-      p->curr.execWidth = 8;
-      p->curr.predicate = GEN_PREDICATE_NONE;
-      p->curr.noMask = 1;
-      p->MOV(header, GenRegister::ud8grf(0, 0));
-
-      // Update the header with the current address
-      p->curr.execWidth = 1;
-      p->MOV(headeraddr, addr);
-
-      // Put zero in the general state base address
-      if (!isA64)
-        p->MOV(GenRegister::offset(header, 0, 5 * 4), GenRegister::immud(0));
-
-    }
-    p->pop();
-    // Now read the data, oword block read can only work with simd16 and no mask
-    if (vec_size == 1) {
-      p->push();
-      {
-        p->curr.execWidth = 16;
-        p->curr.noMask = 1;
-        if (isA64) {
-          //p->curr.execWidth = 8;
-          p->OBREADA64(dst, header, insn.getbti(), simdWidth * typesize / 16);
-        }
-        else
-          p->OBREAD(dst, header, insn.getbti(), simdWidth * typesize / 16);
-      }
-      p->pop();
-    } else if (vec_size == 2) {
-      p->push();
-      {
-        p->curr.execWidth = 16;
-        p->curr.noMask = 1;
-        if (isA64)
-          p->OBREADA64(tmp, header, insn.getbti(), simdWidth * typesize / 8);
-        else
-          p->OBREAD(tmp, header, insn.getbti(), simdWidth * typesize / 8);
-      }
-      p->pop();
-      p->MOV(ra->genReg(insn.dst(1)), GenRegister::offset(tmp, 0));
-      p->MOV(ra->genReg(insn.dst(2)), GenRegister::offset(tmp, 0, simdWidth * typesize ));
-    } else if (vec_size == 4) {
-      if (simdWidth == 8) {
-        p->push();
-        {
-          p->curr.execWidth = 16;
-          p->curr.noMask = 1;
-          if (isA64)
-            p->OBREADA64(tmp, header, insn.getbti(), 2 * typesize);
-          else
-            p->OBREAD(tmp, header, insn.getbti(), 2 * typesize);
-        }
-        p->pop();
-        for (uint32_t j = 0; j < 4; j++)
-          p->MOV(ra->genReg(insn.dst(1 + j)), GenRegister::offset(tmp, 0, j * simdWidth * typesize ));
-      } else {
-        for (uint32_t i = 0; i < typesize / 2; i++) {
-          if (i > 0) {
-            p->push();
-            {
-              // Update the address in header
-              p->curr.execWidth = 1;
-              p->ADD(headeraddr, headeraddr, GenRegister::immud(128));
-            }
-            p->pop();
-          }
-          if (isA64)
-            p->OBREADA64(tmp, header, insn.getbti(), 8);
-          else
-            p->OBREAD(tmp, header, insn.getbti(), 8);
-          for (uint32_t j = 0; j < 8 / typesize ; j++)
-            p->MOV(ra->genReg(insn.dst(1 + j + i * 2)), GenRegister::offset(tmp, 0 ,j * simdWidth * typesize ));
-        }
-      }
-    } else if (vec_size == 8) {
-      if (simdWidth == 8) {
-        for (uint32_t i = 0; i < typesize / 2; i++) {
-          if (i > 0) {
-            p->push();
-            {
-              // Update the address in header
-              p->curr.execWidth = 1;
-              p->ADD(headeraddr, headeraddr, GenRegister::immud(128));
-            }
-            p->pop();
-          }
-          p->push();
-          {
-            p->curr.execWidth = 16;
-            p->curr.noMask = 1;
-            if (isA64)
-              p->OBREADA64(tmp, header, insn.getbti(), 8);
-            else
-              p->OBREAD(tmp, header, insn.getbti(), 8);
-          }
-          p->pop();
-          for (uint32_t j = 0; j < 16 / typesize; j++)
-            p->MOV(ra->genReg(insn.dst(1 + j + i * 4)), GenRegister::offset(tmp, 0, j * simdWidth * typesize ));
-        }
-      } else {
-        for (uint32_t i = 0; i < typesize ; i++) {
-          if (i > 0) {
-            p->push();
-            {
-              // Update the address in header
-              p->curr.execWidth = 1;
-              p->ADD(headeraddr, headeraddr, GenRegister::immud(128));
-            }
-            p->pop();
-          }
-          if (isA64)
-            p->OBREADA64(tmp, header, insn.getbti(), 8);
-          else
-            p->OBREAD(tmp, header, insn.getbti(), 8);
-          for (uint32_t j = 0; j < 8 / typesize; j++)
-            p->MOV(ra->genReg(insn.dst(1 + j + i * 8 / typesize)), GenRegister::offset(tmp, 0 ,j * simdWidth * typesize ));
-        }
-      }
-    } else NOT_SUPPORTED;
+       p->OBREAD(tmp, header, bti, ow_size);
   }
 
   void GenContext::emitOBWriteInstruction(const SelectionInstruction &insn) {
-    const GenRegister addrreg = ra->genReg(insn.src(0));
-    const GenRegister header = GenRegister::retype(ra->genReg(insn.dst(0)), GEN_TYPE_UD);
-    uint32_t type = ra->genReg(insn.src(1)).type;
-    uint32_t typesize = typeSize(type);
-    const uint32_t vec_size = insn.extra.elem;
-    const GenRegister tmp = GenRegister::offset(header, 1);
-    const GenRegister addr = GenRegister::toUniform(addrreg, addrreg.type);
-    GenRegister headeraddr;
-    bool isA64 = insn.getbti() == 255;
+    const GenRegister header = ra->genReg(insn.src(0));
+    const uint32_t bti = insn.getbti();
+    const uint32_t ow_size = insn.extra.elem;
+    bool isA64 = bti == 255;
     if (isA64)
-      headeraddr = GenRegister::retype(GenRegister::offset(header, 0, 0), GEN_TYPE_UL);
+       p->OBWRITEA64(header, bti, ow_size);
     else
-      headeraddr = GenRegister::offset(header, 0, 2*4);
-    const uint32_t simdWidth = p->curr.execWidth;
-    uint32_t tmp_size = simdWidth * vec_size / 8;
-    tmp_size = tmp_size > 4 ? 4 : tmp_size;
-    uint32_t offset_size = isA64 ? 128 : 8;
-
-    p->push();
-      // Copy r0 into the header first
-      p->curr.execWidth = 8;
-      p->curr.predicate = GEN_PREDICATE_NONE;
-      p->curr.noMask = 1;
-      p->MOV(header, GenRegister::ud8grf(0,0));
-
-      // Update the header with the current address
-      p->curr.execWidth = 1;
-      if (isA64)
-        p->MOV(headeraddr, addr);
-      else
-        p->SHR(headeraddr, addr, GenRegister::immud(4));
-
-      // Put zero in the general state base address
-      if (!isA64)
-        p->MOV(GenRegister::offset(header, 0, 5*4), GenRegister::immud(0));
-
-    p->pop();
-    // Now write the data, oword block write can only work with simd16 and no mask
-    if (vec_size == 1) {
-      p->MOV(GenRegister::retype(tmp, type), ra->genReg(insn.src(1)));
-      p->push();
-      {
-        p->curr.execWidth = 16;
-        p->curr.noMask = 1;
-        if (isA64)
-          p->OBWRITEA64(header, insn.getbti(), simdWidth * typesize / 16);
-        else
-          p->OBWRITE(header, insn.getbti(), simdWidth * typesize / 16);
-      }
-      p->pop();
-    } else if (vec_size == 2) {
-      p->MOV(GenRegister::retype(GenRegister::offset(tmp, 0, 0), type), ra->genReg(insn.src(1)));
-      p->MOV(GenRegister::retype(GenRegister::offset(tmp, 0, simdWidth * typesize), type), ra->genReg(insn.src(2)));
-      p->push();
-      {
-        p->curr.execWidth = 16;
-        p->curr.noMask = 1;
-        if (isA64)
-          p->OBWRITEA64(header, insn.getbti(), simdWidth * typesize / 8);
-        else
-          p->OBWRITE(header, insn.getbti(), simdWidth * typesize / 8);
-      }
-      p->pop();
-    } else if (vec_size == 4) {
-      if (simdWidth == 8) {
-        for (uint32_t i = 0; i < 4; i++)
-          p->MOV(GenRegister::retype(GenRegister::offset(tmp, 0, i * simdWidth * typesize), type), ra->genReg(insn.src(1 + i)));
-        p->push();
-        {
-          p->curr.execWidth = 16;
-          p->curr.noMask = 1;
-          if (isA64)
-            p->OBWRITEA64(header, insn.getbti(), 2 * typesize);
-          else
-            p->OBWRITE(header, insn.getbti(), 2 * typesize);
-        }
-        p->pop();
-      } else {
-        for (uint32_t i = 0; i < typesize / 2; i++) {
-          for (uint32_t j = 0; j < 8 / typesize; j++)
-            p->MOV(GenRegister::retype(GenRegister::offset(tmp, 0, j * simdWidth * typesize), type), ra->genReg(insn.src(1 + j + i * 8 / typesize)));
-          if (i > 0) {
-            p->push();
-            {
-              // Update the address in header
-              p->curr.execWidth = 1;
-              p->ADD(headeraddr, headeraddr, GenRegister::immud(offset_size));
-            }
-            p->pop();
-          }
-          if (isA64)
-            p->OBWRITEA64(header, insn.getbti(), 8);
-          else
-            p->OBWRITE(header, insn.getbti(), 8);
-        }
-      }
-    } else if (vec_size == 8) {
-      if (simdWidth == 8) {
-        for (uint32_t i = 0; i < typesize / 2; i++) {
-          for (uint32_t j = 0; j < 16 / typesize; j++)
-            p->MOV(GenRegister::retype(GenRegister::offset(tmp, 0, j * simdWidth * typesize), type), ra->genReg(insn.src(1 + j + i * 16 / typesize)));
-          if (i > 0) {
-            p->push();
-            {
-              // Update the address in header
-              p->curr.execWidth = 1;
-              p->ADD(headeraddr, headeraddr, GenRegister::immud(offset_size));
-            }
-            p->pop();
-          }
-          p->push();
-          {
-            p->curr.execWidth = 16;
-            p->curr.noMask = 1;
-            if (isA64)
-              p->OBWRITEA64(header, insn.getbti(), 8);
-            else
-              p->OBWRITE(header, insn.getbti(), 8);
-          }
-          p->pop();
-        }
-      } else {
-        for (uint32_t i = 0; i < typesize; i++) {
-          for (uint32_t j = 0; j < 8 / typesize; j++)
-            p->MOV(GenRegister::retype(GenRegister::offset(tmp, 0, j * simdWidth * typesize), type), ra->genReg(insn.src(1 + j + i * 8 / typesize)));
-          if (i > 0) {
-            p->push();
-            {
-              // Update the address in header
-              p->curr.execWidth = 1;
-              p->ADD(headeraddr, headeraddr, GenRegister::immud(offset_size));
-            }
-            p->pop();
-          }
-          if (isA64)
-            p->OBWRITEA64(header, insn.getbti(), 8);
-          else
-            p->OBWRITE(header, insn.getbti(), 8);
-        }
-      }
-    } else NOT_SUPPORTED;
-
+       p->OBWRITE(header, bti, ow_size);
   }
 
   void GenContext::emitMBReadInstruction(const SelectionInstruction &insn) {
-    const GenRegister dst = ra->genReg(insn.dst(1));
-    const GenRegister coordx = GenRegister::toUniform(ra->genReg(insn.src(0)),GEN_TYPE_D);
-    const GenRegister coordy = GenRegister::toUniform(ra->genReg(insn.src(1)),GEN_TYPE_D);
-    const GenRegister header = GenRegister::retype(ra->genReg(insn.dst(0)), GEN_TYPE_UD);
-    const GenRegister offsetx = GenRegister::offset(header, 0, 0*4);
-    const GenRegister offsety = GenRegister::offset(header, 0, 1*4);
-    const GenRegister blocksizereg = GenRegister::offset(header, 0, 2*4);
-    size_t vec_size = insn.extra.elem;
-    uint32_t type = dst.type;
-    uint32_t typesize = typeSize(type);
-    uint32_t block_width = typesize * simdWidth;
-    uint32_t blocksize = (block_width - 1) % 32 | (vec_size - 1) << 16;
-
-    if (simdWidth == 8)
-    {
-      p->push();
-        // Copy r0 into the header first
-        p->curr.execWidth = 8;
-        p->curr.predicate = GEN_PREDICATE_NONE;
-        p->curr.noMask = 1;
-        p->MOV(header, GenRegister::ud8grf(0,0));
-
-        // Update the header with the coord
-        p->curr.execWidth = 1;
-        p->MOV(offsetx, coordx);
-        p->MOV(offsety, coordy);
-        // Update block width and height
-        p->MOV(blocksizereg, GenRegister::immud(blocksize));
-        p->curr.execWidth = 8;
-        // ushort in simd8 will have half reg, but response lenght is still 1
-        uint32_t rsize = vec_size * typesize / 4;
-        rsize = rsize ? rsize : 1;
-        // Now read the data
-        p->MBREAD(dst, header, insn.getbti(), rsize);
-      p->pop();
-
-    }
-    else if (simdWidth == 16)
-    {
-      const GenRegister tmp = GenRegister::retype(ra->genReg(insn.dst(vec_size + 1)), GEN_TYPE_UD);
-      p->push();
-        // Copy r0 into the header first
-        p->curr.execWidth = 8;
-        p->curr.predicate = GEN_PREDICATE_NONE;
-        p->curr.noMask = 1;
-        p->MOV(header, GenRegister::ud8grf(0,0));
-
-        // First half
-        // Update the header with the coord
-        p->curr.execWidth = 1;
-        p->MOV(offsetx, coordx);
-        p->MOV(offsety, coordy);
-        // Update block width and height
-        p->MOV(blocksizereg, GenRegister::immud(blocksize));
-        // Now read the data
-        p->curr.execWidth = 8;
-        p->MBREAD(tmp, header, insn.getbti(), vec_size);
-        for (uint32_t i = 0; i < vec_size; i++)
-          p->MOV(GenRegister::retype(ra->genReg(insn.dst(i + 1)),GEN_TYPE_UD), GenRegister::offset(tmp, i));
-
-        if (typesize == 4)
-        {
-          // Second half
-          // Update the header with the coord
-          p->curr.execWidth = 1;
-          p->ADD(offsetx, offsetx, GenRegister::immud(32));
-
-          // Now read the data
-          p->curr.execWidth = 8;
-          p->MBREAD(tmp, header, insn.getbti(), vec_size);
-
-          // Move the reg to fit vector rule.
-          for (uint32_t i = 0; i < vec_size; i++)
-            p->MOV(GenRegister::offset(ra->genReg(insn.dst(i + 1)), 1),
-                   GenRegister::offset(tmp, i));
-        }
-      p->pop();
-    } else NOT_IMPLEMENTED;
+    const GenRegister dst = ra->genReg(insn.dst(0));
+    const GenRegister header = ra->genReg(insn.src(0));
+    const size_t rsize = insn.extra.elem;
+    p->MBREAD(dst, header, insn.getbti(), rsize);
   }
 
   void GenContext::emitMBWriteInstruction(const SelectionInstruction &insn) {
-    const GenRegister coordx = GenRegister::toUniform(ra->genReg(insn.src(0)), GEN_TYPE_D);
-    const GenRegister coordy = GenRegister::toUniform(ra->genReg(insn.src(1)), GEN_TYPE_D);
-    const GenRegister header = GenRegister::retype(ra->genReg(insn.dst(0)), GEN_TYPE_UD);
-    const GenRegister tmp = GenRegister::offset(header, 1);
-    GenRegister offsetx, offsety, blocksizereg;
-    size_t vec_size = insn.extra.elem;
-    uint32_t type = ra->genReg(insn.src(2)).type;
-    uint32_t typesize = typeSize(type);
-    uint32_t block_width = typesize * simdWidth;
-    uint32_t blocksize = (block_width - 1) % 32 | (vec_size-1) << 16;
-
-    offsetx = GenRegister::offset(header, 0, 0*4);
-    offsety = GenRegister::offset(header, 0, 1*4);
-    blocksizereg = GenRegister::offset(header, 0, 2*4);
-
-    if (simdWidth == 8)
-    {
-      p->push();
-        // Copy r0 into the header first
-        p->curr.execWidth = 8;
-        p->curr.predicate = GEN_PREDICATE_NONE;
-        p->curr.noMask = 1;
-        p->MOV(header, GenRegister::ud8grf(0,0));
-
-        // Update the header with the coord
-        p->curr.execWidth = 1;
-        p->MOV(offsetx, coordx);
-        p->MOV(offsety, coordy);
-        // Update block width and height
-        p->MOV(blocksizereg, GenRegister::immud(blocksize));
-        p->curr.execWidth = 8;
-        // Mov what we need into msgs
-        for(uint32_t i = 0; i < vec_size; i++)
-          p->MOV(GenRegister::retype(GenRegister::offset(tmp, 0, i * block_width), type),
-                 ra->genReg(insn.src(2 + i)));
-        // ushort in simd8 will have half reg, but reponse lenght is still 1
-        uint32_t rsize = vec_size * typesize / 4;
-        rsize = rsize ? rsize : 1;
-        // Now read the data
-        p->MBWRITE(header, insn.getbti(), rsize);
-      p->pop();
-
-    }
-    else
-    {
-      p->push();
-        // Copy r0 into the header first
-        p->curr.execWidth = 8;
-        p->curr.predicate = GEN_PREDICATE_NONE;
-        p->curr.noMask = 1;
-        p->MOV(header, GenRegister::ud8grf(0,0));
-
-        // First half
-        // Update the header with the coord
-        p->curr.execWidth = 1;
-        p->MOV(offsetx, coordx);
-        p->MOV(offsety, coordy);
-        // Update block width and height
-        p->MOV(blocksizereg, GenRegister::immud(blocksize));
-        // Now read the data
-        p->curr.execWidth = 8;
-        // Mov what we need into msgs
-        for(uint32_t i = 0; i < vec_size; i++)
-          p->MOV(GenRegister::offset(tmp, i), GenRegister::retype(ra->genReg(insn.src(2 + i)), GEN_TYPE_UD));
-        p->MBWRITE(header, insn.getbti(), vec_size);
-
-        if (typesize == 4)
-        {
-          // Second half
-          // Update the header with the coord
-          p->curr.execWidth = 1;
-          p->ADD(offsetx, offsetx, GenRegister::immud(32));
-
-          p->curr.execWidth = 8;
-          // Mov what we need into msgs
-          for(uint32_t i = 0; i < vec_size; i++)
-            p->MOV(GenRegister::offset(header, 1 + i), GenRegister::Qn(ra->genReg(insn.src(2 + i)), 1));
-          // Now write the data
-          p->MBWRITE(header, insn.getbti(), vec_size);
-        }
-
-      p->pop();
-    }
+    const GenRegister header = ra->genReg(insn.dst(0));
+    const size_t msgsize = insn.extra.elem;
+    p->MBWRITE(header, insn.getbti(), msgsize);
   }
 
   BVAR(OCL_OUTPUT_REG_ALLOC, false);
diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp
index 7fc22fa..a869986 100644
--- a/backend/src/backend/gen_insn_selection.cpp
+++ b/backend/src/backend/gen_insn_selection.cpp
@@ -728,13 +728,13 @@ namespace gbe
     void SUBGROUP_OP(uint32_t wg_op, Reg dst, GenRegister src,
                       GenRegister tmpData1, GenRegister tmpData2);
     /*! Oblock read */
-    void OBREAD(GenRegister* dsts, uint32_t vec_size, GenRegister addr, GenRegister header, uint32_t bti, GenRegister* tmp, uint32_t tmp_size);
+    void OBREAD(GenRegister* dsts, uint32_t tmp_size, GenRegister 
+ header, uint32_t bti, uint32_t ow_size);
     /*! Oblock write */
-    void OBWRITE(GenRegister addr, GenRegister* values, uint32_t vec_size, GenRegister header, uint32_t bti, GenRegister* tmp, uint32_t tmp_size);
+    void OBWRITE(GenRegister header, GenRegister* values, uint32_t 
+ tmp_size, uint32_t bti, uint32_t ow_size);
     /*! Media block read */
-    void MBREAD(GenRegister* dsts, GenRegister coordx, GenRegister coordy, GenRegister header, GenRegister* tmp, uint32_t bti, uint32_t vec_size);
+    void MBREAD(GenRegister* dsts, uint32_t tmp_size, GenRegister 
+ header, uint32_t bti, uint32_t block_size);
     /*! Media block write */
-    void MBWRITE(GenRegister coordx, GenRegister coordy, GenRegister* values, GenRegister header, GenRegister* tmp, uint32_t bti, uint32_t vec_size);
+    void MBWRITE(GenRegister header, GenRegister* values, uint32_t 
+ tmp_size, uint32_t bti, uint32_t block_size);
 
     /* common functions for both binary instruction and sel_cmp and compare instruction.
       It will handle the IMM or normal register assignment, and will try to avoid LOADI
@@ -2236,118 +2236,84 @@ namespace gbe
   }
   void Selection::Opaque::OBREAD(GenRegister* dsts,
                                  uint32_t vec_size,
-                                 GenRegister addr,
                                  GenRegister header,
                                  uint32_t bti,
-                                 GenRegister* tmp,
-                                 uint32_t tmp_size) {
-    SelectionInstruction *insn = this->appendInsn(SEL_OP_OBREAD, 1 + vec_size + tmp_size, 1);
+                                 uint32_t ow_size) {
+    SelectionInstruction *insn = this->appendInsn(SEL_OP_OBREAD, 
+ vec_size, 1);
     SelectionVector *vector = this->appendVector();
-    insn->dst(0) = header;
+    insn->src(0) = header;
     for (uint32_t i = 0; i < vec_size; ++i)
-      insn->dst(1 + i) = dsts[i];
-    for (uint32_t i = 0; i < tmp_size; ++i)
-      insn->dst(1 + i + vec_size) = tmp[i];
-    insn->src(0) = addr;
+      insn->dst(i) = dsts[i];
     insn->setbti(bti);
-    insn->extra.elem = vec_size; // number of vector size
+    insn->extra.elem = ow_size; // number of OWord size
 
     // tmp regs for OWORD read dst
-    vector->regNum = tmp_size;
-    vector->reg = &insn->dst(1 + vec_size);
-    vector->offsetID = 1 + vec_size;
+    vector->regNum = vec_size;
+    vector->reg = &insn->dst(0);
+    vector->offsetID = 0;
     vector->isSrc = 0;
   }
 
-  void Selection::Opaque::OBWRITE(GenRegister addr,
+  void Selection::Opaque::OBWRITE(GenRegister header,
                                   GenRegister* values,
                                   uint32_t vec_size,
-                                  GenRegister header,
                                   uint32_t bti,
-                                  GenRegister* tmp,
-                                  uint32_t tmp_size) {
-    SelectionInstruction *insn = this->appendInsn(SEL_OP_OBWRITE, tmp_size + 1, vec_size + 1);
+                                  uint32_t ow_size) {
+    SelectionInstruction *insn = this->appendInsn(SEL_OP_OBWRITE, 0, 
+ vec_size + 1);
     SelectionVector *vector = this->appendVector();
-    insn->src(0) = addr;
+    insn->src(0) = header;
     for (uint32_t i = 0; i < vec_size; ++i)
       insn->src(i + 1) = values[i];
-    insn->dst(0) = header;
-    for (uint32_t i = 0; i < tmp_size; ++i)
-      insn->dst(i + 1) = tmp[i];
     insn->setbti(bti);
-    insn->extra.elem = vec_size; // number of vector_size
+    insn->extra.elem = ow_size; // number of OWord_size
 
-    // tmp regs for OWORD read dst
-    vector->regNum = tmp_size + 1;
-    vector->reg = &insn->dst(0);
+    // tmp regs for OWORD write header and values
+    vector->regNum = vec_size + 1;
+    vector->reg = &insn->src(0);
     vector->offsetID = 0;
-    vector->isSrc = 0;
+    vector->isSrc = 1;
+
   }
 
   void Selection::Opaque::MBREAD(GenRegister* dsts,
-                                 GenRegister coordx,
-                                 GenRegister coordy,
+                                 uint32_t tmp_size,
                                  GenRegister header,
-                                 GenRegister* tmp,
                                  uint32_t bti,
-                                 uint32_t vec_size) {
-
-    uint32_t simdWidth = curr.execWidth;
-    SelectionInstruction *insn = this->appendInsn(SEL_OP_MBREAD, vec_size * simdWidth / 8 + 1, 2);
-    insn->dst(0) = header;
-    for (uint32_t i = 0; i < vec_size; ++i) {
-      insn->dst(i + 1) = dsts[i];
-      if(simdWidth == 16)
-        insn->dst(i + vec_size + 1) = tmp[i];
-    }
-    insn->src(0) = coordx;
-    insn->src(1) = coordy;
+                                 uint32_t block_size) {
+
+    SelectionInstruction *insn = this->appendInsn(SEL_OP_MBREAD, tmp_size, 1);
+    insn->src(0) = header;
     insn->setbti(bti);
-    insn->extra.elem = vec_size; // vector size
+    insn->extra.elem = block_size; // vector size
 
-    // Only in simd 8 the data is in vector form
-    if(simdWidth == 8) {
-      SelectionVector *vector = this->appendVector();
-      vector->regNum = vec_size;
-      vector->reg = &insn->dst(1);
-      vector->offsetID = 1;
-      vector->isSrc = 0;
-    }
-    if(simdWidth == 16)
-    {
-      SelectionVector *vectortmp = this->appendVector();
-      vectortmp->regNum = vec_size;
-      vectortmp->reg = &insn->dst(vec_size + 1);
-      vectortmp->offsetID = vec_size + 1;
-      vectortmp->isSrc = 0;
+    for (uint32_t i = 0; i < tmp_size; ++i) {
+      insn->dst(i) = dsts[i];
     }
+    SelectionVector *vector = this->appendVector();
+    vector->regNum = tmp_size;
+    vector->reg = &insn->dst(0);
+    vector->offsetID = 0;
+    vector->isSrc = 0;
   }
 
-  void Selection::Opaque::MBWRITE(GenRegister coordx,
-                                  GenRegister coordy,
+  void Selection::Opaque::MBWRITE(GenRegister header,
                                   GenRegister* values,
-                                  GenRegister header,
-                                  GenRegister* tmp,
+                                  uint32_t tmp_size,
                                   uint32_t bti,
-                                  uint32_t vec_size) {
-    SelectionInstruction *insn = this->appendInsn(SEL_OP_MBWRITE, 1 + vec_size, 2 + vec_size);
+                                  uint32_t block_size) {
+    SelectionInstruction *insn = this->appendInsn(SEL_OP_MBWRITE, 0, 1
+ + tmp_size);
     SelectionVector *vector = this->appendVector();
-    insn->src(0) = coordx;
-    insn->src(1) = coordy;
-    for (uint32_t i = 0; i < vec_size; ++i)
-      insn->src(2 + i) = values[i];
-    insn->dst(0) = header;
-    for (uint32_t i = 0; i < vec_size; ++i)
-      insn->dst(1 + i) = tmp[i];
-    insn->state = this->curr;
+    insn->src(0) = header;
+    for (uint32_t i = 0; i < tmp_size; ++i)
+      insn->src(1 + i) = values[i];
     insn->setbti(bti);
-    insn->extra.elem = vec_size; // vector size
+    insn->extra.elem = block_size; // msg size
 
     // We need to put the header and the data together
-    vector->regNum = 1 + vec_size;
+    vector->regNum = 1 + tmp_size;
     vector->reg = &insn->dst(0);
     vector->offsetID = 0;
-    vector->isSrc = 0;
+    vector->isSrc = 1;
   }
 
  // Boiler plate to initialize the selection library at c++ pre-main
@@ -4683,18 +4649,79 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp
       const uint32_t simdWidth = sel.ctx.getSimdWidth();
       const Type type = insn.getValueType();
       const uint32_t typeSize = type == TYPE_U32 ? 4 : 2;
-      const GenRegister header = GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY_DWORD)), GEN_TYPE_UD);
+      const uint32_t genType = type == TYPE_U32 ? GEN_TYPE_UD : GEN_TYPE_UW;
+      const RegisterFamily family = getFamily(type);
+      bool isA64 = SI == 255;
+
+      const GenRegister header =
+ GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY_REG)),
+ GEN_TYPE_UD);
       vector<GenRegister> valuesVec;
+      vector<GenRegister> tmpVec;
       for(uint32_t i = 0; i < vec_size; i++)
         valuesVec.push_back(sel.selReg(insn.getValue(i), type));
-      // check tmp_size for OWORD read need, max 8 OWROD thus 4 regs
-      uint32_t tmp_size = simdWidth * vec_size * typeSize / 32;
-      tmp_size = tmp_size == 0 ? 1 : tmp_size;
-      tmp_size = tmp_size > 4 ? 4 : tmp_size;
-      vector<GenRegister> tmpVec;
+
+      GenRegister headeraddr;
+      if (isA64)
+        headeraddr = GenRegister::retype(sel.getOffsetReg(header, 0, 0), GEN_TYPE_UL);
+      else
+        headeraddr = sel.getOffsetReg(header, 0, 2 * 4);
+      // Make header
+      sel.push();
+      {
+        // Copy r0 into the header first
+        sel.curr.execWidth = 8;
+        sel.curr.predicate = GEN_PREDICATE_NONE;
+        sel.curr.noMask = 1;
+        sel.MOV(header, GenRegister::ud8grf(0, 0));
+
+        // Update the header with the current address
+        sel.curr.execWidth = 1;
+
+        // Put zero in the general state base address
+        if (isA64)
+          sel.MOV(headeraddr, GenRegister::toUniform(address, GEN_TYPE_UL));
+        else {
+          sel.MOV(headeraddr, GenRegister::toUniform(address, GEN_TYPE_UD));
+          sel.MOV(sel.getOffsetReg(header, 0, 5 * 4), GenRegister::immud(0));
+        }
+      }
+      sel.pop();
+
+      /* For block read we need to unpack the block data into values, and for different
+       * simdwidth and vector size with different type size, we may need to split the
+       * block read send message.
+       * We can only get a send message with 5 reg length
+       * so for different combinations we have different message length and tmp vector size
+       *              |  simd8  | simd16 |  simd8 | simd16
+       *  r0  |header |         |        |        |
+       *  r1  |data   |  w0,w1  |   w0   |   dw0  |  dw0
+       *  r2  |data   |  w2,w3  |   w1   |   dw1  |  dw0
+       *  r3  |data   | ......  | ...... | ...... |  dw1
+       *  r4  |data   | ....... | ...... | ...... |  dw1
+       */
+
+      uint32_t totalSize = simdWidth * typeSize * vec_size;
+      uint32_t valueSize = simdWidth * typeSize;
+      uint32_t tmp_size = totalSize > 128 ? (128 / valueSize) : vec_size;
+      uint32_t msg_num = vec_size / tmp_size;
+      uint32_t ow_size = msg_num > 1 ? 8 : (totalSize / 16);
+
       for(uint32_t i = 0; i < tmp_size; i++)
-        tmpVec.push_back(GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY_DWORD)), GEN_TYPE_UD));
-      sel.OBREAD(&valuesVec[0], vec_size, address, header, SI, &tmpVec[0], tmp_size);
+        tmpVec.push_back(GenRegister::retype(GenRegister::f8grf(sel.reg(family)), genType));
+      for (uint32_t i = 0; i < msg_num; i++) {
+          if (i > 0) {
+            sel.push();
+            {
+              // Update the address in header
+              sel.curr.execWidth = 1;
+              sel.ADD(headeraddr, headeraddr, GenRegister::immud(128));
+            }
+            sel.pop();
+          }
+          sel.OBREAD(&tmpVec[0], tmp_size, header, SI, ow_size);
+          for (uint32_t j = 0; j < tmp_size; j++)
+            sel.MOV(valuesVec[j + i * tmp_size], tmpVec[j]);
+      }
+
     }
 
     // check whether all binded table index point to constant memory
@@ -5129,18 +5156,80 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp
       const uint32_t simdWidth = sel.ctx.getSimdWidth();
       const Type type = insn.getValueType();
       const uint32_t typeSize = type == TYPE_U32 ? 4 : 2;
-      const GenRegister header = GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY_DWORD)), GEN_TYPE_UD);
+      const uint32_t genType = type == TYPE_U32 ? GEN_TYPE_UD : GEN_TYPE_UW;
+      const RegisterFamily family = getFamily(type);
+      bool isA64 = SI == 255;
+
+      const GenRegister header = GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY_REG)), GEN_TYPE_UD);
       vector<GenRegister> valuesVec;
+      vector<GenRegister> tmpVec;
       for(uint32_t i = 0; i < vec_size; i++)
         valuesVec.push_back(sel.selReg(insn.getValue(i), type));
-      // check tmp_size for OWORD write need, max 8 OWROD thus 4 regs
-      uint32_t tmp_size = simdWidth * vec_size * typeSize / 32;
-      tmp_size = tmp_size == 0 ? 1 : tmp_size;
-      tmp_size = tmp_size > 4 ? 4 : tmp_size;
-      vector<GenRegister> tmpVec;
+
+      GenRegister headeraddr;
+      if (isA64)
+        headeraddr = GenRegister::retype(sel.getOffsetReg(header, 0, 0), GEN_TYPE_UL);
+      else
+        headeraddr = sel.getOffsetReg(header, 0, 2 * 4);
+      // Make header
+      sel.push();
+      {
+        // Copy r0 into the header first
+        sel.curr.execWidth = 8;
+        sel.curr.predicate = GEN_PREDICATE_NONE;
+        sel.curr.noMask = 1;
+        sel.MOV(header, GenRegister::ud8grf(0, 0));
+
+        // Update the header with the current address
+        sel.curr.execWidth = 1;
+
+        // Put zero in the general state base address
+        if (isA64)
+          sel.MOV(headeraddr, GenRegister::toUniform(address, GEN_TYPE_UL));
+        else {
+          sel.SHR(headeraddr, GenRegister::toUniform(address, GEN_TYPE_UD), GenRegister::immud(4));
+          sel.MOV(sel.getOffsetReg(header, 0, 5 * 4), GenRegister::immud(0));
+        }
+      }
+      sel.pop();
+
+      /* For block write we need to pack the block data into the tmp, and for different
+       * simdwidth and vector size with different type size, we may need to split the
+       * block write send message.
+       * We can only get a send message with 5 reg length
+       * so for different combination we have different message length and tmp vector size
+       *              |  simd8  | simd16 |  simd8 | simd16
+       *  r0  |header |         |        |        |
+       *  r1  |data   |  w0,w1  |   w0   |   dw0  |  dw0
+       *  r2  |data   |  w2,w3  |   w1   |   dw1  |  dw0
+       *  r3  |data   | ......  | ...... | ...... |  dw1
+       *  r4  |data   | ....... | ...... | ...... |  dw1
+       */
+
+      uint32_t totalSize = simdWidth * typeSize * vec_size;
+      uint32_t valueSize = simdWidth * typeSize;
+      uint32_t tmp_size = totalSize > 128 ? (128 / valueSize) : vec_size;
+      uint32_t msg_num = vec_size / tmp_size;
+      uint32_t ow_size = msg_num > 1 ? 8 : (totalSize / 16);
+
       for(uint32_t i = 0; i < tmp_size; i++)
-        tmpVec.push_back(GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY_DWORD)), GEN_TYPE_UD));
-      sel.OBWRITE(address, &valuesVec[0], vec_size, header, SI, &tmpVec[0], tmp_size);
+        tmpVec.push_back(GenRegister::retype(GenRegister::f8grf(sel.reg(family)), genType));
+      for (uint32_t i = 0; i < msg_num; i++) {
+          for (uint32_t j = 0; j < tmp_size; j++)
+            sel.MOV(tmpVec[j], valuesVec[j + i * tmp_size]);
+          if (i > 0) {
+            sel.push();
+            {
+              // Update the address in header
+              sel.curr.execWidth = 1;
+              sel.ADD(headeraddr, headeraddr, GenRegister::immud(8));
+            }
+            sel.pop();
+          }
+          sel.OBWRITE(header, &tmpVec[0], tmp_size, SI, ow_size);
+      }
+
+
     }
 
     virtual bool emit(Selection::Opaque  &sel, SelectionDAG &dag) const
@@ -7616,20 +7705,77 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp
       uint32_t vec_size = insn.getVectorSize();
       uint32_t simdWidth = sel.curr.execWidth;
       const Type type = insn.getType();
+      const uint32_t typeSize = type == TYPE_U32 ? 4 : 2;
+      // ushort in simd8 will have half reg, but response length is still 1
+      uint32_t rsize = simdWidth * vec_size * typeSize / 32;
+      rsize = rsize ? rsize : 1;
+      uint32_t block_width = typeSize * simdWidth;
+      uint32_t blocksize = (block_width - 1) % 32 | (vec_size - 1) << 16;
+
+
       vector<GenRegister> valuesVec;
       vector<GenRegister> tmpVec;
       for (uint32_t i = 0; i < vec_size; ++i) {
         valuesVec.push_back(sel.selReg(insn.getDst(i), type));
-        if(simdWidth == 16)
-          tmpVec.push_back(GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY_DWORD)), GEN_TYPE_UD));
+        if(simdWidth == 16 && typeSize == 4)
+          tmpVec.push_back(GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY_REG)), GEN_TYPE_UD));
       }
-      const GenRegister coordx = sel.selReg(insn.getSrc(0), TYPE_U32);
-      const GenRegister coordy = sel.selReg(insn.getSrc(1), TYPE_U32);
-      const GenRegister header = GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY_DWORD)), GEN_TYPE_UD);
-      GenRegister *tmp = NULL;
-      if(simdWidth == 16)
-        tmp = &tmpVec[0];
-      sel.MBREAD(&valuesVec[0], coordx, coordy, header, tmp, insn.getImageIndex(), insn.getVectorSize());
+      const GenRegister coordx = GenRegister::toUniform(sel.selReg(insn.getSrc(0), TYPE_U32), GEN_TYPE_UD);
+      const GenRegister coordy = GenRegister::toUniform(sel.selReg(insn.getSrc(1), TYPE_U32), GEN_TYPE_UD);
+      const GenRegister header = GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY_REG)), GEN_TYPE_UD);
+      const GenRegister offsetx = GenRegister::toUniform(sel.getOffsetReg(header, 0, 0 * 4), GEN_TYPE_UD);
+      const GenRegister offsety = GenRegister::toUniform(sel.getOffsetReg(header, 0, 1 * 4), GEN_TYPE_UD);
+      const GenRegister blocksizereg = sel.getOffsetReg(header, 0, 2 * 4);
+
+      // Make header
+      sel.push();
+        // Copy r0 into the header first
+        sel.curr.execWidth = 8;
+        sel.curr.predicate = GEN_PREDICATE_NONE;
+        sel.curr.noMask = 1;
+        sel.MOV(header, GenRegister::ud8grf(0, 0));
+
+        // Update the header with the coord
+        sel.curr.execWidth = 1;
+        sel.MOV(offsetx, coordx);
+        sel.MOV(offsety, coordy);
+        // Update block width and height
+        sel.MOV(blocksizereg, GenRegister::immud(blocksize));
+      sel.pop();
+
+      if (simdWidth * typeSize < 64) {
+        sel.push();
+          sel.curr.execWidth = 8;
+          sel.curr.predicate = GEN_PREDICATE_NONE;
+          sel.curr.noMask = 1;
+          // Now read the data
+          sel.MBREAD(&valuesVec[0], vec_size, header, insn.getImageIndex(), rsize);
+        sel.pop();
+      } else if (simdWidth * typeSize == 64) {
+        sel.push();
+          sel.curr.execWidth = 8;
+          sel.curr.predicate = GEN_PREDICATE_NONE;
+          sel.curr.noMask = 1;
+          sel.MBREAD(&tmpVec[0], vec_size ,header, insn.getImageIndex(), vec_size);
+          for (uint32_t i = 0; i < vec_size; i++)
+            sel.MOV(valuesVec[i], tmpVec[i]);
+
+          // Second half
+          // Update the header with the coord
+          sel.curr.execWidth = 1;
+          sel.ADD(offsetx, offsetx, GenRegister::immud(32));
+
+          // Now read the data
+          sel.curr.execWidth = 8;
+          sel.MBREAD(&tmpVec[0], vec_size, header, insn.getImageIndex(), vec_size);
+
+          // Move the reg to fit vector rule.
+          for (uint32_t i = 0; i < vec_size; i++)
+            sel.MOV(sel.getOffsetReg(valuesVec[i], 0, 32) , tmpVec[i]);
+        sel.pop();
+      } else NOT_IMPLEMENTED;
+
+
       return true;
     }
     DECL_CTOR(MediaBlockReadInstruction, 1, 1);
@@ -7643,17 +7789,84 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp
       using namespace ir;
       uint32_t vec_size = insn.getVectorSize();
       const Type type = insn.getType();
-      const GenRegister coordx = sel.selReg(insn.getSrc(0), TYPE_U32);
-      const GenRegister coordy = sel.selReg(insn.getSrc(1), TYPE_U32);
+      uint32_t simdWidth = sel.curr.execWidth;
+      const uint32_t genType = type == TYPE_U32 ? GEN_TYPE_UD : GEN_TYPE_UW;
+      const RegisterFamily family = getFamily(type);
+      const uint32_t typeSize = type == TYPE_U32 ? 4 : 2;
+      // ushort in simd8 will have half reg, but response length is still 1
+      uint32_t msgsize = simdWidth * vec_size * typeSize / 32;
+      msgsize = msgsize ? msgsize : 1;
+      uint32_t block_width = typeSize * simdWidth;
+      uint32_t blocksize = (block_width - 1) % 32 | (vec_size - 1) << 16;
+
+
       vector<GenRegister> valuesVec;
       vector<GenRegister> tmpVec;
-      for(uint32_t i = 0; i < vec_size; i++)
-      {
-        valuesVec.push_back(sel.selReg(insn.getSrc(2 + i), type));
-        tmpVec.push_back(GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY_DWORD)), GEN_TYPE_UD));
-      }
-      const GenRegister header = GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY_DWORD)), GEN_TYPE_UD);
-      sel.MBWRITE(coordx, coordy, &valuesVec[0], header, &tmpVec[0], insn.getImageIndex(), vec_size);
+      for (uint32_t i = 0; i < vec_size; ++i) {
+         valuesVec.push_back(sel.selReg(insn.getSrc(2 + i), type));
+        if(simdWidth == 16 && typeSize == 4)
+          tmpVec.push_back(GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY_REG)), GEN_TYPE_UD));
+        else
+          tmpVec.push_back(GenRegister::retype(GenRegister::f8grf(sel.reg(family)), genType));
+       }
+      const GenRegister coordx = GenRegister::toUniform(sel.selReg(insn.getSrc(0), TYPE_U32), GEN_TYPE_UD);
+      const GenRegister coordy = GenRegister::toUniform(sel.selReg(insn.getSrc(1), TYPE_U32), GEN_TYPE_UD);
+      const GenRegister header = GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY_REG)), GEN_TYPE_UD);
+      const GenRegister offsetx = GenRegister::toUniform(sel.getOffsetReg(header, 0, 0*4), GEN_TYPE_UD);
+      const GenRegister offsety = GenRegister::toUniform(sel.getOffsetReg(header, 0, 1*4), GEN_TYPE_UD);
+      const GenRegister blocksizereg = sel.getOffsetReg(header, 0, 2*4);
+
+      // Make header
+      sel.push();
+        // Copy r0 into the header first
+        sel.curr.execWidth = 8;
+        sel.curr.predicate = GEN_PREDICATE_NONE;
+        sel.curr.noMask = 1;
+        sel.MOV(header, GenRegister::ud8grf(0, 0));
+
+        // Update the header with the coord
+        sel.curr.execWidth = 1;
+        sel.MOV(offsetx, coordx);
+        sel.MOV(offsety, coordy);
+        // Update block width and height
+        sel.MOV(blocksizereg, GenRegister::immud(blocksize));
+      sel.pop();
+
+      if (simdWidth * typeSize < 64) {
+        for (uint32_t i = 0; i < vec_size; ++i) {
+            sel.MOV(tmpVec[i], valuesVec[i]);
+        }
+        sel.push();
+          sel.curr.execWidth = 8;
+          sel.curr.predicate = GEN_PREDICATE_NONE;
+          sel.curr.noMask = 1;
+          // Now write the data
+          sel.MBWRITE(header, &tmpVec[0], vec_size, insn.getImageIndex(), msgsize);
+        sel.pop();
+      } else if (simdWidth * typeSize == 64) {
+        sel.push();
+          sel.curr.execWidth = 8;
+          sel.curr.predicate = GEN_PREDICATE_NONE;
+          sel.curr.noMask = 1;
+          for (uint32_t i = 0; i < vec_size; i++)
+            sel.MOV(tmpVec[i], valuesVec[i]);
+          sel.MBWRITE(header, &tmpVec[0], vec_size, insn.getImageIndex(), vec_size);
+
+          // Second half
+          // Update the header with the coord
+          sel.curr.execWidth = 1;
+          sel.ADD(offsetx, offsetx, GenRegister::immud(32));
+
+          // Now write the data
+          sel.curr.execWidth = 8;
+          for (uint32_t i = 0; i < vec_size; i++)
+            sel.MOV(tmpVec[i], sel.getOffsetReg(valuesVec[i], 0, 32));
+          sel.MBWRITE(header, &tmpVec[0], vec_size, insn.getImageIndex(), vec_size);
+
+          // Move the reg to fit vector rule.
+        sel.pop();
+      } else NOT_IMPLEMENTED;
+
       return true;
     }
     DECL_CTOR(MediaBlockWriteInstruction, 1, 1);
--
2.7.4

_______________________________________________
Beignet mailing list
Beignet at lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/beignet


More information about the Beignet mailing list