[Beignet] [PATCH V4] Backend: Refine block read/write instruction selection

Yang, Rong R rong.r.yang at intel.com
Mon Dec 26 03:36:40 UTC 2016


Pushed, thanks.

> -----Original Message-----
> From: Beignet [mailto:beignet-bounces at lists.freedesktop.org] On Behalf Of
> Guo, Yejun
> Sent: Monday, December 19, 2016 16:43
> To: Pan, Xiuli <xiuli.pan at intel.com>; beignet at lists.freedesktop.org
> Cc: Pan, Xiuli <xiuli.pan at intel.com>
> Subject: Re: [Beignet] [PATCH V4] Backend: Refine block read/write
> instruction selection
> 
> LGTM, thanks.
> 
> -----Original Message-----
> From: Beignet [mailto:beignet-bounces at lists.freedesktop.org] On Behalf Of
> Xiuli Pan
> Sent: Monday, December 19, 2016 3:58 PM
> To: beignet at lists.freedesktop.org
> Cc: Pan, Xiuli
> Subject: [Beignet] [PATCH V4] Backend: Refine block read/write instruction
> selection
> 
> From: Pan Xiuli <xiuli.pan at intel.com>
> 
> Move the block pack/unpack into instruction selection so that it can
> benefit from optimization. Also rename some variables to avoid misleading
> names, and add some new helper functions to the GenEncoder class.
> V2: Use ud8grf instead of f8grf to save a retype.
> V3: Merge the rename patch and fix some comments.
> V4: Fix some SIMD8-related bugs and comment typos.
> 
> Signed-off-by: Pan Xiuli <xiuli.pan at intel.com>
> ---
>  backend/src/backend/gen8_encoder.cpp       |  40 ++-
>  backend/src/backend/gen_context.cpp        | 459 ++---------------------------
>  backend/src/backend/gen_encoder.cpp        | 105 ++++---
>  backend/src/backend/gen_encoder.hpp        |  18 +-
>  backend/src/backend/gen_insn_selection.cpp | 448
> +++++++++++++++++++++-------
>  5 files changed, 440 insertions(+), 630 deletions(-)
> 
> diff --git a/backend/src/backend/gen8_encoder.cpp
> b/backend/src/backend/gen8_encoder.cpp
> index 8f73346..39dcfd3 100644
> --- a/backend/src/backend/gen8_encoder.cpp
> +++ b/backend/src/backend/gen8_encoder.cpp
> @@ -840,20 +840,15 @@ namespace gbe
>      gen8_insn->bits3.gen8_block_rw_a64.header_present = 1;
>    }
> 
> -  void Gen8Encoder::OBREADA64(GenRegister dst, GenRegister header,
> uint32_t bti, uint32_t size) {
> -   GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
> +  void Gen8Encoder::OBREADA64(GenRegister dst, GenRegister header,
> uint32_t bti, uint32_t ow_size) {
> +    GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
>      const uint32_t msg_length = 1;
> -    uint32_t rsize = size / 2;
> -    uint32_t msgsize = size;
> -    // When size is 1 OWord, which means half a reg, we need to know which
> half to use
> -    if (size == 1) {
> -      if (dst.subnr == 0)
> -        msgsize = 0;
> -      else
> -        msgsize = 1;
> -    }
> -    rsize = rsize == 0 ? 1 : rsize;
> -    const uint32_t response_length = rsize; // Size is in regs
> +    uint32_t sizeinreg = ow_size / 2;
> +    // half reg should also have size 1
> +    sizeinreg = sizeinreg == 0 ? 1 : sizeinreg;
> +    const uint32_t block_size = getOBlockSize(ow_size, dst.subnr == 0);
> +    const uint32_t response_length = sizeinreg; // Size is in reg
> +
>      this->setHeader(insn);
>      this->setDst(insn, GenRegister::uw16grf(dst.nr, 0));
>      this->setSrc0(insn, GenRegister::ud8grf(header.nr, 0));
> @@ -861,21 +856,22 @@ namespace gbe
>      setOBlockRWA64(this,
>                     insn,
>                     bti,
> -                   msgsize,
> +                   block_size,
>                     GEN8_P1_BLOCK_READ_A64,
>                     msg_length,
>                     response_length);
> 
>    }
> 
> -  void Gen8Encoder::OBWRITEA64(GenRegister header, uint32_t bti,
> uint32_t size) {
> -   GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
> -    uint32_t rsize = size / 2;
> -    rsize = rsize == 0 ? 1 : rsize;
> -    const uint32_t msg_length = 1 + rsize; // Size is in owords
> +  void Gen8Encoder::OBWRITEA64(GenRegister header, uint32_t bti,
> uint32_t ow_size) {
> +    GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
> +    uint32_t sizeinreg = ow_size / 2;
> +    // half reg should also have size 1
> +    sizeinreg = sizeinreg == 0 ? 1 : sizeinreg;
> +    const uint32_t msg_length = 1 + sizeinreg; // Size is in reg and header
>      const uint32_t response_length = 0;
> -    uint32_t msgsize = size;
> -    msgsize = msgsize == 1 ? 0 : msgsize;
> +    const uint32_t block_size = getOBlockSize(ow_size);
> +
>      this->setHeader(insn);
>      this->setSrc0(insn, GenRegister::ud8grf(header.nr, 0));
>      this->setSrc1(insn, GenRegister::immud(0));
> @@ -883,7 +879,7 @@ namespace gbe
>      setOBlockRWA64(this,
>                     insn,
>                     bti,
> -                   msgsize,
> +                   block_size,
>                     GEN8_P1_BLOCK_WRITE_A64,
>                     msg_length,
>                     response_length);
> diff --git a/backend/src/backend/gen_context.cpp
> b/backend/src/backend/gen_context.cpp
> index 8288fa5..791e607 100644
> --- a/backend/src/backend/gen_context.cpp
> +++ b/backend/src/backend/gen_context.cpp
> @@ -3551,458 +3551,39 @@ namespace gbe
>    }
> 
>    void GenContext::emitOBReadInstruction(const SelectionInstruction &insn)
> {
> -    const GenRegister dst= ra->genReg(insn.dst(1));
> -    const GenRegister addrreg = ra->genReg(insn.src(0));
> -    uint32_t type = dst.type;
> -    uint32_t typesize = typeSize(type);
> -    const uint32_t vec_size = insn.extra.elem;
> -    const GenRegister tmp = GenRegister::retype(ra->genReg(insn.dst(1 +
> vec_size)), type);
> -    const uint32_t simdWidth = p->curr.execWidth;
> -    const GenRegister header = GenRegister::retype(ra->genReg(insn.dst(0)),
> GEN_TYPE_UD);
> -    const GenRegister addr = GenRegister::toUniform(addrreg, addrreg.type);
> -    GenRegister headeraddr;
> -    bool isA64 = insn.getbti() == 255;
> +    const GenRegister header = ra->genReg(insn.src(0));
> +    const GenRegister tmp = ra->genReg(insn.dst(0));
> +    const uint32_t bti = insn.getbti();
> +    const uint32_t ow_size = insn.extra.elem;
> +    bool isA64 = bti == 255;
>      if (isA64)
> -      headeraddr = GenRegister::retype(GenRegister::offset(header, 0, 0),
> GEN_TYPE_UL);
> +       p->OBREADA64(tmp, header, bti, ow_size);
>      else
> -      headeraddr = GenRegister::offset(header, 0, 2*4);
> -
> -    // Make header
> -    p->push();
> -    {
> -      // Copy r0 into the header first
> -      p->curr.execWidth = 8;
> -      p->curr.predicate = GEN_PREDICATE_NONE;
> -      p->curr.noMask = 1;
> -      p->MOV(header, GenRegister::ud8grf(0, 0));
> -
> -      // Update the header with the current address
> -      p->curr.execWidth = 1;
> -      p->MOV(headeraddr, addr);
> -
> -      // Put zero in the general state base address
> -      if (!isA64)
> -        p->MOV(GenRegister::offset(header, 0, 5 * 4), GenRegister::immud(0));
> -
> -    }
> -    p->pop();
> -    // Now read the data, oword block read can only work with simd16 and no
> mask
> -    if (vec_size == 1) {
> -      p->push();
> -      {
> -        p->curr.execWidth = 16;
> -        p->curr.noMask = 1;
> -        if (isA64) {
> -          //p->curr.execWidth = 8;
> -          p->OBREADA64(dst, header, insn.getbti(), simdWidth * typesize / 16);
> -        }
> -        else
> -          p->OBREAD(dst, header, insn.getbti(), simdWidth * typesize / 16);
> -      }
> -      p->pop();
> -    } else if (vec_size == 2) {
> -      p->push();
> -      {
> -        p->curr.execWidth = 16;
> -        p->curr.noMask = 1;
> -        if (isA64)
> -          p->OBREADA64(tmp, header, insn.getbti(), simdWidth * typesize / 8);
> -        else
> -          p->OBREAD(tmp, header, insn.getbti(), simdWidth * typesize / 8);
> -      }
> -      p->pop();
> -      p->MOV(ra->genReg(insn.dst(1)), GenRegister::offset(tmp, 0));
> -      p->MOV(ra->genReg(insn.dst(2)), GenRegister::offset(tmp, 0,
> simdWidth * typesize ));
> -    } else if (vec_size == 4) {
> -      if (simdWidth == 8) {
> -        p->push();
> -        {
> -          p->curr.execWidth = 16;
> -          p->curr.noMask = 1;
> -          if (isA64)
> -            p->OBREADA64(tmp, header, insn.getbti(), 2 * typesize);
> -          else
> -            p->OBREAD(tmp, header, insn.getbti(), 2 * typesize);
> -        }
> -        p->pop();
> -        for (uint32_t j = 0; j < 4; j++)
> -          p->MOV(ra->genReg(insn.dst(1 + j)), GenRegister::offset(tmp, 0, j *
> simdWidth * typesize ));
> -      } else {
> -        for (uint32_t i = 0; i < typesize / 2; i++) {
> -          if (i > 0) {
> -            p->push();
> -            {
> -              // Update the address in header
> -              p->curr.execWidth = 1;
> -              p->ADD(headeraddr, headeraddr, GenRegister::immud(128));
> -            }
> -            p->pop();
> -          }
> -          if (isA64)
> -            p->OBREADA64(tmp, header, insn.getbti(), 8);
> -          else
> -            p->OBREAD(tmp, header, insn.getbti(), 8);
> -          for (uint32_t j = 0; j < 8 / typesize ; j++)
> -            p->MOV(ra->genReg(insn.dst(1 + j + i * 2)), GenRegister::offset(tmp,
> 0 ,j * simdWidth * typesize ));
> -        }
> -      }
> -    } else if (vec_size == 8) {
> -      if (simdWidth == 8) {
> -        for (uint32_t i = 0; i < typesize / 2; i++) {
> -          if (i > 0) {
> -            p->push();
> -            {
> -              // Update the address in header
> -              p->curr.execWidth = 1;
> -              p->ADD(headeraddr, headeraddr, GenRegister::immud(128));
> -            }
> -            p->pop();
> -          }
> -          p->push();
> -          {
> -            p->curr.execWidth = 16;
> -            p->curr.noMask = 1;
> -            if (isA64)
> -              p->OBREADA64(tmp, header, insn.getbti(), 8);
> -            else
> -              p->OBREAD(tmp, header, insn.getbti(), 8);
> -          }
> -          p->pop();
> -          for (uint32_t j = 0; j < 16 / typesize; j++)
> -            p->MOV(ra->genReg(insn.dst(1 + j + i * 4)), GenRegister::offset(tmp,
> 0, j * simdWidth * typesize ));
> -        }
> -      } else {
> -        for (uint32_t i = 0; i < typesize ; i++) {
> -          if (i > 0) {
> -            p->push();
> -            {
> -              // Update the address in header
> -              p->curr.execWidth = 1;
> -              p->ADD(headeraddr, headeraddr, GenRegister::immud(128));
> -            }
> -            p->pop();
> -          }
> -          if (isA64)
> -            p->OBREADA64(tmp, header, insn.getbti(), 8);
> -          else
> -            p->OBREAD(tmp, header, insn.getbti(), 8);
> -          for (uint32_t j = 0; j < 8 / typesize; j++)
> -            p->MOV(ra->genReg(insn.dst(1 + j + i * 8 / typesize)),
> GenRegister::offset(tmp, 0 ,j * simdWidth * typesize ));
> -        }
> -      }
> -    } else NOT_SUPPORTED;
> +       p->OBREAD(tmp, header, bti, ow_size);
>    }
> 
>    void GenContext::emitOBWriteInstruction(const SelectionInstruction &insn)
> {
> -    const GenRegister addrreg = ra->genReg(insn.src(0));
> -    const GenRegister header = GenRegister::retype(ra->genReg(insn.dst(0)),
> GEN_TYPE_UD);
> -    uint32_t type = ra->genReg(insn.src(1)).type;
> -    uint32_t typesize = typeSize(type);
> -    const uint32_t vec_size = insn.extra.elem;
> -    const GenRegister tmp = GenRegister::offset(header, 1);
> -    const GenRegister addr = GenRegister::toUniform(addrreg, addrreg.type);
> -    GenRegister headeraddr;
> -    bool isA64 = insn.getbti() == 255;
> +    const GenRegister header = ra->genReg(insn.src(0));
> +    const uint32_t bti = insn.getbti();
> +    const uint32_t ow_size = insn.extra.elem;
> +    bool isA64 = bti == 255;
>      if (isA64)
> -      headeraddr = GenRegister::retype(GenRegister::offset(header, 0, 0),
> GEN_TYPE_UL);
> +       p->OBWRITEA64(header, bti, ow_size);
>      else
> -      headeraddr = GenRegister::offset(header, 0, 2*4);
> -    const uint32_t simdWidth = p->curr.execWidth;
> -    uint32_t tmp_size = simdWidth * vec_size / 8;
> -    tmp_size = tmp_size > 4 ? 4 : tmp_size;
> -    uint32_t offset_size = isA64 ? 128 : 8;
> -
> -    p->push();
> -      // Copy r0 into the header first
> -      p->curr.execWidth = 8;
> -      p->curr.predicate = GEN_PREDICATE_NONE;
> -      p->curr.noMask = 1;
> -      p->MOV(header, GenRegister::ud8grf(0,0));
> -
> -      // Update the header with the current address
> -      p->curr.execWidth = 1;
> -      if (isA64)
> -        p->MOV(headeraddr, addr);
> -      else
> -        p->SHR(headeraddr, addr, GenRegister::immud(4));
> -
> -      // Put zero in the general state base address
> -      if (!isA64)
> -        p->MOV(GenRegister::offset(header, 0, 5*4), GenRegister::immud(0));
> -
> -    p->pop();
> -    // Now write the data, oword block write can only work with simd16 and
> no mask
> -    if (vec_size == 1) {
> -      p->MOV(GenRegister::retype(tmp, type), ra->genReg(insn.src(1)));
> -      p->push();
> -      {
> -        p->curr.execWidth = 16;
> -        p->curr.noMask = 1;
> -        if (isA64)
> -          p->OBWRITEA64(header, insn.getbti(), simdWidth * typesize / 16);
> -        else
> -          p->OBWRITE(header, insn.getbti(), simdWidth * typesize / 16);
> -      }
> -      p->pop();
> -    } else if (vec_size == 2) {
> -      p->MOV(GenRegister::retype(GenRegister::offset(tmp, 0, 0), type), ra-
> >genReg(insn.src(1)));
> -      p->MOV(GenRegister::retype(GenRegister::offset(tmp, 0, simdWidth *
> typesize), type), ra->genReg(insn.src(2)));
> -      p->push();
> -      {
> -        p->curr.execWidth = 16;
> -        p->curr.noMask = 1;
> -        if (isA64)
> -          p->OBWRITEA64(header, insn.getbti(), simdWidth * typesize / 8);
> -        else
> -          p->OBWRITE(header, insn.getbti(), simdWidth * typesize / 8);
> -      }
> -      p->pop();
> -    } else if (vec_size == 4) {
> -      if (simdWidth == 8) {
> -        for (uint32_t i = 0; i < 4; i++)
> -          p->MOV(GenRegister::retype(GenRegister::offset(tmp, 0, i *
> simdWidth * typesize), type), ra->genReg(insn.src(1 + i)));
> -        p->push();
> -        {
> -          p->curr.execWidth = 16;
> -          p->curr.noMask = 1;
> -          if (isA64)
> -            p->OBWRITEA64(header, insn.getbti(), 2 * typesize);
> -          else
> -            p->OBWRITE(header, insn.getbti(), 2 * typesize);
> -        }
> -        p->pop();
> -      } else {
> -        for (uint32_t i = 0; i < typesize / 2; i++) {
> -          for (uint32_t j = 0; j < 8 / typesize; j++)
> -            p->MOV(GenRegister::retype(GenRegister::offset(tmp, 0, j *
> simdWidth * typesize), type), ra->genReg(insn.src(1 + j + i * 8 / typesize)));
> -          if (i > 0) {
> -            p->push();
> -            {
> -              // Update the address in header
> -              p->curr.execWidth = 1;
> -              p->ADD(headeraddr, headeraddr, GenRegister::immud(offset_size));
> -            }
> -            p->pop();
> -          }
> -          if (isA64)
> -            p->OBWRITEA64(header, insn.getbti(), 8);
> -          else
> -            p->OBWRITE(header, insn.getbti(), 8);
> -        }
> -      }
> -    } else if (vec_size == 8) {
> -      if (simdWidth == 8) {
> -        for (uint32_t i = 0; i < typesize / 2; i++) {
> -          for (uint32_t j = 0; j < 16 / typesize; j++)
> -            p->MOV(GenRegister::retype(GenRegister::offset(tmp, 0, j *
> simdWidth * typesize), type), ra->genReg(insn.src(1 + j + i * 16 / typesize)));
> -          if (i > 0) {
> -            p->push();
> -            {
> -              // Update the address in header
> -              p->curr.execWidth = 1;
> -              p->ADD(headeraddr, headeraddr, GenRegister::immud(offset_size));
> -            }
> -            p->pop();
> -          }
> -          p->push();
> -          {
> -            p->curr.execWidth = 16;
> -            p->curr.noMask = 1;
> -            if (isA64)
> -              p->OBWRITEA64(header, insn.getbti(), 8);
> -            else
> -              p->OBWRITE(header, insn.getbti(), 8);
> -          }
> -          p->pop();
> -        }
> -      } else {
> -        for (uint32_t i = 0; i < typesize; i++) {
> -          for (uint32_t j = 0; j < 8 / typesize; j++)
> -            p->MOV(GenRegister::retype(GenRegister::offset(tmp, 0, j *
> simdWidth * typesize), type), ra->genReg(insn.src(1 + j + i * 8 / typesize)));
> -          if (i > 0) {
> -            p->push();
> -            {
> -              // Update the address in header
> -              p->curr.execWidth = 1;
> -              p->ADD(headeraddr, headeraddr, GenRegister::immud(offset_size));
> -            }
> -            p->pop();
> -          }
> -          if (isA64)
> -            p->OBWRITEA64(header, insn.getbti(), 8);
> -          else
> -            p->OBWRITE(header, insn.getbti(), 8);
> -        }
> -      }
> -    } else NOT_SUPPORTED;
> -
> +       p->OBWRITE(header, bti, ow_size);
>    }
> 
>    void GenContext::emitMBReadInstruction(const SelectionInstruction &insn)
> {
> -    const GenRegister dst = ra->genReg(insn.dst(1));
> -    const GenRegister coordx = GenRegister::toUniform(ra-
> >genReg(insn.src(0)),GEN_TYPE_D);
> -    const GenRegister coordy = GenRegister::toUniform(ra-
> >genReg(insn.src(1)),GEN_TYPE_D);
> -    const GenRegister header = GenRegister::retype(ra->genReg(insn.dst(0)),
> GEN_TYPE_UD);
> -    const GenRegister offsetx = GenRegister::offset(header, 0, 0*4);
> -    const GenRegister offsety = GenRegister::offset(header, 0, 1*4);
> -    const GenRegister blocksizereg = GenRegister::offset(header, 0, 2*4);
> -    size_t vec_size = insn.extra.elem;
> -    uint32_t type = dst.type;
> -    uint32_t typesize = typeSize(type);
> -    uint32_t block_width = typesize * simdWidth;
> -    uint32_t blocksize = (block_width - 1) % 32 | (vec_size - 1) << 16;
> -
> -    if (simdWidth == 8)
> -    {
> -      p->push();
> -        // Copy r0 into the header first
> -        p->curr.execWidth = 8;
> -        p->curr.predicate = GEN_PREDICATE_NONE;
> -        p->curr.noMask = 1;
> -        p->MOV(header, GenRegister::ud8grf(0,0));
> -
> -        // Update the header with the coord
> -        p->curr.execWidth = 1;
> -        p->MOV(offsetx, coordx);
> -        p->MOV(offsety, coordy);
> -        // Update block width and height
> -        p->MOV(blocksizereg, GenRegister::immud(blocksize));
> -        p->curr.execWidth = 8;
> -        // ushort in simd8 will have half reg, but response lenght is still 1
> -        uint32_t rsize = vec_size * typesize / 4;
> -        rsize = rsize ? rsize : 1;
> -        // Now read the data
> -        p->MBREAD(dst, header, insn.getbti(), rsize);
> -      p->pop();
> -
> -    }
> -    else if (simdWidth == 16)
> -    {
> -      const GenRegister tmp = GenRegister::retype(ra-
> >genReg(insn.dst(vec_size + 1)), GEN_TYPE_UD);
> -      p->push();
> -        // Copy r0 into the header first
> -        p->curr.execWidth = 8;
> -        p->curr.predicate = GEN_PREDICATE_NONE;
> -        p->curr.noMask = 1;
> -        p->MOV(header, GenRegister::ud8grf(0,0));
> -
> -        // First half
> -        // Update the header with the coord
> -        p->curr.execWidth = 1;
> -        p->MOV(offsetx, coordx);
> -        p->MOV(offsety, coordy);
> -        // Update block width and height
> -        p->MOV(blocksizereg, GenRegister::immud(blocksize));
> -        // Now read the data
> -        p->curr.execWidth = 8;
> -        p->MBREAD(tmp, header, insn.getbti(), vec_size);
> -        for (uint32_t i = 0; i < vec_size; i++)
> -          p->MOV(GenRegister::retype(ra->genReg(insn.dst(i +
> 1)),GEN_TYPE_UD), GenRegister::offset(tmp, i));
> -
> -        if (typesize == 4)
> -        {
> -          // Second half
> -          // Update the header with the coord
> -          p->curr.execWidth = 1;
> -          p->ADD(offsetx, offsetx, GenRegister::immud(32));
> -
> -          // Now read the data
> -          p->curr.execWidth = 8;
> -          p->MBREAD(tmp, header, insn.getbti(), vec_size);
> -
> -          // Move the reg to fit vector rule.
> -          for (uint32_t i = 0; i < vec_size; i++)
> -            p->MOV(GenRegister::offset(ra->genReg(insn.dst(i + 1)), 1),
> -                   GenRegister::offset(tmp, i));
> -        }
> -      p->pop();
> -    } else NOT_IMPLEMENTED;
> +    const GenRegister dst = ra->genReg(insn.dst(0));
> +    const GenRegister header = ra->genReg(insn.src(0));
> +    const size_t response_size = insn.extra.elem;
> +    p->MBREAD(dst, header, insn.getbti(), response_size);
>    }
> 
>    void GenContext::emitMBWriteInstruction(const SelectionInstruction
> &insn) {
> -    const GenRegister coordx = GenRegister::toUniform(ra-
> >genReg(insn.src(0)), GEN_TYPE_D);
> -    const GenRegister coordy = GenRegister::toUniform(ra-
> >genReg(insn.src(1)), GEN_TYPE_D);
> -    const GenRegister header = GenRegister::retype(ra->genReg(insn.dst(0)),
> GEN_TYPE_UD);
> -    const GenRegister tmp = GenRegister::offset(header, 1);
> -    GenRegister offsetx, offsety, blocksizereg;
> -    size_t vec_size = insn.extra.elem;
> -    uint32_t type = ra->genReg(insn.src(2)).type;
> -    uint32_t typesize = typeSize(type);
> -    uint32_t block_width = typesize * simdWidth;
> -    uint32_t blocksize = (block_width - 1) % 32 | (vec_size-1) << 16;
> -
> -    offsetx = GenRegister::offset(header, 0, 0*4);
> -    offsety = GenRegister::offset(header, 0, 1*4);
> -    blocksizereg = GenRegister::offset(header, 0, 2*4);
> -
> -    if (simdWidth == 8)
> -    {
> -      p->push();
> -        // Copy r0 into the header first
> -        p->curr.execWidth = 8;
> -        p->curr.predicate = GEN_PREDICATE_NONE;
> -        p->curr.noMask = 1;
> -        p->MOV(header, GenRegister::ud8grf(0,0));
> -
> -        // Update the header with the coord
> -        p->curr.execWidth = 1;
> -        p->MOV(offsetx, coordx);
> -        p->MOV(offsety, coordy);
> -        // Update block width and height
> -        p->MOV(blocksizereg, GenRegister::immud(blocksize));
> -        p->curr.execWidth = 8;
> -        // Mov what we need into msgs
> -        for(uint32_t i = 0; i < vec_size; i++)
> -          p->MOV(GenRegister::retype(GenRegister::offset(tmp, 0, i *
> block_width), type),
> -                 ra->genReg(insn.src(2 + i)));
> -        // ushort in simd8 will have half reg, but reponse lenght is still 1
> -        uint32_t rsize = vec_size * typesize / 4;
> -        rsize = rsize ? rsize : 1;
> -        // Now read the data
> -        p->MBWRITE(header, insn.getbti(), rsize);
> -      p->pop();
> -
> -    }
> -    else
> -    {
> -      p->push();
> -        // Copy r0 into the header first
> -        p->curr.execWidth = 8;
> -        p->curr.predicate = GEN_PREDICATE_NONE;
> -        p->curr.noMask = 1;
> -        p->MOV(header, GenRegister::ud8grf(0,0));
> -
> -        // First half
> -        // Update the header with the coord
> -        p->curr.execWidth = 1;
> -        p->MOV(offsetx, coordx);
> -        p->MOV(offsety, coordy);
> -        // Update block width and height
> -        p->MOV(blocksizereg, GenRegister::immud(blocksize));
> -        // Now read the data
> -        p->curr.execWidth = 8;
> -        // Mov what we need into msgs
> -        for(uint32_t i = 0; i < vec_size; i++)
> -          p->MOV(GenRegister::offset(tmp, i), GenRegister::retype(ra-
> >genReg(insn.src(2 + i)), GEN_TYPE_UD));
> -        p->MBWRITE(header, insn.getbti(), vec_size);
> -
> -        if (typesize == 4)
> -        {
> -          // Second half
> -          // Update the header with the coord
> -          p->curr.execWidth = 1;
> -          p->ADD(offsetx, offsetx, GenRegister::immud(32));
> -
> -          p->curr.execWidth = 8;
> -          // Mov what we need into msgs
> -          for(uint32_t i = 0; i < vec_size; i++)
> -            p->MOV(GenRegister::offset(header, 1 + i), GenRegister::Qn(ra-
> >genReg(insn.src(2 + i)), 1));
> -          // Now write the data
> -          p->MBWRITE(header, insn.getbti(), vec_size);
> -        }
> -
> -      p->pop();
> -    }
> +    const GenRegister header = ra->genReg(insn.dst(0));
> +    const size_t data_size = insn.extra.elem;
> +    p->MBWRITE(header, insn.getbti(), data_size);
>    }
> 
>    BVAR(OCL_OUTPUT_REG_ALLOC, false);
> diff --git a/backend/src/backend/gen_encoder.cpp
> b/backend/src/backend/gen_encoder.cpp
> index 49d93e8..1bca668 100644
> --- a/backend/src/backend/gen_encoder.cpp
> +++ b/backend/src/backend/gen_encoder.cpp
> @@ -257,32 +257,47 @@ namespace gbe
>        NOT_SUPPORTED;
>    }
> 
> -  static void setOBlockRW(GenEncoder *p,
> -                          GenNativeInstruction *insn,
> -                          uint32_t bti,
> -                          uint32_t size,
> -                          uint32_t msg_type,
> -                          uint32_t msg_length,
> -                          uint32_t response_length)
> +  void GenEncoder::setOBlockRW(GenNativeInstruction *insn,
> +                               uint32_t bti,
> +                               uint32_t block_size,
> +                               uint32_t msg_type,
> +                               uint32_t msg_length,
> +                               uint32_t response_length)
>    {
>      const GenMessageTarget sfid = GEN_SFID_DATAPORT_DATA;
> -    p->setMessageDescriptor(insn, sfid, msg_length, response_length);
> -    assert(size == 0 || size == 1 || size == 2 || size == 4 || size == 8);
> +    setMessageDescriptor(insn, sfid, msg_length, response_length);
>      insn->bits3.gen7_oblock_rw.msg_type = msg_type;
>      insn->bits3.gen7_oblock_rw.bti = bti;
> -    insn->bits3.gen7_oblock_rw.block_size = size <=  2 ? size : (size == 4 ? 3 :
> 4);
> +    insn->bits3.gen7_oblock_rw.block_size = block_size;
>      insn->bits3.gen7_oblock_rw.header_present = 1;
>    }
> 
> -  static void setMBlockRW(GenEncoder *p,
> -                          GenNativeInstruction *insn,
> -                          uint32_t bti,
> -                          uint32_t msg_type,
> -                          uint32_t msg_length,
> -                          uint32_t response_length)
> +  uint32_t GenEncoder::getOBlockSize(uint32_t oword_size, bool low_half)
> +  {
> +    /* 000: 1 OWord, read into or written from the low 128 bits of the
> destination register.
> +     * 001: 1 OWord, read into or written from the high 128 bits of the
> destination register.
> +     * 010: 2 OWords
> +     * 011: 4 OWords
> +     * 100: 8 OWords */
> +    switch(oword_size)
> +    {
> +      case 1: return low_half ? 0 : 1;
> +      case 2: return 2;
> +      case 4: return 3;
> +      case 8: return 4;
> +      default: NOT_SUPPORTED;
> +    }
> +    return 0;
> +  }
> +
> +  void GenEncoder::setMBlockRW(GenNativeInstruction *insn,
> +                               uint32_t bti,
> +                               uint32_t msg_type,
> +                               uint32_t msg_length,
> +                               uint32_t response_length)
>    {
>      const GenMessageTarget sfid = GEN_SFID_DATAPORT1_DATA;
> -    p->setMessageDescriptor(insn, sfid, msg_length, response_length);
> +    setMessageDescriptor(insn, sfid, msg_length, response_length);
>      insn->bits3.gen7_mblock_rw.msg_type = msg_type;
>      insn->bits3.gen7_mblock_rw.bti = bti;
>      insn->bits3.gen7_mblock_rw.header_present = 1;
> @@ -1312,80 +1327,72 @@ namespace gbe
>       setScratchMessage(this, insn, offset, block_size, channel_mode,
> GEN_SCRATCH_READ, 1, dst_num);
>    }
> 
> -  void GenEncoder::OBREAD(GenRegister dst, GenRegister header, uint32_t
> bti, uint32_t size) {
> +  void GenEncoder::OBREAD(GenRegister dst, GenRegister header, uint32_t
> bti, uint32_t ow_size) {
>      GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
>      const uint32_t msg_length = 1;
> -    uint32_t rsize = size / 2;
> -    uint32_t msgsize = size;
> -    // When size is 1 OWord, which means half a reg, we need to know which
> half to use
> -    if (size == 1) {
> -      if (dst.subnr == 0)
> -        msgsize = 0;
> -      else
> -        msgsize = 1;
> -    }
> -    rsize = rsize == 0 ? 1 : rsize;
> -    const uint32_t response_length = rsize; // Size is in regs
> +    uint32_t sizeinreg = ow_size / 2;
> +    // half reg should also have size 1
> +    sizeinreg = sizeinreg == 0 ? 1 : sizeinreg;
> +    const uint32_t block_size = getOBlockSize(ow_size, dst.subnr == 0);
> +    const uint32_t response_length = sizeinreg; // Size is in reg
> +
>      this->setHeader(insn);
>      this->setDst(insn, GenRegister::uw16grf(dst.nr, 0));
>      this->setSrc0(insn, GenRegister::ud8grf(header.nr, 0));
>      this->setSrc1(insn, GenRegister::immud(0));
> -    setOBlockRW(this,
> -                insn,
> +    setOBlockRW(insn,
>                  bti,
> -                msgsize,
> +                block_size,
>                  GEN7_UNALIGNED_OBLOCK_READ,
>                  msg_length,
>                  response_length);
>    }
> 
> -  void GenEncoder::OBWRITE(GenRegister header, uint32_t bti, uint32_t size)
> {
> +  void GenEncoder::OBWRITE(GenRegister header, uint32_t bti, uint32_t
> ow_size) {
>      GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
> -    uint32_t rsize = size / 2;
> -    rsize = rsize == 0 ? 1 : rsize;
> -    const uint32_t msg_length = 1 + rsize; // Size is in owords
> +    uint32_t sizeinreg = ow_size / 2;
> +    // half reg should also have size 1
> +    sizeinreg = sizeinreg == 0 ? 1 : sizeinreg;
> +    const uint32_t msg_length = 1 + sizeinreg; // Size is in reg and header
>      const uint32_t response_length = 0;
> -    uint32_t msgsize = size;
> -    msgsize = msgsize == 1 ? 0 : msgsize;
> +    const uint32_t block_size = getOBlockSize(ow_size);
> +
>      this->setHeader(insn);
>      this->setSrc0(insn, GenRegister::ud8grf(header.nr, 0));
>      this->setSrc1(insn, GenRegister::immud(0));
>      this->setDst(insn, GenRegister::retype(GenRegister::null(),
> GEN_TYPE_UW));
> -    setOBlockRW(this,
> -                insn,
> +    setOBlockRW(insn,
>                  bti,
> -                msgsize,
> +                block_size,
>                  GEN7_OBLOCK_WRITE,
>                  msg_length,
>                  response_length);
>    }
> 
> -  void GenEncoder::MBREAD(GenRegister dst, GenRegister header, uint32_t
> bti, uint32_t size) {
> +  void GenEncoder::MBREAD(GenRegister dst, GenRegister header,
> uint32_t bti, uint32_t response_size) {
>      GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
>      const uint32_t msg_length = 1;
> -    const uint32_t response_length = size; // Size of registers
> +    const uint32_t response_length = response_size; // Size of registers
>      this->setHeader(insn);
>      this->setDst(insn, GenRegister::ud8grf(dst.nr, 0));
>      this->setSrc0(insn, GenRegister::ud8grf(header.nr, 0));
>      this->setSrc1(insn, GenRegister::immud(0));
> -    setMBlockRW(this,
> -                insn,
> +    setMBlockRW(insn,
>                  bti,
>                  GEN75_P1_MEDIA_BREAD,
>                  msg_length,
>                  response_length);
>    }
> 
> -  void GenEncoder::MBWRITE(GenRegister header, uint32_t bti, uint32_t
> size) {
> +  void GenEncoder::MBWRITE(GenRegister header, uint32_t bti, uint32_t
> data_size) {
>      GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
> -    const uint32_t msg_length = 1 + size;
> +    const uint32_t msg_length = 1 + data_size;
>      const uint32_t response_length = 0; // Size of registers
>      this->setHeader(insn);
>      this->setDst(insn, GenRegister::retype(GenRegister::null(),
> GEN_TYPE_UW));
>      this->setSrc0(insn, GenRegister::ud8grf(header.nr, 0));
>      this->setSrc1(insn, GenRegister::immud(0));
> -    setMBlockRW(this,
> -                insn,
> +    setMBlockRW(insn,
>                  bti,
>                  GEN75_P1_MEDIA_TYPED_BWRITE,
>                  msg_length,
> diff --git a/backend/src/backend/gen_encoder.hpp
> b/backend/src/backend/gen_encoder.hpp
> index e5eb2e2..46ec53b 100644
> --- a/backend/src/backend/gen_encoder.hpp
> +++ b/backend/src/backend/gen_encoder.hpp
> @@ -286,18 +286,24 @@ namespace gbe
>      virtual bool canHandleLong(uint32_t opcode, GenRegister dst,
> GenRegister src0,
>                              GenRegister src1 = GenRegister::null());
>      virtual void handleDouble(GenEncoder *p, uint32_t opcode, GenRegister
> dst, GenRegister src0, GenRegister src1 = GenRegister::null());
> +
> +    /*! OBlock helper function */
> +    uint32_t getOBlockSize(uint32_t oword_size, bool low_half = true);
> +    void setMBlockRW(GenNativeInstruction *insn, uint32_t bti, uint32_t
> msg_type, uint32_t msg_length, uint32_t response_length);
> +    void setOBlockRW(GenNativeInstruction *insn, uint32_t bti, uint32_t
> block_size, uint32_t msg_type, uint32_t msg_length, uint32_t
> response_lengtha);
> +
>      /*! OBlock read */
> -    void OBREAD(GenRegister dst, GenRegister header, uint32_t bti, uint32_t
> elemSize);
> +    void OBREAD(GenRegister dst, GenRegister header, uint32_t bti, uint32_t
> ow_size);
>      /*! OBlock write */
> -    void OBWRITE(GenRegister header, uint32_t bti, uint32_t elemSize);
> +    void OBWRITE(GenRegister header, uint32_t bti, uint32_t ow_size);
>      /*! MBlock read */
> -    virtual void MBREAD(GenRegister dst, GenRegister header, uint32_t bti,
> uint32_t elemSize);
> +    virtual void MBREAD(GenRegister dst, GenRegister header, uint32_t bti,
> uint32_t response_size);
>      /*! MBlock write */
> -    virtual void MBWRITE(GenRegister header, uint32_t bti, uint32_t
> elemSize);
> +    virtual void MBWRITE(GenRegister header, uint32_t bti, uint32_t
> data_size);
>      /*! A64 OBlock read */
> -    virtual void OBREADA64(GenRegister dst, GenRegister header, uint32_t
> bti, uint32_t elemSize);
> +    virtual void OBREADA64(GenRegister dst, GenRegister header, uint32_t
> bti, uint32_t ow_size);
>      /*! A64 OBlock write */
> -    virtual void OBWRITEA64(GenRegister header, uint32_t bti, uint32_t
> elemSize);
> +    virtual void OBWRITEA64(GenRegister header, uint32_t bti, uint32_t
> ow_size);
> 
>      GBE_CLASS(GenEncoder); //!< Use custom allocators
>      virtual void alu3(uint32_t opcode, GenRegister dst,
> diff --git a/backend/src/backend/gen_insn_selection.cpp
> b/backend/src/backend/gen_insn_selection.cpp
> index 1cd6137..223c384 100644
> --- a/backend/src/backend/gen_insn_selection.cpp
> +++ b/backend/src/backend/gen_insn_selection.cpp
> @@ -759,13 +759,13 @@ namespace gbe
>      void SUBGROUP_OP(uint32_t wg_op, Reg dst, GenRegister src,
>                        GenRegister tmpData1, GenRegister tmpData2);
>      /*! Oblock read */
> -    void OBREAD(GenRegister* dsts, uint32_t vec_size, GenRegister addr,
> GenRegister header, uint32_t bti, GenRegister* tmp, uint32_t tmp_size);
> +    void OBREAD(GenRegister* dsts, uint32_t tmp_size, GenRegister header,
> uint32_t bti, uint32_t ow_size);
>      /*! Oblock write */
> -    void OBWRITE(GenRegister addr, GenRegister* values, uint32_t vec_size,
> GenRegister header, uint32_t bti, GenRegister* tmp, uint32_t tmp_size);
> +    void OBWRITE(GenRegister header, GenRegister* values, uint32_t
> tmp_size, uint32_t bti, uint32_t ow_size);
>      /*! Media block read */
> -    void MBREAD(GenRegister* dsts, GenRegister coordx, GenRegister
> coordy, GenRegister header, GenRegister* tmp, uint32_t bti, uint32_t
> vec_size);
> +    void MBREAD(GenRegister* dsts, uint32_t tmp_size, GenRegister header,
> uint32_t bti, uint32_t response_size);
>      /*! Media block write */
> -    void MBWRITE(GenRegister coordx, GenRegister coordy, GenRegister*
> values, GenRegister header, GenRegister* tmp, uint32_t bti, uint32_t
> vec_size);
> +    void MBWRITE(GenRegister header, GenRegister* values, uint32_t
> tmp_size, uint32_t bti, uint32_t data_size);
> 
>      /* common functions for both binary instruction and sel_cmp and
> compare instruction.
>         It will handle the IMM or normal register assignment, and will try to avoid
> LOADI
> @@ -2267,118 +2267,84 @@ namespace gbe
>    }
>    void Selection::Opaque::OBREAD(GenRegister* dsts,
>                                   uint32_t vec_size,
> -                                 GenRegister addr,
>                                   GenRegister header,
>                                   uint32_t bti,
> -                                 GenRegister* tmp,
> -                                 uint32_t tmp_size) {
> -    SelectionInstruction *insn = this->appendInsn(SEL_OP_OBREAD, 1 +
> vec_size + tmp_size, 1);
> +                                 uint32_t ow_size) {
> +    SelectionInstruction *insn = this->appendInsn(SEL_OP_OBREAD, vec_size,
> 1);
>      SelectionVector *vector = this->appendVector();
> -    insn->dst(0) = header;
> +    insn->src(0) = header;
>      for (uint32_t i = 0; i < vec_size; ++i)
> -      insn->dst(1 + i) = dsts[i];
> -    for (uint32_t i = 0; i < tmp_size; ++i)
> -      insn->dst(1 + i + vec_size) = tmp[i];
> -    insn->src(0) = addr;
> +      insn->dst(i) = dsts[i];
>      insn->setbti(bti);
> -    insn->extra.elem = vec_size; // number of vector size
> +    insn->extra.elem = ow_size; // number of OWord size
> 
>      // tmp regs for OWORD read dst
> -    vector->regNum = tmp_size;
> -    vector->reg = &insn->dst(1 + vec_size);
> -    vector->offsetID = 1 + vec_size;
> +    vector->regNum = vec_size;
> +    vector->reg = &insn->dst(0);
> +    vector->offsetID = 0;
>      vector->isSrc = 0;
>    }
> 
> -  void Selection::Opaque::OBWRITE(GenRegister addr,
> +  void Selection::Opaque::OBWRITE(GenRegister header,
>                                    GenRegister* values,
>                                    uint32_t vec_size,
> -                                  GenRegister header,
>                                    uint32_t bti,
> -                                  GenRegister* tmp,
> -                                  uint32_t tmp_size) {
> -    SelectionInstruction *insn = this->appendInsn(SEL_OP_OBWRITE,
> tmp_size + 1, vec_size + 1);
> +                                  uint32_t ow_size) {
> +    SelectionInstruction *insn = this->appendInsn(SEL_OP_OBWRITE, 0,
> vec_size + 1);
>      SelectionVector *vector = this->appendVector();
> -    insn->src(0) = addr;
> +    insn->src(0) = header;
>      for (uint32_t i = 0; i < vec_size; ++i)
>        insn->src(i + 1) = values[i];
> -    insn->dst(0) = header;
> -    for (uint32_t i = 0; i < tmp_size; ++i)
> -      insn->dst(i + 1) = tmp[i];
>      insn->setbti(bti);
> -    insn->extra.elem = vec_size; // number of vector_size
> +    insn->extra.elem = ow_size; // number of OWord_size
> 
> -    // tmp regs for OWORD read dst
> -    vector->regNum = tmp_size + 1;
> -    vector->reg = &insn->dst(0);
> +    // tmp regs for OWORD write header and values
> +    vector->regNum = vec_size + 1;
> +    vector->reg = &insn->src(0);
>      vector->offsetID = 0;
> -    vector->isSrc = 0;
> +    vector->isSrc = 1;
> +
>    }
> 
>    void Selection::Opaque::MBREAD(GenRegister* dsts,
> -                                 GenRegister coordx,
> -                                 GenRegister coordy,
> +                                 uint32_t tmp_size,
>                                   GenRegister header,
> -                                 GenRegister* tmp,
>                                   uint32_t bti,
> -                                 uint32_t vec_size) {
> -
> -    uint32_t simdWidth = curr.execWidth;
> -    SelectionInstruction *insn = this->appendInsn(SEL_OP_MBREAD, vec_size
> * simdWidth / 8 + 1, 2);
> -    insn->dst(0) = header;
> -    for (uint32_t i = 0; i < vec_size; ++i) {
> -      insn->dst(i + 1) = dsts[i];
> -      if(simdWidth == 16)
> -        insn->dst(i + vec_size + 1) = tmp[i];
> -    }
> -    insn->src(0) = coordx;
> -    insn->src(1) = coordy;
> +                                 uint32_t response_size) {
> +
> +    SelectionInstruction *insn = this->appendInsn(SEL_OP_MBREAD,
> tmp_size, 1);
> +    insn->src(0) = header;
>      insn->setbti(bti);
> -    insn->extra.elem = vec_size; // vector size
> +    insn->extra.elem = response_size; // send response length
> 
> -    // Only in simd 8 the data is in vector form
> -    if(simdWidth == 8) {
> -      SelectionVector *vector = this->appendVector();
> -      vector->regNum = vec_size;
> -      vector->reg = &insn->dst(1);
> -      vector->offsetID = 1;
> -      vector->isSrc = 0;
> -    }
> -    if(simdWidth == 16)
> -    {
> -      SelectionVector *vectortmp = this->appendVector();
> -      vectortmp->regNum = vec_size;
> -      vectortmp->reg = &insn->dst(vec_size + 1);
> -      vectortmp->offsetID = vec_size + 1;
> -      vectortmp->isSrc = 0;
> +    for (uint32_t i = 0; i < tmp_size; ++i) {
> +      insn->dst(i) = dsts[i];
>      }
> +    SelectionVector *vector = this->appendVector();
> +    vector->regNum = tmp_size;
> +    vector->reg = &insn->dst(0);
> +    vector->offsetID = 0;
> +    vector->isSrc = 0;
>    }
> 
> -  void Selection::Opaque::MBWRITE(GenRegister coordx,
> -                                  GenRegister coordy,
> +  void Selection::Opaque::MBWRITE(GenRegister header,
>                                    GenRegister* values,
> -                                  GenRegister header,
> -                                  GenRegister* tmp,
> +                                  uint32_t tmp_size,
>                                    uint32_t bti,
> -                                  uint32_t vec_size) {
> -    SelectionInstruction *insn = this->appendInsn(SEL_OP_MBWRITE, 1 +
> vec_size, 2 + vec_size);
> +                                  uint32_t data_size) {
> +    SelectionInstruction *insn = this->appendInsn(SEL_OP_MBWRITE, 0, 1 +
> tmp_size);
>      SelectionVector *vector = this->appendVector();
> -    insn->src(0) = coordx;
> -    insn->src(1) = coordy;
> -    for (uint32_t i = 0; i < vec_size; ++i)
> -      insn->src(2 + i) = values[i];
> -    insn->dst(0) = header;
> -    for (uint32_t i = 0; i < vec_size; ++i)
> -      insn->dst(1 + i) = tmp[i];
> -    insn->state = this->curr;
> +    insn->src(0) = header;
> +    for (uint32_t i = 0; i < tmp_size; ++i)
> +      insn->src(1 + i) = values[i];
>      insn->setbti(bti);
> -    insn->extra.elem = vec_size; // vector size
> +    insn->extra.elem = data_size; // msg data part size
> 
>      // We need to put the header and the data together
> -    vector->regNum = 1 + vec_size;
> -    vector->reg = &insn->dst(0);
> +    vector->regNum = 1 + tmp_size;
> +    vector->reg = &insn->src(0);
>      vector->offsetID = 0;
> -    vector->isSrc = 0;
> +    vector->isSrc = 1;
>    }
> 
>    // Boiler plate to initialize the selection library at c++ pre-main
> @@ -4715,18 +4681,79 @@ extern bool OCL_DEBUGINFO; // first defined by
> calling BVAR in program.cpp
>        const uint32_t simdWidth = sel.ctx.getSimdWidth();
>        const Type type = insn.getValueType();
>        const uint32_t typeSize = type == TYPE_U32 ? 4 : 2;
> -      const GenRegister header =
> GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY_DWORD)),
> GEN_TYPE_UD);
> +      const uint32_t genType = type == TYPE_U32 ? GEN_TYPE_UD :
> GEN_TYPE_UW;
> +      const RegisterFamily family = getFamily(type);
> +      bool isA64 = SI == 255;
> +
> +      const GenRegister header = GenRegister::ud8grf(sel.reg(FAMILY_REG));
>        vector<GenRegister> valuesVec;
> +      vector<GenRegister> tmpVec;
>        for(uint32_t i = 0; i < vec_size; i++)
>          valuesVec.push_back(sel.selReg(insn.getValue(i), type));
> -      // check tmp_size for OWORD read need, max 8 OWROD thus 4 regs
> -      uint32_t tmp_size = simdWidth * vec_size * typeSize / 32;
> -      tmp_size = tmp_size == 0 ? 1 : tmp_size;
> -      tmp_size = tmp_size > 4 ? 4 : tmp_size;
> -      vector<GenRegister> tmpVec;
> +
> +      GenRegister headeraddr;
> +      if (isA64)
> +        headeraddr = GenRegister::retype(sel.getOffsetReg(header, 0, 0),
> GEN_TYPE_UL);
> +      else
> +        headeraddr = sel.getOffsetReg(header, 0, 2 * 4);
> +      // Make header
> +      sel.push();
> +      {
> +        // Copy r0 into the header first
> +        sel.curr.execWidth = 8;
> +        sel.curr.predicate = GEN_PREDICATE_NONE;
> +        sel.curr.noMask = 1;
> +        sel.MOV(header, GenRegister::ud8grf(0, 0));
> +
> +        // Update the header with the current address
> +        sel.curr.execWidth = 1;
> +
> +        // Put zero in the general state base address
> +        if (isA64)
> +          sel.MOV(headeraddr, GenRegister::toUniform(address,
> GEN_TYPE_UL));
> +        else {
> +          sel.MOV(headeraddr, GenRegister::toUniform(address,
> GEN_TYPE_UD));
> +          sel.MOV(sel.getOffsetReg(header, 0, 5 * 4), GenRegister::immud(0));
> +        }
> +      }
> +      sel.pop();
> +
> +      /* For block read we need to unpack the block data into values, and for
> different
> +       * simdwidth and vector size with different type size, we may need to
> split the
> +       * block read send message.
> +       * We can only get a send message with 5 reg length
> +       * so for different combination we have different message length and
> tmp vector size
> +       *              |  simd8  | simd16 |  simd8 | simd16
> +       *  r0  |header |         |        |        |
> +       *  r1  |data   |  w0,w1  |   w0   |   dw0  |  dw0
> +       *  r2  |data   |  w2,w3  |   w1   |   dw1  |  dw0
> +       *  r3  |data   | ......  | ...... | ...... |  dw1
> +       *  r4  |data   | ....... | ...... | ...... |  dw1
> +       */
> +
> +      uint32_t totalSize = simdWidth * typeSize * vec_size;
> +      uint32_t valueSize = simdWidth * typeSize;
> +      uint32_t tmp_size = totalSize > 128 ? (128 / valueSize) : vec_size;
> +      uint32_t msg_num = vec_size / tmp_size;
> +      uint32_t ow_size = msg_num > 1 ? 8 : (totalSize / 16);
> +
>        for(uint32_t i = 0; i < tmp_size; i++)
> -
> tmpVec.push_back(GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY
> _DWORD)), GEN_TYPE_UD));
> -      sel.OBREAD(&valuesVec[0], vec_size, address, header, SI, &tmpVec[0],
> tmp_size);
> +
> tmpVec.push_back(GenRegister::retype(GenRegister::f8grf(sel.reg(family)),
> genType));
> +      for (uint32_t i = 0; i < msg_num; i++) {
> +          if (i > 0) {
> +            sel.push();
> +            {
> +              // Update the address in header
> +              sel.curr.execWidth = 1;
> +              sel.ADD(headeraddr, headeraddr, GenRegister::immud(128));
> +            }
> +            sel.pop();
> +          }
> +          sel.OBREAD(&tmpVec[0], tmp_size, header, SI, ow_size);
> +          for (uint32_t j = 0; j < tmp_size; j++)
> +            sel.MOV(valuesVec[j + i * tmp_size], tmpVec[j]);
> +      }
> +
>      }
> 
>      // check whether all binded table index point to constant memory
> @@ -5161,18 +5188,87 @@ extern bool OCL_DEBUGINFO; // first defined by
> calling BVAR in program.cpp
>        const uint32_t simdWidth = sel.ctx.getSimdWidth();
>        const Type type = insn.getValueType();
>        const uint32_t typeSize = type == TYPE_U32 ? 4 : 2;
> -      const GenRegister header =
> GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY_DWORD)),
> GEN_TYPE_UD);
> +      const uint32_t genType = type == TYPE_U32 ? GEN_TYPE_UD :
> GEN_TYPE_UW;
> +      const RegisterFamily family = getFamily(type);
> +      bool isA64 = SI == 255;
> +
> +      const GenRegister header = GenRegister::ud8grf(sel.reg(FAMILY_REG));
>        vector<GenRegister> valuesVec;
> +      vector<GenRegister> tmpVec;
>        for(uint32_t i = 0; i < vec_size; i++)
>          valuesVec.push_back(sel.selReg(insn.getValue(i), type));
> -      // check tmp_size for OWORD write need, max 8 OWROD thus 4 regs
> -      uint32_t tmp_size = simdWidth * vec_size * typeSize / 32;
> -      tmp_size = tmp_size == 0 ? 1 : tmp_size;
> -      tmp_size = tmp_size > 4 ? 4 : tmp_size;
> -      vector<GenRegister> tmpVec;
> +
> +      GenRegister headeraddr;
> +      if (isA64)
> +        headeraddr = GenRegister::retype(sel.getOffsetReg(header, 0, 0),
> GEN_TYPE_UL);
> +      else
> +        headeraddr = sel.getOffsetReg(header, 0, 2 * 4);
> +      // Make header
> +      sel.push();
> +      {
> +        // Copy r0 into the header first
> +        sel.curr.execWidth = 8;
> +        sel.curr.predicate = GEN_PREDICATE_NONE;
> +        sel.curr.noMask = 1;
> +        sel.MOV(header, GenRegister::ud8grf(0, 0));
> +
> +        // Update the header with the current address
> +        sel.curr.execWidth = 1;
> +
> +        // Put zero in the general state base address
> +        if (isA64)
> +          sel.MOV(headeraddr, GenRegister::toUniform(address,
> GEN_TYPE_UL));
> +        else {
> +          sel.SHR(headeraddr, GenRegister::toUniform(address,
> GEN_TYPE_UD), GenRegister::immud(4));
> +          sel.MOV(sel.getOffsetReg(header, 0, 5 * 4), GenRegister::immud(0));
> +        }
> +      }
> +      sel.pop();
> +
> +      /* For block write we need to pack the block data into the tmp, and for
> different
> +       * simdwidth and vector size with different type size, we may need to
> split the
> +       * block write send message.
> +       * We can only get a send message with 5 reg length
> +       * so for different combination we have different message length and
> tmp vector size
> +       *              |  simd8  | simd16 |  simd8 | simd16
> +       *  r0  |header |         |        |        |
> +       *  r1  |data   |  w0,w1  |   w0   |   dw0  |  dw0
> +       *  r2  |data   |  w2,w3  |   w1   |   dw1  |  dw0
> +       *  r3  |data   | ......  | ...... | ...... |  dw1
> +       *  r4  |data   | ....... | ...... | ...... |  dw1
> +       */
> +
> +      uint32_t totalSize = simdWidth * typeSize * vec_size;
> +      uint32_t valueSize = simdWidth * typeSize;
> +      uint32_t tmp_size = totalSize > 128 ? (128 / valueSize) : vec_size;
> +      uint32_t msg_num = vec_size / tmp_size;
> +      uint32_t ow_size = msg_num > 1 ? 8 : (totalSize / 16);
> +
>        for(uint32_t i = 0; i < tmp_size; i++)
> -
> tmpVec.push_back(GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY
> _DWORD)), GEN_TYPE_UD));
> -      sel.OBWRITE(address, &valuesVec[0], vec_size, header, SI, &tmpVec[0],
> tmp_size);
> +
> tmpVec.push_back(GenRegister::retype(GenRegister::f8grf(sel.reg(family)),
> genType));
> +      for (uint32_t i = 0; i < msg_num; i++) {
> +          for (uint32_t j = 0; j < tmp_size; j++)
> +            sel.MOV(tmpVec[j], valuesVec[j + i * tmp_size]);
> +          if (i > 0) {
> +            sel.push();
> +            {
> +              // Update the address in header
> +              sel.curr.execWidth = 1;
> +              sel.ADD(headeraddr, headeraddr, GenRegister::immud(8));
> +            }
> +            sel.pop();
> +          }
> +          sel.push();
> +            // In simd8 mode, when data reg has more than 1 reg, execWidth 8
> will get wrong
> +            // result, so set the execWidth to 16.
> +            sel.curr.execWidth = 16;
> +            sel.curr.predicate = GEN_PREDICATE_NONE;
> +            sel.curr.noMask = 1;
> +            sel.OBWRITE(header, &tmpVec[0], tmp_size, SI, ow_size);
> +          sel.pop();
> +      }
> +
> +
>      }
> 
>      virtual bool emit(Selection::Opaque  &sel, SelectionDAG &dag) const
> @@ -7662,20 +7758,77 @@ extern bool OCL_DEBUGINFO; // first defined by
> calling BVAR in program.cpp
>        uint32_t vec_size = insn.getVectorSize();
>        uint32_t simdWidth = sel.curr.execWidth;
>        const Type type = insn.getType();
> +      const uint32_t typeSize = type == TYPE_U32 ? 4 : 2;
> +      uint32_t response_size = simdWidth * vec_size * typeSize / 32;
> +      // ushort in simd8 will have half reg thus 0.5 reg size, but response length
> is still 1
> +      response_size = response_size ? response_size : 1;
> +      uint32_t block_width = typeSize * simdWidth;
> +      uint32_t blocksize = (block_width - 1) % 32 | (vec_size - 1) << 16;
> +
> +
>        vector<GenRegister> valuesVec;
>        vector<GenRegister> tmpVec;
>        for (uint32_t i = 0; i < vec_size; ++i) {
>          valuesVec.push_back(sel.selReg(insn.getDst(i), type));
> -        if(simdWidth == 16)
> -
> tmpVec.push_back(GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY
> _DWORD)), GEN_TYPE_UD));
> +        if(simdWidth == 16 && typeSize == 4)
> +          tmpVec.push_back(GenRegister::ud8grf(sel.reg(FAMILY_REG)));
>        }
> -      const GenRegister coordx = sel.selReg(insn.getSrc(0), TYPE_U32);
> -      const GenRegister coordy = sel.selReg(insn.getSrc(1), TYPE_U32);
> -      const GenRegister header =
> GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY_DWORD)),
> GEN_TYPE_UD);
> -      GenRegister *tmp = NULL;
> -      if(simdWidth == 16)
> -        tmp = &tmpVec[0];
> -      sel.MBREAD(&valuesVec[0], coordx, coordy, header, tmp,
> insn.getImageIndex(), insn.getVectorSize());
> +      const GenRegister coordx =
> GenRegister::toUniform(sel.selReg(insn.getSrc(0), TYPE_U32),
> GEN_TYPE_UD);
> +      const GenRegister coordy =
> GenRegister::toUniform(sel.selReg(insn.getSrc(1), TYPE_U32),
> GEN_TYPE_UD);
> +      const GenRegister header = GenRegister::ud8grf(sel.reg(FAMILY_REG));
> +      const GenRegister offsetx =
> GenRegister::toUniform(sel.getOffsetReg(header, 0, 0 * 4), GEN_TYPE_UD);
> +      const GenRegister offsety =
> GenRegister::toUniform(sel.getOffsetReg(header, 0, 1 * 4), GEN_TYPE_UD);
> +      const GenRegister blocksizereg = sel.getOffsetReg(header, 0, 2 * 4);
> +
> +      // Make header
> +      sel.push();
> +        // Copy r0 into the header first
> +        sel.curr.execWidth = 8;
> +        sel.curr.predicate = GEN_PREDICATE_NONE;
> +        sel.curr.noMask = 1;
> +        sel.MOV(header, GenRegister::ud8grf(0, 0));
> +
> +        // Update the header with the coord
> +        sel.curr.execWidth = 1;
> +        sel.MOV(offsetx, coordx);
> +        sel.MOV(offsety, coordy);
> +        // Update block width and height
> +        sel.MOV(blocksizereg, GenRegister::immud(blocksize));
> +      sel.pop();
> +
> +      if (simdWidth * typeSize < 64) {
> +        sel.push();
> +          sel.curr.execWidth = 8;
> +          sel.curr.predicate = GEN_PREDICATE_NONE;
> +          sel.curr.noMask = 1;
> +          // Now read the data
> +          sel.MBREAD(&valuesVec[0], vec_size, header, insn.getImageIndex(),
> response_size);
> +        sel.pop();
> +      } else if (simdWidth * typeSize == 64) {
> +        sel.push();
> +          sel.curr.execWidth = 8;
> +          sel.curr.predicate = GEN_PREDICATE_NONE;
> +          sel.curr.noMask = 1;
> +          sel.MBREAD(&tmpVec[0], vec_size ,header, insn.getImageIndex(),
> vec_size);
> +          for (uint32_t i = 0; i < vec_size; i++)
> +            sel.MOV(valuesVec[i], tmpVec[i]);
> +
> +          // Second half
> +          // Update the header with the coord
> +          sel.curr.execWidth = 1;
> +          sel.ADD(offsetx, offsetx, GenRegister::immud(32));
> +
> +          // Now read the data
> +          sel.curr.execWidth = 8;
> +          sel.MBREAD(&tmpVec[0], vec_size, header, insn.getImageIndex(),
> vec_size);
> +
> +          // Move the reg to fit vector rule.
> +          for (uint32_t i = 0; i < vec_size; i++)
> +            sel.MOV(sel.getOffsetReg(valuesVec[i], 0, 32) , tmpVec[i]);
> +        sel.pop();
> +      } else NOT_IMPLEMENTED;
> +
> +
>        return true;
>      }
>      DECL_CTOR(MediaBlockReadInstruction, 1, 1);
> @@ -7689,17 +7842,84 @@ extern bool OCL_DEBUGINFO; // first defined by
> calling BVAR in program.cpp
>        using namespace ir;
>        uint32_t vec_size = insn.getVectorSize();
>        const Type type = insn.getType();
> -      const GenRegister coordx = sel.selReg(insn.getSrc(0), TYPE_U32);
> -      const GenRegister coordy = sel.selReg(insn.getSrc(1), TYPE_U32);
> +      uint32_t simdWidth = sel.curr.execWidth;
> +      const uint32_t genType = type == TYPE_U32 ? GEN_TYPE_UD :
> GEN_TYPE_UW;
> +      const RegisterFamily family = getFamily(type);
> +      const uint32_t typeSize = type == TYPE_U32 ? 4 : 2;
> +      // ushort in simd8 will have half reg, but data length is still 1
> +      uint32_t data_size = simdWidth * vec_size * typeSize / 32;
> +      data_size = data_size? data_size : 1;
> +      uint32_t block_width = typeSize * simdWidth;
> +      uint32_t blocksize = (block_width - 1) % 32 | (vec_size - 1) << 16;
> +
> +
>        vector<GenRegister> valuesVec;
>        vector<GenRegister> tmpVec;
> -      for(uint32_t i = 0; i < vec_size; i++)
> -      {
> -        valuesVec.push_back(sel.selReg(insn.getSrc(2 + i), type));
> -
> tmpVec.push_back(GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY
> _DWORD)), GEN_TYPE_UD));
> -      }
> -      const GenRegister header =
> GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY_DWORD)),
> GEN_TYPE_UD);
> -      sel.MBWRITE(coordx, coordy, &valuesVec[0], header, &tmpVec[0],
> insn.getImageIndex(), vec_size);
> +      for (uint32_t i = 0; i < vec_size; ++i) {
> +         valuesVec.push_back(sel.selReg(insn.getSrc(2 + i), type));
> +        if(simdWidth == 16 && typeSize == 4)
> +          tmpVec.push_back(GenRegister::ud8grf(sel.reg(FAMILY_REG)));
> +        else
> +
> tmpVec.push_back(GenRegister::retype(GenRegister::f8grf(sel.reg(family)),
> genType));
> +       }
> +      const GenRegister coordx =
> GenRegister::toUniform(sel.selReg(insn.getSrc(0), TYPE_U32),
> GEN_TYPE_UD);
> +      const GenRegister coordy =
> GenRegister::toUniform(sel.selReg(insn.getSrc(1), TYPE_U32),
> GEN_TYPE_UD);
> +      const GenRegister header = GenRegister::ud8grf(sel.reg(FAMILY_REG));
> +      const GenRegister offsetx =
> GenRegister::toUniform(sel.getOffsetReg(header, 0, 0*4), GEN_TYPE_UD);
> +      const GenRegister offsety =
> GenRegister::toUniform(sel.getOffsetReg(header, 0, 1*4), GEN_TYPE_UD);
> +      const GenRegister blocksizereg = sel.getOffsetReg(header, 0, 2*4);
> +
> +      // Make header
> +      sel.push();
> +        // Copy r0 into the header first
> +        sel.curr.execWidth = 8;
> +        sel.curr.predicate = GEN_PREDICATE_NONE;
> +        sel.curr.noMask = 1;
> +        sel.MOV(header, GenRegister::ud8grf(0, 0));
> +
> +        // Update the header with the coord
> +        sel.curr.execWidth = 1;
> +        sel.MOV(offsetx, coordx);
> +        sel.MOV(offsety, coordy);
> +        // Update block width and height
> +        sel.MOV(blocksizereg, GenRegister::immud(blocksize));
> +      sel.pop();
> +
> +      if (simdWidth * typeSize < 64) {
> +        for (uint32_t i = 0; i < vec_size; ++i) {
> +            sel.MOV(tmpVec[i], valuesVec[i]);
> +        }
> +        sel.push();
> +          sel.curr.execWidth = 8;
> +          sel.curr.predicate = GEN_PREDICATE_NONE;
> +          sel.curr.noMask = 1;
> +          // Now write the data
> +          sel.MBWRITE(header, &tmpVec[0], vec_size, insn.getImageIndex(),
> data_size);
> +        sel.pop();
> +      } else if (simdWidth * typeSize == 64) {
> +        sel.push();
> +          sel.curr.execWidth = 8;
> +          sel.curr.predicate = GEN_PREDICATE_NONE;
> +          sel.curr.noMask = 1;
> +          for (uint32_t i = 0; i < vec_size; i++)
> +            sel.MOV(tmpVec[i], valuesVec[i]);
> +          sel.MBWRITE(header, &tmpVec[0], vec_size, insn.getImageIndex(),
> vec_size);
> +
> +          // Second half
> +          // Update the header with the coord
> +          sel.curr.execWidth = 1;
> +          sel.ADD(offsetx, offsetx, GenRegister::immud(32));
> +
> +          sel.curr.execWidth = 8;
> +          for (uint32_t i = 0; i < vec_size; i++)
> +            sel.MOV(tmpVec[i], sel.getOffsetReg(valuesVec[i], 0, 32));
> +          // Now write the data
> +          sel.MBWRITE(header, &tmpVec[0], vec_size, insn.getImageIndex(),
> vec_size);
> +
> +          // Move the reg to fit vector rule.
> +        sel.pop();
> +      } else NOT_IMPLEMENTED;
> +
>        return true;
>      }
>      DECL_CTOR(MediaBlockWriteInstruction, 1, 1);
> --
> 2.7.4
> 
> _______________________________________________
> Beignet mailing list
> Beignet at lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/beignet
> _______________________________________________
> Beignet mailing list
> Beignet at lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/beignet


More information about the Beignet mailing list