[Beignet] [PATCH V3 11/12] Backend: Add intel_sub_group_block_read/write form image

Yang, Rong R rong.r.yang at intel.com
Mon Jun 13 07:36:02 UTC 2016


The patchset LGTM, pushed, thanks.

> -----Original Message-----
> From: Beignet [mailto:beignet-bounces at lists.freedesktop.org] On Behalf Of
> Xiuli Pan
> Sent: Sunday, June 12, 2016 5:33
> To: beignet at lists.freedesktop.org
> Cc: Pan, Xiuli <xiuli.pan at intel.com>
> Subject: [Beignet] [PATCH V3 11/12] Backend: Add
> intel_sub_group_block_read/write form image
> 
> From: Pan Xiuli <xiuli.pan at intel.com>
> 
> Using media block read/write to read data in blocks. In simd16 mode there
> is a need for some register relocation for later use.
> GEN7 uses a different data port.
> V2: Refine block read simd16 with tmp reg to avoid MOVs
> V3: Fix build bug with clang.
> 
> Signed-off-by: Pan Xiuli <xiuli.pan at intel.com>
> ---
>  backend/src/backend/gen/gen_mesa_disasm.c          |  27 +++-
>  backend/src/backend/gen7_encoder.cpp               |  48 +++++++
>  backend/src/backend/gen7_encoder.hpp               |   4 +
>  backend/src/backend/gen7_instruction.hpp           |  16 +++
>  backend/src/backend/gen8_instruction.hpp           |  16 +++
>  backend/src/backend/gen_context.cpp                | 155
> +++++++++++++++++++++
>  backend/src/backend/gen_context.hpp                |   2 +
>  backend/src/backend/gen_defs.hpp                   |  16 +++
>  backend/src/backend/gen_encoder.cpp                |  47 +++++++
>  backend/src/backend/gen_encoder.hpp                |   4 +
>  .../src/backend/gen_insn_gen7_schedule_info.hxx    |   2 +
>  backend/src/backend/gen_insn_selection.cpp         | 115 ++++++++++++++-
>  backend/src/backend/gen_insn_selection.hpp         |   4 +
>  backend/src/backend/gen_insn_selection.hxx         |   2 +
>  backend/src/ir/instruction.cpp                     | 112 ++++++++++++++-
>  backend/src/ir/instruction.hpp                     |  22 +++
>  backend/src/ir/instruction.hxx                     |   2 +
>  backend/src/ir/liveness.cpp                        |   3 +-
>  backend/src/libocl/src/ocl_substore.ll             |  33 +++++
>  backend/src/libocl/tmpl/ocl_simd.tmpl.cl           |  21 +++
>  backend/src/libocl/tmpl/ocl_simd.tmpl.h            |  10 ++
>  backend/src/llvm/llvm_gen_backend.cpp              |  62 ++++++++-
>  backend/src/llvm/llvm_gen_ocl_function.hxx         |   8 ++
>  backend/src/llvm/llvm_scalarize.cpp                |  14 ++
>  24 files changed, 732 insertions(+), 13 deletions(-)
> 
> diff --git a/backend/src/backend/gen/gen_mesa_disasm.c
> b/backend/src/backend/gen/gen_mesa_disasm.c
> index 9200c26..9955dfc 100644
> --- a/backend/src/backend/gen/gen_mesa_disasm.c
> +++ b/backend/src/backend/gen/gen_mesa_disasm.c
> @@ -1476,6 +1476,15 @@ int gen_disasm (FILE *file, const void *inst,
> uint32_t deviceID, uint32_t compac
>                   SAMPLER_MSG_TYPE(inst),
>                   SAMPLER_SIMD_MODE(inst));
>            break;
> +        case GEN_SFID_DATAPORT_RENDER:
> +            if(UNTYPED_RW_MSG_TYPE(inst) == 4 ||
> UNTYPED_RW_MSG_TYPE(inst) == 10)
> +              format(file, " (bti: %d, %s, %s)",
> +                     UNTYPED_RW_BTI(inst),
> +
> data_port_data_cache_category[UNTYPED_RW_CATEGORY(inst)],
> +
> data_port1_data_cache_msg_type[UNTYPED_RW_MSG_TYPE(inst)]);
> +            else
> +              format(file, " not implemented");
> +            break;
>          case GEN_SFID_DATAPORT_DATA:
>            if(UNTYPED_RW_CATEGORY(inst) == 0) {
>              if(UNTYPED_RW_MSG_TYPE(inst) == 5 ||
> UNTYPED_RW_MSG_TYPE(inst) == 13)
> @@ -1510,12 +1519,18 @@ int gen_disasm (FILE *file, const void *inst,
> uint32_t deviceID, uint32_t compac
>            }
>            break;
>          case GEN_SFID_DATAPORT1_DATA:
> -          format(file, " (bti: %d, rgba: %d, %s, %s, %s)",
> -                 UNTYPED_RW_BTI(inst),
> -                 UNTYPED_RW_RGBA(inst),
> -
> data_port_data_cache_simd_mode[UNTYPED_RW_SIMD_MODE(inst)],
> -                 data_port_data_cache_category[UNTYPED_RW_CATEGORY(inst)],
> -
> data_port1_data_cache_msg_type[UNTYPED_RW_MSG_TYPE(inst)]);
> +            if(UNTYPED_RW_MSG_TYPE(inst) == 4 ||
> UNTYPED_RW_MSG_TYPE(inst) == 10)
> +              format(file, " (bti: %d, %s, %s)",
> +                     UNTYPED_RW_BTI(inst),
> +
> data_port_data_cache_category[UNTYPED_RW_CATEGORY(inst)],
> +
> data_port1_data_cache_msg_type[UNTYPED_RW_MSG_TYPE(inst)]);
> +            else
> +              format(file, " (bti: %d, rgba: %d, %s, %s, %s)",
> +                     UNTYPED_RW_BTI(inst),
> +                     UNTYPED_RW_RGBA(inst),
> +
> data_port_data_cache_simd_mode[UNTYPED_RW_SIMD_MODE(inst)],
> +
> data_port_data_cache_category[UNTYPED_RW_CATEGORY(inst)],
> +
> data_port1_data_cache_msg_type[UNTYPED_RW_MSG_TYPE(inst)]);
>            break;
>          case GEN_SFID_DATAPORT_CONSTANT:
>            format(file, " (bti: %d, %s)",
> diff --git a/backend/src/backend/gen7_encoder.cpp
> b/backend/src/backend/gen7_encoder.cpp
> index fc358be..abb8b77 100644
> --- a/backend/src/backend/gen7_encoder.cpp
> +++ b/backend/src/backend/gen7_encoder.cpp
> @@ -239,5 +239,53 @@ namespace gbe
>       }
>    }
> 
> +  static void setMBlockRWGEN7(GenEncoder *p,
> +                          GenNativeInstruction *insn,
> +                          uint32_t bti,
> +                          uint32_t msg_type,
> +                          uint32_t msg_length,
> +                          uint32_t response_length)
> +  {
> +    const GenMessageTarget sfid = GEN_SFID_DATAPORT_RENDER;
> +    p->setMessageDescriptor(insn, sfid, msg_length, response_length);
> +    insn->bits3.gen7_mblock_rw.msg_type = msg_type;
> +    insn->bits3.gen7_mblock_rw.bti = bti;
> +    insn->bits3.gen7_mblock_rw.header_present = 1;
> +  }
> +
> +
> +  void Gen7Encoder::MBREAD(GenRegister dst, GenRegister header,
> uint32_t bti, uint32_t size) {
> +    GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
> +    const uint32_t msg_length = 1;
> +    const uint32_t response_length = size; // Size of registers
> +    this->setHeader(insn);
> +    this->setDst(insn, GenRegister::ud8grf(dst.nr, 0));
> +    this->setSrc0(insn, GenRegister::ud8grf(header.nr, 0));
> +    this->setSrc1(insn, GenRegister::immud(0));
> +    setMBlockRWGEN7(this,
> +                insn,
> +                bti,
> +                GEN75_P1_MEDIA_BREAD,
> +                msg_length,
> +                response_length);
> +  }
> +
> +  void Gen7Encoder::MBWRITE(GenRegister header, uint32_t bti, uint32_t
> size) {
> +    GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
> +    const uint32_t msg_length = 1 + size;
> +    const uint32_t response_length = 0; // Size of registers
> +    this->setHeader(insn);
> +    this->setDst(insn, GenRegister::retype(GenRegister::null(),
> GEN_TYPE_UW));
> +    this->setSrc0(insn, GenRegister::ud8grf(header.nr, 0));
> +    this->setSrc1(insn, GenRegister::immud(0));
> +    setMBlockRWGEN7(this,
> +                insn,
> +                bti,
> +                GEN75_P1_MEDIA_TYPED_BWRITE,
> +                msg_length,
> +                response_length);
> +  }
> +
> +
>  #undef NO_SWIZZLE
>  }
> diff --git a/backend/src/backend/gen7_encoder.hpp
> b/backend/src/backend/gen7_encoder.hpp
> index 1276c67..edb711d 100644
> --- a/backend/src/backend/gen7_encoder.hpp
> +++ b/backend/src/backend/gen7_encoder.hpp
> @@ -42,6 +42,10 @@ namespace gbe
>      virtual void setSrc1(GenNativeInstruction *insn, GenRegister reg);
>      virtual void alu3(uint32_t opcode, GenRegister dst,
>                         GenRegister src0, GenRegister src1, GenRegister src2);
> +    /*! MBlock read */
> +    virtual void MBREAD(GenRegister dst, GenRegister header, uint32_t bti,
> uint32_t elemSize);
> +    /*! MBlock write */
> +    virtual void MBWRITE(GenRegister header, uint32_t bti, uint32_t
> elemSize);
>    };
>  }
>  #endif /* __GBE_GEN7_ENCODER_HPP__ */
> diff --git a/backend/src/backend/gen7_instruction.hpp
> b/backend/src/backend/gen7_instruction.hpp
> index 258dd24..7d7eada 100644
> --- a/backend/src/backend/gen7_instruction.hpp
> +++ b/backend/src/backend/gen7_instruction.hpp
> @@ -531,6 +531,22 @@ union Gen7NativeInstruction
>          uint32_t uip:16;
>        } gen7_branch;
> 
> +      /*! Data port Media block read / write */
> +      struct {
> +        uint32_t bti:8;
> +        uint32_t ver_line_stride_offset:1;
> +        uint32_t ver_line_stride:1;
> +        uint32_t ver_line_stride_override:1;
> +        uint32_t ignored:3;
> +        uint32_t msg_type:4;
> +        uint32_t category:1;
> +        uint32_t header_present:1;
> +        uint32_t response_length:5;
> +        uint32_t msg_length:4;
> +        uint32_t pad2:2;
> +        uint32_t end_of_thread:1;
> +      } gen7_mblock_rw;
> +
>        int d;
>        uint32_t ud;
>        float f;
> diff --git a/backend/src/backend/gen8_instruction.hpp
> b/backend/src/backend/gen8_instruction.hpp
> index ada9ffc..549948a 100644
> --- a/backend/src/backend/gen8_instruction.hpp
> +++ b/backend/src/backend/gen8_instruction.hpp
> @@ -608,6 +608,22 @@ union Gen8NativeInstruction
>          uint32_t jip:32;
>        } gen8_branch;
> 
> +      /*! Data port Media block read / write */
> +      struct {
> +        uint32_t bti:8;
> +        uint32_t ver_line_stride_offset:1;
> +        uint32_t ver_line_stride:1;
> +        uint32_t ver_line_stride_override:1;
> +        uint32_t ignored:3;
> +        uint32_t msg_type:4;
> +        uint32_t category:1;
> +        uint32_t header_present:1;
> +        uint32_t response_length:5;
> +        uint32_t msg_length:4;
> +        uint32_t pad2:2;
> +        uint32_t end_of_thread:1;
> +      } gen7_mblock_rw;
> +
>        int d;
>        uint32_t ud;
>        float f;
> diff --git a/backend/src/backend/gen_context.cpp
> b/backend/src/backend/gen_context.cpp
> index 90b8b45..98a94ba 100644
> --- a/backend/src/backend/gen_context.cpp
> +++ b/backend/src/backend/gen_context.cpp
> @@ -3538,6 +3538,161 @@ namespace gbe
>      p->OBWRITE(header, insn.getbti(), insn.extra.elem);
>    }
> 
> +  void GenContext::emitMBReadInstruction(const SelectionInstruction
> &insn) {
> +    const GenRegister dst = ra->genReg(insn.dst(0));
> +    const GenRegister coordx = GenRegister::toUniform(ra-
> >genReg(insn.src(0)),GEN_TYPE_D);
> +    const GenRegister coordy = GenRegister::toUniform(ra-
> >genReg(insn.src(1)),GEN_TYPE_D);
> +    GenRegister header, offsetx, offsety, blocksizereg;
> +    if (simdWidth == 8)
> +      header = GenRegister::retype(ra->genReg(insn.dst(0)), GEN_TYPE_UD);
> +    else
> +      header = GenRegister::retype(GenRegister::Qn(ra-
> >genReg(insn.src(2)),1), GEN_TYPE_UD);
> +
> +    offsetx = GenRegister::offset(header, 0, 0*4);
> +    offsety = GenRegister::offset(header, 0, 1*4);
> +    blocksizereg = GenRegister::offset(header, 0, 2*4);
> +    size_t vec_size = insn.extra.elem;
> +    uint32_t blocksize = 0x1F | (vec_size-1) << 16;
> +
> +    if (simdWidth == 8)
> +    {
> +      p->push();
> +        // Copy r0 into the header first
> +        p->curr.execWidth = 8;
> +        p->curr.predicate = GEN_PREDICATE_NONE;
> +        p->curr.noMask = 1;
> +        p->MOV(header, GenRegister::ud8grf(0,0));
> +
> +        // Update the header with the coord
> +        p->curr.execWidth = 1;
> +        p->MOV(offsetx, coordx);
> +        p->MOV(offsety, coordy);
> +        // Update block width and height
> +        p->MOV(blocksizereg, GenRegister::immud(blocksize));
> +        // Now read the data
> +        p->curr.execWidth = 8;
> +        p->MBREAD(dst, header, insn.getbti(), vec_size);
> +      p->pop();
> +
> +    }
> +    else
> +    {
> +      const GenRegister tmp = ra->genReg(insn.dst(vec_size));
> +      p->push();
> +        // Copy r0 into the header first
> +        p->curr.execWidth = 8;
> +        p->curr.predicate = GEN_PREDICATE_NONE;
> +        p->curr.noMask = 1;
> +        p->MOV(header, GenRegister::ud8grf(0,0));
> +
> +        // First half
> +        // Update the header with the coord
> +        p->curr.execWidth = 1;
> +        p->MOV(offsetx, coordx);
> +        p->MOV(offsety, coordy);
> +        // Update block width and height
> +        p->MOV(blocksizereg, GenRegister::immud(blocksize));
> +        // Now read the data
> +        p->curr.execWidth = 8;
> +        p->MBREAD(tmp, header, insn.getbti(), vec_size);
> +
> +        // Second half
> +        // Update the header with the coord
> +        p->curr.execWidth = 1;
> +        p->ADD(offsetx, offsetx, GenRegister::immud(32));
> +
> +        const GenRegister tmp2 = GenRegister::offset(tmp, vec_size);
> +        // Now read the data
> +        p->curr.execWidth = 8;
> +        p->MBREAD(tmp2, header, insn.getbti(), vec_size);
> +
> +        // Move the reg to fit vector rule.
> +        for (int i = 0; i < vec_size; i++) {
> +          p->MOV(GenRegister::offset(dst, i * 2), GenRegister::offset(tmp, i));
> +          p->MOV(GenRegister::offset(dst, i * 2 + 1),
> +                 GenRegister::offset(tmp2, i));
> +        }
> +      p->pop();
> +    }
> +  }
> +
> +  void GenContext::emitMBWriteInstruction(const SelectionInstruction
> &insn) {
> +    const GenRegister coordx = GenRegister::toUniform(ra-
> >genReg(insn.src(0)), GEN_TYPE_D);
> +    const GenRegister coordy = GenRegister::toUniform(ra-
> >genReg(insn.src(1)), GEN_TYPE_D);
> +    GenRegister header, offsetx, offsety, blocksizereg;
> +    size_t vec_size = insn.extra.elem;
> +    uint32_t blocksize = 0x1F | (vec_size-1) << 16;
> +    if (simdWidth == 8)
> +      header = GenRegister::retype(ra->genReg(insn.dst(0)), GEN_TYPE_UD);
> +    else
> +      header = GenRegister::retype(GenRegister::Qn(ra-
> >genReg(insn.dst(0)),1), GEN_TYPE_UD);
> +
> +    offsetx = GenRegister::offset(header, 0, 0*4);
> +    offsety = GenRegister::offset(header, 0, 1*4);
> +    blocksizereg = GenRegister::offset(header, 0, 2*4);
> +
> +    if (simdWidth == 8)
> +    {
> +      p->push();
> +        // Copy r0 into the header first
> +        p->curr.execWidth = 8;
> +        p->curr.predicate = GEN_PREDICATE_NONE;
> +        p->curr.noMask = 1;
> +        p->MOV(header, GenRegister::ud8grf(0,0));
> +
> +        // Update the header with the coord
> +        p->curr.execWidth = 1;
> +        p->MOV(offsetx, coordx);
> +        p->MOV(offsety, coordy);
> +        // Update block width and height
> +        p->MOV(blocksizereg, GenRegister::immud(blocksize));
> +        p->curr.execWidth = 8;
> +        // Mov what we need into msgs
> +        for(uint32_t i = 0; i < vec_size; i++)
> +          p->MOV(ra->genReg(insn.dst(1 + i)), ra->genReg(insn.src(2 + i)));
> +        // Now read the data
> +        p->MBWRITE(header, insn.getbti(), vec_size);
> +      p->pop();
> +
> +    }
> +    else
> +    {
> +      p->push();
> +        // Copy r0 into the header first
> +        p->curr.execWidth = 8;
> +        p->curr.predicate = GEN_PREDICATE_NONE;
> +        p->curr.noMask = 1;
> +        p->MOV(header, GenRegister::ud8grf(0,0));
> +
> +        // First half
> +        // Update the header with the coord
> +        p->curr.execWidth = 1;
> +        p->MOV(offsetx, coordx);
> +        p->MOV(offsety, coordy);
> +        // Update block width and height
> +        p->MOV(blocksizereg, GenRegister::immud(blocksize));
> +        // Now read the data
> +        p->curr.execWidth = 8;
> +        // Mov what we need into msgs
> +        for(uint32_t i = 0; i < vec_size; i++)
> +          p->MOV(GenRegister::offset(header, 1 + i), ra->genReg(insn.src(2 +
> i)));
> +        p->MBWRITE(header, insn.getbti(), vec_size);
> +
> +        // Second half
> +        // Update the header with the coord
> +        p->curr.execWidth = 1;
> +        p->ADD(offsetx, offsetx, GenRegister::immud(32));
> +
> +        p->curr.execWidth = 8;
> +        // Mov what we need into msgs
> +        for(uint32_t i = 0; i < vec_size; i++)
> +          p->MOV(GenRegister::offset(header, 1 + i), GenRegister::Qn(ra-
> >genReg(insn.src(2 + i)), 1));
> +        // Now write the data
> +        p->MBWRITE(header, insn.getbti(), vec_size);
> +
> +      p->pop();
> +    }
> +  }
> 
>    BVAR(OCL_OUTPUT_REG_ALLOC, false);
>    BVAR(OCL_OUTPUT_ASM, false);
> diff --git a/backend/src/backend/gen_context.hpp
> b/backend/src/backend/gen_context.hpp
> index a634338..fb3d4fe 100644
> --- a/backend/src/backend/gen_context.hpp
> +++ b/backend/src/backend/gen_context.hpp
> @@ -189,6 +189,8 @@ namespace gbe
>      void afterMessage(const SelectionInstruction &insn, GenRegister bti,
> GenRegister flagTemp, GenRegister btiTmp, unsigned jip0);
>      void emitOBReadInstruction(const SelectionInstruction &insn);
>      void emitOBWriteInstruction(const SelectionInstruction &insn);
> +    void emitMBReadInstruction(const SelectionInstruction &insn);
> +    void emitMBWriteInstruction(const SelectionInstruction &insn);
> 
>      /*! Implements base class */
>      virtual Kernel *allocateKernel(void);
> diff --git a/backend/src/backend/gen_defs.hpp
> b/backend/src/backend/gen_defs.hpp
> index 09cb2ba..66ae5b5 100644
> --- a/backend/src/backend/gen_defs.hpp
> +++ b/backend/src/backend/gen_defs.hpp
> @@ -784,6 +784,22 @@ union GenNativeInstruction
>          uint32_t jip:32;
>        } gen8_branch;
> 
> +      /*! Data port Media block read / write */
> +      struct {
> +        uint32_t bti:8;
> +        uint32_t ver_line_stride_offset:1;
> +        uint32_t ver_line_stride:1;
> +        uint32_t ver_line_stride_override:1;
> +        uint32_t ignored:3;
> +        uint32_t msg_type:4;
> +        uint32_t category:1;
> +        uint32_t header_present:1;
> +        uint32_t response_length:5;
> +        uint32_t msg_length:4;
> +        uint32_t pad2:2;
> +        uint32_t end_of_thread:1;
> +      } gen7_mblock_rw;
> +
>        int d;
>        uint32_t ud;
>        float f;
> diff --git a/backend/src/backend/gen_encoder.cpp
> b/backend/src/backend/gen_encoder.cpp
> index e745b9c..eb9fbeb 100644
> --- a/backend/src/backend/gen_encoder.cpp
> +++ b/backend/src/backend/gen_encoder.cpp
> @@ -276,6 +276,21 @@ namespace gbe
>      insn->bits3.gen7_oblock_rw.header_present = 1;
>    }
> 
> +  static void setMBlockRW(GenEncoder *p,
> +                          GenNativeInstruction *insn,
> +                          uint32_t bti,
> +                          uint32_t msg_type,
> +                          uint32_t msg_length,
> +                          uint32_t response_length)
> +  {
> +    const GenMessageTarget sfid = GEN_SFID_DATAPORT1_DATA;
> +    p->setMessageDescriptor(insn, sfid, msg_length, response_length);
> +    insn->bits3.gen7_mblock_rw.msg_type = msg_type;
> +    insn->bits3.gen7_mblock_rw.bti = bti;
> +    insn->bits3.gen7_mblock_rw.header_present = 1;
> +  }
> +
> +
>    static void setDWordScatterMessgae(GenEncoder *p,
>                                       GenNativeInstruction *insn,
>                                       uint32_t bti,
> @@ -1277,6 +1292,38 @@ namespace gbe
>                  response_length);
>    }
> 
> +  void GenEncoder::MBREAD(GenRegister dst, GenRegister header,
> uint32_t bti, uint32_t size) {
> +    GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
> +    const uint32_t msg_length = 1;
> +    const uint32_t response_length = size; // Size of registers
> +    this->setHeader(insn);
> +    this->setDst(insn, GenRegister::ud8grf(dst.nr, 0));
> +    this->setSrc0(insn, GenRegister::ud8grf(header.nr, 0));
> +    this->setSrc1(insn, GenRegister::immud(0));
> +    setMBlockRW(this,
> +                insn,
> +                bti,
> +                GEN75_P1_MEDIA_BREAD,
> +                msg_length,
> +                response_length);
> +  }
> +
> +  void GenEncoder::MBWRITE(GenRegister header, uint32_t bti, uint32_t
> size) {
> +    GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
> +    const uint32_t msg_length = 1 + size;
> +    const uint32_t response_length = 0; // Size of registers
> +    this->setHeader(insn);
> +    this->setDst(insn, GenRegister::retype(GenRegister::null(),
> GEN_TYPE_UW));
> +    this->setSrc0(insn, GenRegister::ud8grf(header.nr, 0));
> +    this->setSrc1(insn, GenRegister::immud(0));
> +    setMBlockRW(this,
> +                insn,
> +                bti,
> +                GEN75_P1_MEDIA_TYPED_BWRITE,
> +                msg_length,
> +                response_length);
> +  }
> +
>    void GenEncoder::EOT(uint32_t msg) {
>      GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
>      this->setDst(insn, GenRegister::retype(GenRegister::null(),
> GEN_TYPE_UD));
> diff --git a/backend/src/backend/gen_encoder.hpp
> b/backend/src/backend/gen_encoder.hpp
> index a53c879..4979305 100644
> --- a/backend/src/backend/gen_encoder.hpp
> +++ b/backend/src/backend/gen_encoder.hpp
> @@ -271,6 +271,10 @@ namespace gbe
>      void OBREAD(GenRegister dst, GenRegister header, uint32_t bti, uint32_t
> elemSize);
>      /*! OBlock write */
>      void OBWRITE(GenRegister header, uint32_t bti, uint32_t elemSize);
> +    /*! MBlock read */
> +    virtual void MBREAD(GenRegister dst, GenRegister header, uint32_t bti,
> uint32_t elemSize);
> +    /*! MBlock write */
> +    virtual void MBWRITE(GenRegister header, uint32_t bti, uint32_t
> elemSize);
> 
>      GBE_CLASS(GenEncoder); //!< Use custom allocators
>      virtual void alu3(uint32_t opcode, GenRegister dst,
> diff --git a/backend/src/backend/gen_insn_gen7_schedule_info.hxx
> b/backend/src/backend/gen_insn_gen7_schedule_info.hxx
> index d297726..c396626 100644
> --- a/backend/src/backend/gen_insn_gen7_schedule_info.hxx
> +++ b/backend/src/backend/gen_insn_gen7_schedule_info.hxx
> @@ -52,3 +52,5 @@ DECL_GEN7_SCHEDULE(SubGroupOp,      80,        1,        1)
>  DECL_GEN7_SCHEDULE(Printf,          80,        1,        1)
>  DECL_GEN7_SCHEDULE(OBRead,          80,        1,        1)
>  DECL_GEN7_SCHEDULE(OBWrite,         80,        1,        1)
> +DECL_GEN7_SCHEDULE(MBRead,          80,        1,        1)
> +DECL_GEN7_SCHEDULE(MBWrite,         80,        1,        1)
> diff --git a/backend/src/backend/gen_insn_selection.cpp
> b/backend/src/backend/gen_insn_selection.cpp
> index e974e97..d3c5a40c 100644
> --- a/backend/src/backend/gen_insn_selection.cpp
> +++ b/backend/src/backend/gen_insn_selection.cpp
> @@ -189,7 +189,8 @@ namespace gbe
>             this->opcode == SEL_OP_SAMPLE ||
>             this->opcode == SEL_OP_VME ||
>             this->opcode == SEL_OP_DWORD_GATHER ||
> -           this->opcode == SEL_OP_OBREAD;
> +           this->opcode == SEL_OP_OBREAD ||
> +           this->opcode == SEL_OP_MBREAD;
>    }
> 
>    bool SelectionInstruction::modAcc(void) const {
> @@ -212,7 +213,8 @@ namespace gbe
>             this->opcode == SEL_OP_ATOMIC        ||
>             this->opcode == SEL_OP_BYTE_SCATTER  ||
>             this->opcode == SEL_OP_TYPED_WRITE ||
> -           this->opcode == SEL_OP_OBWRITE;
> +           this->opcode == SEL_OP_OBWRITE ||
> +           this->opcode == SEL_OP_MBWRITE;
>    }
> 
>    bool SelectionInstruction::isBranch(void) const {
> @@ -703,6 +705,10 @@ namespace gbe
>      void OBREAD(GenRegister dst, GenRegister addr, GenRegister header,
> uint32_t bti, uint32_t size);
>      /*! Oblock write */
>      void OBWRITE(GenRegister addr, GenRegister value, GenRegister header,
> uint32_t bti, uint32_t size);
> +    /*! Media block read */
> +    void MBREAD(GenRegister* dsts, GenRegister coordx, GenRegister
> coordy, GenRegister header, GenRegister* tmp, uint32_t bti, uint32_t
> vec_size);
> +    /*! Media block write */
> +    void MBWRITE(GenRegister coordx, GenRegister coordy, GenRegister*
> values, GenRegister header, GenRegister* tmp, uint32_t bti, uint32_t
> vec_size);
> 
>      /* common functions for both binary instruction and sel_cmp and
> compare instruction.
>         It will handle the IMM or normal register assignment, and will try to avoid
> LOADI
> @@ -2055,6 +2061,63 @@ namespace gbe
>      vector->isSrc = 1;
>    }
> 
> +  void Selection::Opaque::MBREAD(GenRegister* dsts,
> +                                 GenRegister coordx,
> +                                 GenRegister coordy,
> +                                 GenRegister header,
> +                                 GenRegister* tmp,
> +                                 uint32_t bti,
> +                                 uint32_t vec_size) {
> +    SelectionInstruction *insn = this->appendInsn(SEL_OP_MBREAD,
> vec_size * 2, 3);
> +    SelectionVector *vector = this->appendVector();
> +    SelectionVector *vectortmp = this->appendVector();
> +    for (uint32_t i = 0; i < vec_size; ++i) {
> +      insn->dst(i) = dsts[i];
> +      insn->dst(i + vec_size) = tmp[i];
> +    }
> +    insn->src(0) = coordx;
> +    insn->src(1) = coordy;
> +    insn->src(2) = header;
> +    insn->setbti(bti);
> +    insn->extra.elem = vec_size; // vector size
> +
> +    vector->regNum = vec_size;
> +    vector->reg = &insn->dst(0);
> +    vector->offsetID = 0;
> +    vector->isSrc = 0;
> +    vectortmp->regNum = vec_size;
> +    vectortmp->reg = &insn->dst(vec_size);
> +    vectortmp->offsetID = 0;
> +    vectortmp->isSrc = 0;
> +
> +  }
> +
> +  void Selection::Opaque::MBWRITE(GenRegister coordx,
> +                                  GenRegister coordy,
> +                                  GenRegister* values,
> +                                  GenRegister header,
> +                                  GenRegister* tmp,
> +                                  uint32_t bti,
> +                                  uint32_t vec_size) {
> +    SelectionInstruction *insn = this->appendInsn(SEL_OP_MBWRITE, 1 +
> vec_size, 2 + vec_size);
> +    SelectionVector *vector = this->appendVector();
> +    insn->src(0) = coordx;
> +    insn->src(1) = coordy;
> +    for (uint32_t i = 0; i < vec_size; ++i)
> +      insn->src(2 + i) = values[i];
> +    insn->dst(0) = header;
> +    for (uint32_t i = 0; i < vec_size; ++i)
> +      insn->dst(1 + i) = tmp[i];
> +    insn->state = this->curr;
> +    insn->setbti(bti);
> +    insn->extra.elem = vec_size; // vector size
> +
> +    // We need to put the header and the data together
> +    vector->regNum = 1 + vec_size;
> +    vector->reg = &insn->dst(0);
> +    vector->offsetID = 0;
> +    vector->isSrc = 0;
> +  }
> 
>    // Boiler plate to initialize the selection library at c++ pre-main
>    static SelectionLibrary *selLib = NULL;
> @@ -6583,6 +6646,52 @@ extern bool OCL_DEBUGINFO; // first defined by
> calling BVAR in program.cpp
>      }
>    };
> 
> +  /*! Media Block Read pattern */
> +  DECL_PATTERN(MediaBlockReadInstruction)
> +  {
> +    bool emitOne(Selection::Opaque &sel, const
> ir::MediaBlockReadInstruction &insn, bool &markChildren) const
> +    {
> +      using namespace ir;
> +      uint32_t vec_size = insn.getVectorSize();
> +      vector<GenRegister> valuesVec;
> +      vector<GenRegister> tmpVec;
> +      for (uint32_t i = 0; i < vec_size; ++i) {
> +        valuesVec.push_back(sel.selReg(insn.getSrc(i), TYPE_U32));
> +        tmpVec.push_back(sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32));
> +      }
> +      const GenRegister coordx = sel.selReg(insn.getSrc(0), TYPE_U32);
> +      const GenRegister coordy = sel.selReg(insn.getSrc(1), TYPE_U32);
> +      const GenRegister header = sel.selReg(sel.reg(FAMILY_DWORD),
> TYPE_U32);
> +      sel.MBREAD(values, coordx, coordy, header, tmp, insn.getImageIndex(),
> insn.getVectorSize());
> +      return true;
> +    }
> +    DECL_CTOR(MediaBlockReadInstruction, 1, 1);
> +  };
> +
> +  /*! Media Block Write pattern */
> +  DECL_PATTERN(MediaBlockWriteInstruction)
> +  {
> +    bool emitOne(Selection::Opaque &sel, const
> ir::MediaBlockWriteInstruction &insn, bool &markChildren) const
> +    {
> +      using namespace ir;
> +      uint32_t vec_size = insn.getVectorSize();
> +      const GenRegister coordx = sel.selReg(insn.getSrc(0), TYPE_U32);
> +      const GenRegister coordy = sel.selReg(insn.getSrc(1), TYPE_U32);
> +      vector<GenRegister> valuesVec;
> +      vector<GenRegister> tmpVec;
> +      for(uint32_t i = 0; i < vec_size; i++)
> +      {
> +        valuesVec.push_back(sel.selReg(insn.getSrc(2 + i), TYPE_U32));
> +        tmpVec.push_back(sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32));
> +      }
> +      const GenRegister header = sel.selReg(sel.reg(FAMILY_DWORD),
> TYPE_U32);
> +      sel.MBWRITE(coordx, coordy, &valuesVec[0], header, &tmpVec[0],
> insn.getImageIndex(), vec_size);
> +      return true;
> +    }
> +    DECL_CTOR(MediaBlockWriteInstruction, 1, 1);
> +  };
> +
> +
>    /*! Sort patterns */
>    INLINE bool cmp(const SelectionPattern *p0, const SelectionPattern *p1) {
>      if (p0->insnNum != p1->insnNum)
> @@ -6624,6 +6733,8 @@ extern bool OCL_DEBUGINFO; // first defined by
> calling BVAR in program.cpp
>      this->insert<NullaryInstructionPattern>();
>      this->insert<WaitInstructionPattern>();
>      this->insert<PrintfInstructionPattern>();
> +    this->insert<MediaBlockReadInstructionPattern>();
> +    this->insert<MediaBlockWriteInstructionPattern>();
> 
>      // Sort all the patterns with the number of instructions they output
>      for (uint32_t op = 0; op < ir::OP_INVALID; ++op)
> diff --git a/backend/src/backend/gen_insn_selection.hpp
> b/backend/src/backend/gen_insn_selection.hpp
> index 51af686..b481de8 100644
> --- a/backend/src/backend/gen_insn_selection.hpp
> +++ b/backend/src/backend/gen_insn_selection.hpp
> @@ -177,6 +177,8 @@ namespace gbe
>        switch (opcode) {
>          case SEL_OP_OBREAD:
>          case SEL_OP_OBWRITE:
> +        case SEL_OP_MBREAD:
> +        case SEL_OP_MBWRITE:
>          case SEL_OP_DWORD_GATHER: return extra.function;
>          case SEL_OP_SAMPLE: return extra.rdbti;
>          case SEL_OP_VME: return extra.vme_bti;
> @@ -192,6 +194,8 @@ namespace gbe
>        switch (opcode) {
>          case SEL_OP_OBREAD:
>          case SEL_OP_OBWRITE:
> +        case SEL_OP_MBREAD:
> +        case SEL_OP_MBWRITE:
>          case SEL_OP_DWORD_GATHER: extra.function = bti; return;
>          case SEL_OP_SAMPLE: extra.rdbti = bti; return;
>          case SEL_OP_VME: extra.vme_bti = bti; return;
> diff --git a/backend/src/backend/gen_insn_selection.hxx
> b/backend/src/backend/gen_insn_selection.hxx
> index 4a7caff..ccaf526 100644
> --- a/backend/src/backend/gen_insn_selection.hxx
> +++ b/backend/src/backend/gen_insn_selection.hxx
> @@ -98,3 +98,5 @@ DECL_SELECTION_IR(SUBGROUP_OP,
> SubGroupOpInstruction)
>  DECL_SELECTION_IR(PRINTF, PrintfInstruction)
>  DECL_SELECTION_IR(OBREAD, OBReadInstruction)
>  DECL_SELECTION_IR(OBWRITE, OBWriteInstruction)
> +DECL_SELECTION_IR(MBREAD, MBReadInstruction)
> +DECL_SELECTION_IR(MBWRITE, MBWriteInstruction)
> diff --git a/backend/src/ir/instruction.cpp b/backend/src/ir/instruction.cpp
> index 88491a7..ed64580 100644
> --- a/backend/src/ir/instruction.cpp
> +++ b/backend/src/ir/instruction.cpp
> @@ -1064,6 +1064,78 @@ namespace ir {
>          Register dst[1];
>      };
> 
> +    class ALIGNED_INSTRUCTION MediaBlockReadInstruction :
> +      public BasePolicy,
> +      public TupleSrcPolicy<MediaBlockReadInstruction>,
> +      public TupleDstPolicy<MediaBlockReadInstruction>
> +    {
> +    public:
> +      INLINE MediaBlockReadInstruction(uint8_t imageIdx, Tuple dst, uint8_t
> vec_size, Tuple srcTuple, uint8_t srcNum) {
> +        this->opcode = OP_MBREAD;
> +        this->dst = dst;
> +        this->dstNum = vec_size;
> +        this->src = srcTuple;
> +        this->srcNum = srcNum;
> +        this->imageIdx = imageIdx;
> +      }
> +      INLINE bool wellFormed(const Function &fn, std::string &why) const;
> +      INLINE void out(std::ostream &out, const Function &fn) const {
> +        this->outOpcode(out);
> +        out << (int)this->getVectorSize();
> +        out << " {";
> +        for (uint32_t i = 0; i < dstNum; ++i)
> +          out << "%" << this->getDst(fn, i) << (i != (dstNum-1u) ? " " : "");
> +        out << "}";
> +        out << " 2D surface id " << (int)this->getImageIndex()
> +            << " byte coord x %" << this->getSrc(fn, 0)
> +            << " row coord y %" << this->getSrc(fn, 1);
> +      }
> +      INLINE uint8_t getImageIndex(void) const { return this->imageIdx; }
> +      INLINE uint8_t getVectorSize(void) const { return this->dstNum; }
> +
> +      Tuple src;
> +      Tuple dst;
> +      uint8_t imageIdx;
> +      uint8_t srcNum;
> +      uint8_t dstNum;
> +    };
> +
> +    class ALIGNED_INSTRUCTION MediaBlockWriteInstruction :
> +      public BasePolicy,
> +      public TupleSrcPolicy<MediaBlockWriteInstruction>,
> +      public NDstPolicy<MediaBlockWriteInstruction, 0>
> +    {
> +    public:
> +
> +      INLINE MediaBlockWriteInstruction(uint8_t imageIdx, Tuple srcTuple,
> uint8_t srcNum, uint8_t vec_size) {
> +        this->opcode = OP_MBWRITE;
> +        this->src = srcTuple;
> +        this->srcNum = srcNum;
> +        this->imageIdx = imageIdx;
> +        this->vec_size = vec_size;
> +      }
> +      INLINE bool wellFormed(const Function &fn, std::string &why) const;
> +      INLINE void out(std::ostream &out, const Function &fn) const {
> +        this->outOpcode(out);
> +        out << (int)this->getVectorSize()
> +            << " 2D surface id " << (int)this->getImageIndex()
> +            << " byte coord x %" << this->getSrc(fn, 0)
> +            << " row coord y %" << this->getSrc(fn, 1);
> +        out << " {";
> +        for (uint32_t i = 0; i < vec_size; ++i)
> +          out << "%" << this->getSrc(fn, i + 2) << (i != (vec_size-1u) ? " " : "");
> +        out << "}";
> +      }
> +      INLINE uint8_t getImageIndex(void) const { return this->imageIdx; }
> +      INLINE uint8_t getVectorSize(void) const { return this->vec_size; }
> +
> +      Tuple src;
> +      Register dst[0];
> +      uint8_t imageIdx;
> +      uint8_t srcNum;
> +      uint8_t vec_size;
> +    };
> +
>  #undef ALIGNED_INSTRUCTION
> 
>      /////////////////////////////////////////////////////////////////////////
> @@ -1591,6 +1663,22 @@ namespace ir {
>        return true;
>      }
> 
> +    INLINE bool MediaBlockReadInstruction::wellFormed(const Function &fn,
> std::string &whyNot) const {
> +      if (this->srcNum != 2) {
> +        whyNot = "Wrong number of source.";
> +        return false;
> +      }
> +      return true;
> +    }
> +
> +    INLINE bool MediaBlockWriteInstruction::wellFormed(const Function &fn,
> std::string &whyNot) const {
> +      if (this->srcNum != 2 + this->vec_size) {
> +        whyNot = "Wrong number of source.";
> +        return false;
> +      }
> +      return true;
> +    }
> +
>  #undef CHECK_TYPE
> 
>      /////////////////////////////////////////////////////////////////////////
> @@ -2058,6 +2146,14 @@ START_INTROSPECTION(PrintfInstruction)
>  #include "ir/instruction.hxx"
>  END_INTROSPECTION(PrintfInstruction)
> 
> +START_INTROSPECTION(MediaBlockReadInstruction)
> +#include "ir/instruction.hxx"
> +END_INTROSPECTION(MediaBlockReadInstruction)
> +
> +START_INTROSPECTION(MediaBlockWriteInstruction)
> +#include "ir/instruction.hxx"
> +END_INTROSPECTION(MediaBlockWriteInstruction)
> +
>  #undef END_INTROSPECTION
>  #undef START_INTROSPECTION
>  #undef DECL_INSN
> @@ -2205,7 +2301,8 @@ END_FUNCTION(Instruction, Register)
>             opcode == OP_CALC_TIMESTAMP ||
>             opcode == OP_STORE_PROFILING ||
>             opcode == OP_WAIT ||
> -           opcode == OP_PRINTF;
> +           opcode == OP_PRINTF ||
> +           opcode == OP_MBWRITE;
>    }
> 
>  #define DECL_MEM_FN(CLASS, RET, PROTOTYPE, CALL) \
> @@ -2275,6 +2372,10 @@ DECL_MEM_FN(SubGroupInstruction,
> WorkGroupOps, getWorkGroupOpcode(void), getWork
>  DECL_MEM_FN(PrintfInstruction, uint32_t, getNum(void), getNum())
>  DECL_MEM_FN(PrintfInstruction, uint32_t, getBti(void), getBti())
>  DECL_MEM_FN(PrintfInstruction, Type, getType(const Function& fn,
> uint32_t ID), getType(fn, ID))
> +DECL_MEM_FN(MediaBlockReadInstruction, uint8_t, getImageIndex(void),
> getImageIndex())
> +DECL_MEM_FN(MediaBlockReadInstruction, uint8_t, getVectorSize(void),
> getVectorSize())
> +DECL_MEM_FN(MediaBlockWriteInstruction, uint8_t, getImageIndex(void),
> getImageIndex())
> +DECL_MEM_FN(MediaBlockWriteInstruction, uint8_t, getVectorSize(void),
> getVectorSize())
> 
>  #undef DECL_MEM_FN
> 
> @@ -2582,6 +2683,15 @@ DECL_MEM_FN(MemInstruction, void,
> setBtiReg(Register reg), setBtiReg(reg))
>      return internal::PrintfInstruction(dst, srcTuple, typeTuple, srcNum, bti,
> num).convert();
>    }
> 
> +  Instruction MBREAD(uint8_t imageIndex, Tuple dst, uint8_t vec_size,
> Tuple coord, uint8_t srcNum) {
> +    return internal::MediaBlockReadInstruction(imageIndex, dst, vec_size,
> coord, srcNum).convert();
> +  }
> +
> +  Instruction MBWRITE(uint8_t imageIndex, Tuple srcTuple, uint8_t srcNum,
> uint8_t vec_size) {
> +    return internal::MediaBlockWriteInstruction(imageIndex, srcTuple,
> srcNum, vec_size).convert();
> +  }
> +
> +
>    std::ostream &operator<< (std::ostream &out, const Instruction &insn) {
>      const Function &fn = insn.getFunction();
>      const BasicBlock *bb = insn.getParent();
> diff --git a/backend/src/ir/instruction.hpp b/backend/src/ir/instruction.hpp
> index 4e7d5b7..b2b0b49 100644
> --- a/backend/src/ir/instruction.hpp
> +++ b/backend/src/ir/instruction.hpp
> @@ -635,6 +635,24 @@ namespace ir {
>      static bool isClassOf(const Instruction &insn);
>    };
> 
> +  /*! Media Block Read.  */
> +  class MediaBlockReadInstruction : public Instruction {
> +  public:
> +    /*! Return true if the given instruction is an instance of this class */
> +    static bool isClassOf(const Instruction &insn);
> +    uint8_t getImageIndex() const;
> +    uint8_t getVectorSize() const;
> +  };
> +
> +  /*! Media Block Write.  */
> +  class MediaBlockWriteInstruction : public Instruction {
> +  public:
> +    /*! Return true if the given instruction is an instance of this class */
> +    static bool isClassOf(const Instruction &insn);
> +    uint8_t getImageIndex() const;
> +    uint8_t getVectorSize() const;
> +  };
> +
>    /*! Specialize the instruction. Also performs typechecking first based on the
>     *  opcode. Crashes if it fails
>     */
> @@ -867,6 +885,10 @@ namespace ir {
>    Instruction SUBGROUP(WorkGroupOps opcode, Register dst, Tuple
> srcTuple, uint8_t srcNum, Type type);
>    /*! printf */
>    Instruction PRINTF(Register dst, Tuple srcTuple, Tuple typeTuple, uint8_t
> srcNum, uint8_t bti, uint16_t num);
> +  /*! media block read */
> +  Instruction MBREAD(uint8_t imageIndex, Tuple dst, uint8_t vec_size,
> Tuple coord, uint8_t srcNum);
> +  /*! media block write */
> +  Instruction MBWRITE(uint8_t imageIndex, Tuple srcTuple, uint8_t srcNum,
> uint8_t vec_size);
>  } /* namespace ir */
>  } /* namespace gbe */
> 
> diff --git a/backend/src/ir/instruction.hxx b/backend/src/ir/instruction.hxx
> index 57e13eb..7d755ae 100644
> --- a/backend/src/ir/instruction.hxx
> +++ b/backend/src/ir/instruction.hxx
> @@ -114,3 +114,5 @@ DECL_INSN(WAIT, WaitInstruction)
>  DECL_INSN(WORKGROUP, WorkGroupInstruction)
>  DECL_INSN(SUBGROUP, SubGroupInstruction)
>  DECL_INSN(PRINTF, PrintfInstruction)
> +DECL_INSN(MBREAD, MediaBlockReadInstruction)
> +DECL_INSN(MBWRITE, MediaBlockWriteInstruction)
> diff --git a/backend/src/ir/liveness.cpp b/backend/src/ir/liveness.cpp
> index 3162d13..43d4c87 100644
> --- a/backend/src/ir/liveness.cpp
> +++ b/backend/src/ir/liveness.cpp
> @@ -118,7 +118,8 @@ namespace ir {
>            uniform = false;
> 
>          // do not change dst uniform for block read
> -        if (insn.getOpcode() == ir::OP_LOAD &&
> ir::cast<ir::LoadInstruction>(insn).isBlock())
> +        if ((insn.getOpcode() == ir::OP_LOAD &&
> ir::cast<ir::LoadInstruction>(insn).isBlock()) ||
> +            insn.getOpcode() == ir::OP_MBREAD)
>            uniform = false;
> 
>          for (uint32_t srcID = 0; srcID < srcNum; ++srcID) {
> diff --git a/backend/src/libocl/src/ocl_substore.ll
> b/backend/src/libocl/src/ocl_substore.ll
> index 665cdfa..f6c2c70 100644
> --- a/backend/src/libocl/src/ocl_substore.ll
> +++ b/backend/src/libocl/src/ocl_substore.ll
> @@ -1,9 +1,42 @@
>  target datalayout = "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024"
>  target triple = "spir"
> +%opencl.image2d_t = type opaque
> 
>  declare void @__gen_ocl_sub_group_block_write_mem(i32 addrspace(1)*
> nocapture, i32) nounwind alwaysinline noduplicate
> +declare void
> @__gen_ocl_sub_group_block_write_image(%opencl.image2d_t
> addrspace(1)*, i32, i32, i32) nounwind alwaysinline noduplicate
> +declare void
> @__gen_ocl_sub_group_block_write_image2(%opencl.image2d_t
> addrspace(1)*, i32, i32, <2 x i32>) nounwind alwaysinline noduplicate
> +declare void
> @__gen_ocl_sub_group_block_write_image4(%opencl.image2d_t
> addrspace(1)*, i32, i32, <4 x i32>) nounwind alwaysinline noduplicate
> +declare void
> @__gen_ocl_sub_group_block_write_image8(%opencl.image2d_t
> addrspace(1)*, i32, i32, <8 x i32>) nounwind alwaysinline noduplicate
> 
>  define void @_Z27intel_sub_group_block_writePKU3AS1jj(i32
> addrspace(1)* %p, i32 %data) nounwind alwaysinline noduplicate {
>    call void @__gen_ocl_sub_group_block_write_mem(i32 addrspace(1)* %p,
> i32 %data)
>    ret void
>  }
> +
> +define void @_Z27intel_sub_group_block_write11ocl_image2dDv2_ij(%opencl.image2d_t addrspace(1)* %image, <2 x i32> %byte_coord, i32 %data) nounwind alwaysinline noduplicate {
> +  %1 = extractelement <2 x i32> %byte_coord, i32 0
> +  %2 = extractelement <2 x i32> %byte_coord, i32 1
> +  call void @__gen_ocl_sub_group_block_write_image(%opencl.image2d_t
> addrspace(1)* %image, i32 %1, i32 %2, i32 %data)
> +  ret void
> +}
> +
> +define void @_Z28intel_sub_group_block_write211ocl_image2dDv2_iDv2_j(%opencl.image2d_t addrspace(1)* %image, <2 x i32> %byte_coord, <2 x i32> %data) nounwind alwaysinline noduplicate {
> +  %1 = extractelement <2 x i32> %byte_coord, i32 0
> +  %2 = extractelement <2 x i32> %byte_coord, i32 1
> +  call void
> @__gen_ocl_sub_group_block_write_image2(%opencl.image2d_t
> addrspace(1)* %image, i32 %1, i32 %2, <2 x i32> %data)
> +  ret void
> +}
> +
> +define void @_Z28intel_sub_group_block_write411ocl_image2dDv2_iDv4_j(%opencl.image2d_t addrspace(1)* %image, <2 x i32> %byte_coord, <4 x i32> %data) nounwind alwaysinline noduplicate {
> +  %1 = extractelement <2 x i32> %byte_coord, i32 0
> +  %2 = extractelement <2 x i32> %byte_coord, i32 1
> +  call void
> @__gen_ocl_sub_group_block_write_image4(%opencl.image2d_t
> addrspace(1)* %image, i32 %1, i32 %2, <4 x i32> %data)
> +  ret void
> +}
> +
> +define void @_Z28intel_sub_group_block_write811ocl_image2dDv2_iDv8_j(%opencl.image2d_t addrspace(1)* %image, <2 x i32> %byte_coord, <8 x i32> %data) nounwind alwaysinline noduplicate {
> +  %1 = extractelement <2 x i32> %byte_coord, i32 0
> +  %2 = extractelement <2 x i32> %byte_coord, i32 1
> +  call void
> @__gen_ocl_sub_group_block_write_image8(%opencl.image2d_t
> addrspace(1)* %image, i32 %1, i32 %2, <8 x i32> %data)
> +  ret void
> +}
> diff --git a/backend/src/libocl/tmpl/ocl_simd.tmpl.cl
> b/backend/src/libocl/tmpl/ocl_simd.tmpl.cl
> index 66490cc..753a045 100644
> --- a/backend/src/libocl/tmpl/ocl_simd.tmpl.cl
> +++ b/backend/src/libocl/tmpl/ocl_simd.tmpl.cl
> @@ -187,3 +187,24 @@ OVERLOADABLE void
> intel_sub_group_block_write8(const global uint* p,uint8 data)
>    intel_sub_group_block_write(p + get_simd_size() * 6, data.s6);
>    intel_sub_group_block_write(p + get_simd_size() * 7, data.s7);
>  }
> +
> +PURE CONST uint __gen_ocl_sub_group_block_read_image(image2d_t p,
> int x, int y);
> +PURE CONST uint2 __gen_ocl_sub_group_block_read_image2(image2d_t p,
> int x, int y);
> +PURE CONST uint4 __gen_ocl_sub_group_block_read_image4(image2d_t p,
> int x, int y);
> +PURE CONST uint8 __gen_ocl_sub_group_block_read_image8(image2d_t p,
> int x, int y);
> +OVERLOADABLE uint intel_sub_group_block_read(image2d_t p, int2 cord)
> +{
> +  return __gen_ocl_sub_group_block_read_image(p, cord.x, cord.y);
> +}
> +OVERLOADABLE uint2 intel_sub_group_block_read2(image2d_t p, int2 cord)
> +{
> +  return __gen_ocl_sub_group_block_read_image2(p, cord.x, cord.y);
> +}
> +OVERLOADABLE uint4 intel_sub_group_block_read4(image2d_t p, int2 cord)
> +{
> +  return __gen_ocl_sub_group_block_read_image4(p, cord.x, cord.y);
> +}
> +OVERLOADABLE uint8 intel_sub_group_block_read8(image2d_t p, int2 cord)
> +{
> +  return __gen_ocl_sub_group_block_read_image8(p, cord.x, cord.y);
> +}
> diff --git a/backend/src/libocl/tmpl/ocl_simd.tmpl.h
> b/backend/src/libocl/tmpl/ocl_simd.tmpl.h
> index d0676be..799f772 100644
> --- a/backend/src/libocl/tmpl/ocl_simd.tmpl.h
> +++ b/backend/src/libocl/tmpl/ocl_simd.tmpl.h
> @@ -143,3 +143,13 @@ OVERLOADABLE void
> intel_sub_group_block_write(const __global uint* p, uint data)
>  OVERLOADABLE void intel_sub_group_block_write2(const __global uint* p,
> uint2 data);
>  OVERLOADABLE void intel_sub_group_block_write4(const __global uint* p,
> uint4 data);
>  OVERLOADABLE void intel_sub_group_block_write8(const __global uint* p,
> uint8 data);
> +
> +OVERLOADABLE uint intel_sub_group_block_read(image2d_t image, int2
> byte_coord);
> +OVERLOADABLE uint2 intel_sub_group_block_read2(image2d_t image, int2
> byte_coord);
> +OVERLOADABLE uint4 intel_sub_group_block_read4(image2d_t image, int2
> byte_coord);
> +OVERLOADABLE uint8 intel_sub_group_block_read8(image2d_t image, int2
> byte_coord);
> +
> +OVERLOADABLE void intel_sub_group_block_write(image2d_t image, int2
> byte_coord, uint data);
> +OVERLOADABLE void intel_sub_group_block_write2(image2d_t image, int2
> byte_coord, uint2 data);
> +OVERLOADABLE void intel_sub_group_block_write4(image2d_t image, int2
> byte_coord, uint4 data);
> +OVERLOADABLE void intel_sub_group_block_write8(image2d_t image, int2
> byte_coord, uint8 data);
> diff --git a/backend/src/llvm/llvm_gen_backend.cpp
> b/backend/src/llvm/llvm_gen_backend.cpp
> index ffa838c..2dcf308 100644
> --- a/backend/src/llvm/llvm_gen_backend.cpp
> +++ b/backend/src/llvm/llvm_gen_backend.cpp
> @@ -699,6 +699,7 @@ namespace gbe
>      void emitSubGroupInst(CallInst &I, CallSite &CS, ir::WorkGroupOps
> opcode);
>      // Emit subgroup instructions
>      void emitBlockReadWriteMemInst(CallInst &I, CallSite &CS, bool isWrite);
> +    void emitBlockReadWriteImageInst(CallInst &I, CallSite &CS, bool isWrite,
> uint8_t vec_size);
> 
>      uint8_t appendSampler(CallSite::arg_iterator AI);
>      uint8_t getImageID(CallInst &I);
> @@ -3744,10 +3745,12 @@ namespace gbe
>        case GEN_OCL_SUB_GROUP_SCAN_INCLUSIVE_MAX:
>        case GEN_OCL_SUB_GROUP_SCAN_INCLUSIVE_MIN:
>        case GEN_OCL_LRP:
> -        this->newRegister(&I);
> -        break;
>        case GEN_OCL_SUB_GROUP_BLOCK_READ_MEM:
> -        this->newRegister(&I, NULL, false);
> +      case GEN_OCL_SUB_GROUP_BLOCK_READ_IMAGE:
> +      case GEN_OCL_SUB_GROUP_BLOCK_READ_IMAGE2:
> +      case GEN_OCL_SUB_GROUP_BLOCK_READ_IMAGE4:
> +      case GEN_OCL_SUB_GROUP_BLOCK_READ_IMAGE8:
> +        this->newRegister(&I);
>          break;
>        case GEN_OCL_PRINTF:
>          this->newRegister(&I);  // fall through
> @@ -3764,6 +3767,10 @@ namespace gbe
>        case GEN_OCL_STORE_PROFILING:
>        case GEN_OCL_DEBUGWAIT:
>        case GEN_OCL_SUB_GROUP_BLOCK_WRITE_MEM:
> +      case GEN_OCL_SUB_GROUP_BLOCK_WRITE_IMAGE:
> +      case GEN_OCL_SUB_GROUP_BLOCK_WRITE_IMAGE2:
> +      case GEN_OCL_SUB_GROUP_BLOCK_WRITE_IMAGE4:
> +      case GEN_OCL_SUB_GROUP_BLOCK_WRITE_IMAGE8:
>          break;
>        case GEN_OCL_NOT_FOUND:
>        default:
> @@ -4013,6 +4020,39 @@ namespace gbe
>      GBE_ASSERT(AI == AE);
>    }
> 
> +  void GenWriter::emitBlockReadWriteImageInst(CallInst &I, CallSite &CS,
> bool isWrite, uint8_t vec_size) {
> +    CallSite::arg_iterator AI = CS.arg_begin();
> +    CallSite::arg_iterator AE = CS.arg_end();
> +    GBE_ASSERT(AI != AE);
> +
> +    const uint8_t imageID = getImageID(I);
> +    AI++;
> +
> +    if(isWrite){
> +      vector<ir::Register> srcTupleData;
> +      srcTupleData.push_back(getRegister(*(AI++)));
> +      srcTupleData.push_back(getRegister(*(AI++)));
> +      for(int i = 0;i < vec_size; i++)
> +        srcTupleData.push_back(getRegister(*(AI), i));
> +      AI++;
> +      const ir::Tuple srctuple = ctx.arrayTuple(&srcTupleData[0], 2 + vec_size);
> +      ctx.MBWRITE(imageID, srctuple, 2 + vec_size, vec_size);
> +    } else {
> +      ir::Register src[2];
> +      src[0] = getRegister(*(AI++));
> +      src[1] = getRegister(*(AI++));
> +      vector<ir::Register> dstTupleData;
> +      for(int i = 0;i < vec_size; i++)
> +        dstTupleData.push_back(getRegister(&I, i));
> +      const ir::Tuple srctuple = ctx.arrayTuple(src, 2);
> +      const ir::Tuple dsttuple = ctx.arrayTuple(&dstTupleData[0], vec_size);
> +      ctx.MBREAD(imageID, dsttuple, vec_size, srctuple, 2);
> +    }
> +
> +    GBE_ASSERT(AI == AE);
> +  }
> +
> +
>    /* append a new sampler. should be called before any reference to
>     * a sampler_t value. */
>    uint8_t GenWriter::appendSampler(CallSite::arg_iterator AI) {
> @@ -4841,6 +4881,22 @@ namespace gbe
>              this->emitBlockReadWriteMemInst(I, CS, false); break;
>            case GEN_OCL_SUB_GROUP_BLOCK_WRITE_MEM:
>              this->emitBlockReadWriteMemInst(I, CS, true); break;
> +          case GEN_OCL_SUB_GROUP_BLOCK_READ_IMAGE:
> +            this->emitBlockReadWriteImageInst(I, CS, false, 1); break;
> +          case GEN_OCL_SUB_GROUP_BLOCK_READ_IMAGE2:
> +            this->emitBlockReadWriteImageInst(I, CS, false, 2); break;
> +          case GEN_OCL_SUB_GROUP_BLOCK_READ_IMAGE4:
> +            this->emitBlockReadWriteImageInst(I, CS, false, 4); break;
> +          case GEN_OCL_SUB_GROUP_BLOCK_READ_IMAGE8:
> +            this->emitBlockReadWriteImageInst(I, CS, false, 8); break;
> +          case GEN_OCL_SUB_GROUP_BLOCK_WRITE_IMAGE:
> +            this->emitBlockReadWriteImageInst(I, CS, true, 1); break;
> +          case GEN_OCL_SUB_GROUP_BLOCK_WRITE_IMAGE2:
> +            this->emitBlockReadWriteImageInst(I, CS, true, 2); break;
> +          case GEN_OCL_SUB_GROUP_BLOCK_WRITE_IMAGE4:
> +            this->emitBlockReadWriteImageInst(I, CS, true, 4); break;
> +          case GEN_OCL_SUB_GROUP_BLOCK_WRITE_IMAGE8:
> +            this->emitBlockReadWriteImageInst(I, CS, true, 8); break;
>            default: break;
>          }
>        }
> diff --git a/backend/src/llvm/llvm_gen_ocl_function.hxx
> b/backend/src/llvm/llvm_gen_ocl_function.hxx
> index 003be91..456ab58 100644
> --- a/backend/src/llvm/llvm_gen_ocl_function.hxx
> +++ b/backend/src/llvm/llvm_gen_ocl_function.hxx
> @@ -219,6 +219,14 @@
> DECL_LLVM_GEN_FUNCTION(SUB_GROUP_SCAN_INCLUSIVE_MIN,
> __gen_ocl_sub_group_scan_in
> 
>  DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_MEM,
> __gen_ocl_sub_group_block_read_mem)
>  DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_MEM,
> __gen_ocl_sub_group_block_write_mem)
> +DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_IMAGE,
> __gen_ocl_sub_group_block_read_image)
> +DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_IMAGE2,
> __gen_ocl_sub_group_block_read_image2)
> +DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_IMAGE4,
> __gen_ocl_sub_group_block_read_image4)
> +DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_IMAGE8,
> __gen_ocl_sub_group_block_read_image8)
> +DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_IMAGE,
> __gen_ocl_sub_group_block_write_image)
> +DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_IMAGE2,
> __gen_ocl_sub_group_block_write_image2)
> +DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_IMAGE4,
> __gen_ocl_sub_group_block_write_image4)
> +DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_IMAGE8,
> __gen_ocl_sub_group_block_write_image8)
> 
>  // common function
>  DECL_LLVM_GEN_FUNCTION(LRP, __gen_ocl_lrp)
> diff --git a/backend/src/llvm/llvm_scalarize.cpp
> b/backend/src/llvm/llvm_scalarize.cpp
> index 53fd320..e60bf4b 100644
> --- a/backend/src/llvm/llvm_scalarize.cpp
> +++ b/backend/src/llvm/llvm_scalarize.cpp
> @@ -682,7 +682,21 @@ namespace gbe {
>              *CI = InsertToVector(call, *CI);
>              break;
>            }
> +          case GEN_OCL_SUB_GROUP_BLOCK_WRITE_IMAGE:
> +          case GEN_OCL_SUB_GROUP_BLOCK_WRITE_IMAGE2:
> +          case GEN_OCL_SUB_GROUP_BLOCK_WRITE_IMAGE4:
> +          case GEN_OCL_SUB_GROUP_BLOCK_WRITE_IMAGE8:
> +          {
> +            ++CI;
> +            ++CI;
> +            if ((*CI)->getType()->isVectorTy())
> +              *CI = InsertToVector(call, *CI);
> +            break;
> +          }
>            case GEN_OCL_VME:
> +          case GEN_OCL_SUB_GROUP_BLOCK_READ_IMAGE2:
> +          case GEN_OCL_SUB_GROUP_BLOCK_READ_IMAGE4:
> +          case GEN_OCL_SUB_GROUP_BLOCK_READ_IMAGE8:
>              setAppendPoint(call);
>              extractFromVector(call);
>              break;
> --
> 2.7.4
> 
> _______________________________________________
> Beignet mailing list
> Beignet at lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/beignet


More information about the Beignet mailing list