[Beignet] [PATCH V3 11/12] Backend: Add intel_sub_group_block_read/write form image

Xiuli Pan xiuli.pan at intel.com
Sat Jun 11 21:32:43 UTC 2016


From: Pan Xiuli <xiuli.pan at intel.com>

Use media block read/write to read and write data in blocks. In SIMD16 mode
the registers need some relocation for later use.
GEN7 uses a different data port.
V2: Refine block read SIMD16 with tmp reg to avoid MOVs
V3: Fix build bug with clang.

Signed-off-by: Pan Xiuli <xiuli.pan at intel.com>
---
 backend/src/backend/gen/gen_mesa_disasm.c          |  27 +++-
 backend/src/backend/gen7_encoder.cpp               |  48 +++++++
 backend/src/backend/gen7_encoder.hpp               |   4 +
 backend/src/backend/gen7_instruction.hpp           |  16 +++
 backend/src/backend/gen8_instruction.hpp           |  16 +++
 backend/src/backend/gen_context.cpp                | 155 +++++++++++++++++++++
 backend/src/backend/gen_context.hpp                |   2 +
 backend/src/backend/gen_defs.hpp                   |  16 +++
 backend/src/backend/gen_encoder.cpp                |  47 +++++++
 backend/src/backend/gen_encoder.hpp                |   4 +
 .../src/backend/gen_insn_gen7_schedule_info.hxx    |   2 +
 backend/src/backend/gen_insn_selection.cpp         | 115 ++++++++++++++-
 backend/src/backend/gen_insn_selection.hpp         |   4 +
 backend/src/backend/gen_insn_selection.hxx         |   2 +
 backend/src/ir/instruction.cpp                     | 112 ++++++++++++++-
 backend/src/ir/instruction.hpp                     |  22 +++
 backend/src/ir/instruction.hxx                     |   2 +
 backend/src/ir/liveness.cpp                        |   3 +-
 backend/src/libocl/src/ocl_substore.ll             |  33 +++++
 backend/src/libocl/tmpl/ocl_simd.tmpl.cl           |  21 +++
 backend/src/libocl/tmpl/ocl_simd.tmpl.h            |  10 ++
 backend/src/llvm/llvm_gen_backend.cpp              |  62 ++++++++-
 backend/src/llvm/llvm_gen_ocl_function.hxx         |   8 ++
 backend/src/llvm/llvm_scalarize.cpp                |  14 ++
 24 files changed, 732 insertions(+), 13 deletions(-)

diff --git a/backend/src/backend/gen/gen_mesa_disasm.c b/backend/src/backend/gen/gen_mesa_disasm.c
index 9200c26..9955dfc 100644
--- a/backend/src/backend/gen/gen_mesa_disasm.c
+++ b/backend/src/backend/gen/gen_mesa_disasm.c
@@ -1476,6 +1476,15 @@ int gen_disasm (FILE *file, const void *inst, uint32_t deviceID, uint32_t compac
                  SAMPLER_MSG_TYPE(inst),
                  SAMPLER_SIMD_MODE(inst));
           break;
+        case GEN_SFID_DATAPORT_RENDER:
+            if(UNTYPED_RW_MSG_TYPE(inst) == 4 || UNTYPED_RW_MSG_TYPE(inst) == 10)
+              format(file, " (bti: %d, %s, %s)",
+                     UNTYPED_RW_BTI(inst),
+                     data_port_data_cache_category[UNTYPED_RW_CATEGORY(inst)],
+                     data_port1_data_cache_msg_type[UNTYPED_RW_MSG_TYPE(inst)]);
+            else
+              format(file, " not implemented");
+            break;
         case GEN_SFID_DATAPORT_DATA:
           if(UNTYPED_RW_CATEGORY(inst) == 0) {
             if(UNTYPED_RW_MSG_TYPE(inst) == 5 || UNTYPED_RW_MSG_TYPE(inst) == 13)
@@ -1510,12 +1519,18 @@ int gen_disasm (FILE *file, const void *inst, uint32_t deviceID, uint32_t compac
           }
           break;
         case GEN_SFID_DATAPORT1_DATA:
-          format(file, " (bti: %d, rgba: %d, %s, %s, %s)",
-                 UNTYPED_RW_BTI(inst),
-                 UNTYPED_RW_RGBA(inst),
-                 data_port_data_cache_simd_mode[UNTYPED_RW_SIMD_MODE(inst)],
-                 data_port_data_cache_category[UNTYPED_RW_CATEGORY(inst)],
-                 data_port1_data_cache_msg_type[UNTYPED_RW_MSG_TYPE(inst)]);
+            if(UNTYPED_RW_MSG_TYPE(inst) == 4 || UNTYPED_RW_MSG_TYPE(inst) == 10)
+              format(file, " (bti: %d, %s, %s)",
+                     UNTYPED_RW_BTI(inst),
+                     data_port_data_cache_category[UNTYPED_RW_CATEGORY(inst)],
+                     data_port1_data_cache_msg_type[UNTYPED_RW_MSG_TYPE(inst)]);
+            else
+              format(file, " (bti: %d, rgba: %d, %s, %s, %s)",
+                     UNTYPED_RW_BTI(inst),
+                     UNTYPED_RW_RGBA(inst),
+                     data_port_data_cache_simd_mode[UNTYPED_RW_SIMD_MODE(inst)],
+                     data_port_data_cache_category[UNTYPED_RW_CATEGORY(inst)],
+                     data_port1_data_cache_msg_type[UNTYPED_RW_MSG_TYPE(inst)]);
           break;
         case GEN_SFID_DATAPORT_CONSTANT:
           format(file, " (bti: %d, %s)",
diff --git a/backend/src/backend/gen7_encoder.cpp b/backend/src/backend/gen7_encoder.cpp
index fc358be..abb8b77 100644
--- a/backend/src/backend/gen7_encoder.cpp
+++ b/backend/src/backend/gen7_encoder.cpp
@@ -239,5 +239,53 @@ namespace gbe
      }
   }
 
+  static void setMBlockRWGEN7(GenEncoder *p, // Fill the media block read/write message descriptor (Gen7 encoder variant)
+                          GenNativeInstruction *insn,
+                          uint32_t bti,
+                          uint32_t msg_type,
+                          uint32_t msg_length,
+                          uint32_t response_length)
+  {
+    const GenMessageTarget sfid = GEN_SFID_DATAPORT_RENDER; // this variant sends to the render data port
+    p->setMessageDescriptor(insn, sfid, msg_length, response_length);
+    insn->bits3.gen7_mblock_rw.msg_type = msg_type;
+    insn->bits3.gen7_mblock_rw.bti = bti;
+    insn->bits3.gen7_mblock_rw.header_present = 1; // always set: the message is sent with a header
+  }
+
+
+  void Gen7Encoder::MBREAD(GenRegister dst, GenRegister header, uint32_t bti, uint32_t size) { // Emit a SEND performing a media block read into dst
+    GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
+    const uint32_t msg_length = 1; // the payload is the header register only
+    const uint32_t response_length = size; // `size` registers are requested in the response
+    this->setHeader(insn);
+    this->setDst(insn, GenRegister::ud8grf(dst.nr, 0)); // only the GRF number of dst is used
+    this->setSrc0(insn, GenRegister::ud8grf(header.nr, 0)); // only the GRF number of header is used
+    this->setSrc1(insn, GenRegister::immud(0));
+    setMBlockRWGEN7(this,
+                insn,
+                bti,
+                GEN75_P1_MEDIA_BREAD,
+                msg_length,
+                response_length);
+  }
+
+  void Gen7Encoder::MBWRITE(GenRegister header, uint32_t bti, uint32_t size) { // Emit a SEND performing a media block write
+    GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
+    const uint32_t msg_length = 1 + size; // header register plus `size` further payload registers
+    const uint32_t response_length = 0; // no response registers are requested
+    this->setHeader(insn);
+    this->setDst(insn, GenRegister::retype(GenRegister::null(), GEN_TYPE_UW)); // null destination
+    this->setSrc0(insn, GenRegister::ud8grf(header.nr, 0)); // the payload starts at the header's GRF
+    this->setSrc1(insn, GenRegister::immud(0));
+    setMBlockRWGEN7(this,
+                insn,
+                bti,
+                GEN75_P1_MEDIA_TYPED_BWRITE,
+                msg_length,
+                response_length);
+  }
+
+
 #undef NO_SWIZZLE
 }
diff --git a/backend/src/backend/gen7_encoder.hpp b/backend/src/backend/gen7_encoder.hpp
index 1276c67..edb711d 100644
--- a/backend/src/backend/gen7_encoder.hpp
+++ b/backend/src/backend/gen7_encoder.hpp
@@ -42,6 +42,10 @@ namespace gbe
     virtual void setSrc1(GenNativeInstruction *insn, GenRegister reg);
     virtual void alu3(uint32_t opcode, GenRegister dst,
                        GenRegister src0, GenRegister src1, GenRegister src2);
+    /*! MBlock read */
+    virtual void MBREAD(GenRegister dst, GenRegister header, uint32_t bti, uint32_t elemSize);
+    /*! MBlock write */
+    virtual void MBWRITE(GenRegister header, uint32_t bti, uint32_t elemSize);
   };
 }
 #endif /* __GBE_GEN7_ENCODER_HPP__ */
diff --git a/backend/src/backend/gen7_instruction.hpp b/backend/src/backend/gen7_instruction.hpp
index 258dd24..7d7eada 100644
--- a/backend/src/backend/gen7_instruction.hpp
+++ b/backend/src/backend/gen7_instruction.hpp
@@ -531,6 +531,22 @@ union Gen7NativeInstruction
         uint32_t uip:16;
       } gen7_branch;
 
+      /*! Data port Media block read / write bit layout (the fields below add up to 32 bits) */
+      struct {
+        uint32_t bti:8; // presumably the binding table index of the surface -- TODO confirm
+        uint32_t ver_line_stride_offset:1;
+        uint32_t ver_line_stride:1;
+        uint32_t ver_line_stride_override:1;
+        uint32_t ignored:3;
+        uint32_t msg_type:4;
+        uint32_t category:1;
+        uint32_t header_present:1;
+        uint32_t response_length:5;
+        uint32_t msg_length:4;
+        uint32_t pad2:2;
+        uint32_t end_of_thread:1;
+      } gen7_mblock_rw;
+
       int d;
       uint32_t ud;
       float f;
diff --git a/backend/src/backend/gen8_instruction.hpp b/backend/src/backend/gen8_instruction.hpp
index ada9ffc..549948a 100644
--- a/backend/src/backend/gen8_instruction.hpp
+++ b/backend/src/backend/gen8_instruction.hpp
@@ -608,6 +608,22 @@ union Gen8NativeInstruction
         uint32_t jip:32;
       } gen8_branch;
 
+      /*! Data port Media block read / write bit layout (the fields below add up to 32 bits) */
+      struct {
+        uint32_t bti:8; // presumably the binding table index of the surface -- TODO confirm
+        uint32_t ver_line_stride_offset:1;
+        uint32_t ver_line_stride:1;
+        uint32_t ver_line_stride_override:1;
+        uint32_t ignored:3;
+        uint32_t msg_type:4;
+        uint32_t category:1;
+        uint32_t header_present:1;
+        uint32_t response_length:5;
+        uint32_t msg_length:4;
+        uint32_t pad2:2;
+        uint32_t end_of_thread:1;
+      } gen7_mblock_rw; // keeps the gen7_ name although it is declared in the Gen8 instruction union
+
       int d;
       uint32_t ud;
       float f;
diff --git a/backend/src/backend/gen_context.cpp b/backend/src/backend/gen_context.cpp
index 90b8b45..98a94ba 100644
--- a/backend/src/backend/gen_context.cpp
+++ b/backend/src/backend/gen_context.cpp
@@ -3538,6 +3538,161 @@ namespace gbe
     p->OBWRITE(header, insn.getbti(), insn.extra.elem);
   }
 
+  /*! Emit a media block read: the header is a copy of r0 patched with the x/y block origin and the
+   *  block size; SIMD8 issues one MBREAD, SIMD16 issues two (x advanced by 32) and interleaves them. */
+  void GenContext::emitMBReadInstruction(const SelectionInstruction &insn) {
+    const GenRegister dst = ra->genReg(insn.dst(0));
+    const GenRegister coordx = GenRegister::toUniform(ra->genReg(insn.src(0)),GEN_TYPE_D);
+    const GenRegister coordy = GenRegister::toUniform(ra->genReg(insn.src(1)),GEN_TYPE_D);
+    GenRegister header, offsetx, offsety, blocksizereg;
+    if (simdWidth == 8) // src(2) is the header operand in both SIMD modes; dst(0) only receives data
+      header = GenRegister::retype(ra->genReg(insn.src(2)), GEN_TYPE_UD);
+    else
+      header = GenRegister::retype(GenRegister::Qn(ra->genReg(insn.src(2)),1), GEN_TYPE_UD);
+    offsetx = GenRegister::offset(header, 0, 0*4); // header dword 0
+    offsety = GenRegister::offset(header, 0, 1*4); // header dword 1
+    blocksizereg = GenRegister::offset(header, 0, 2*4); // header dword 2
+    size_t vec_size = insn.extra.elem;
+    uint32_t blocksize = 0x1F | (vec_size-1) << 16; // low bits 0x1F, (vec_size - 1) placed at bit 16
+
+    if (simdWidth == 8)
+    {
+      p->push();
+        // Copy r0 into the header first
+        p->curr.execWidth = 8;
+        p->curr.predicate = GEN_PREDICATE_NONE;
+        p->curr.noMask = 1;
+        p->MOV(header, GenRegister::ud8grf(0,0));
+
+        // Update the header with the coord
+        p->curr.execWidth = 1;
+        p->MOV(offsetx, coordx);
+        p->MOV(offsety, coordy);
+        // Update block width and height
+        p->MOV(blocksizereg, GenRegister::immud(blocksize));
+        // Now read the data
+        p->curr.execWidth = 8;
+        p->MBREAD(dst, header, insn.getbti(), vec_size);
+      p->pop();
+    }
+    else
+    {
+      const GenRegister tmp = ra->genReg(insn.dst(vec_size)); // first temporary, placed after the vec_size results
+      p->push();
+        // Copy r0 into the header first
+        p->curr.execWidth = 8;
+        p->curr.predicate = GEN_PREDICATE_NONE;
+        p->curr.noMask = 1;
+        p->MOV(header, GenRegister::ud8grf(0,0));
+
+        // First half
+        // Update the header with the coord
+        p->curr.execWidth = 1;
+        p->MOV(offsetx, coordx);
+        p->MOV(offsety, coordy);
+        // Update block width and height
+        p->MOV(blocksizereg, GenRegister::immud(blocksize));
+        // Now read the data
+        p->curr.execWidth = 8;
+        p->MBREAD(tmp, header, insn.getbti(), vec_size);
+
+        // Second half
+        // Advance the x offset by 32 for the second read
+        p->curr.execWidth = 1;
+        p->ADD(offsetx, offsetx, GenRegister::immud(32));
+
+        const GenRegister tmp2 = GenRegister::offset(tmp, vec_size);
+        // Now read the data
+        p->curr.execWidth = 8;
+        p->MBREAD(tmp2, header, insn.getbti(), vec_size);
+
+        // Interleave both halves into dst: register 2*i from the first read, 2*i+1 from the second
+        for (uint32_t i = 0; i < vec_size; i++) {
+          p->MOV(GenRegister::offset(dst, i * 2), GenRegister::offset(tmp, i));
+          p->MOV(GenRegister::offset(dst, i * 2 + 1),
+                 GenRegister::offset(tmp2, i));
+        }
+      p->pop();
+    }
+  }
+
+  void GenContext::emitMBWriteInstruction(const SelectionInstruction &insn) { // Emit a media block write: one MBWRITE in SIMD8, two in SIMD16
+    const GenRegister coordx = GenRegister::toUniform(ra->genReg(insn.src(0)), GEN_TYPE_D);
+    const GenRegister coordy = GenRegister::toUniform(ra->genReg(insn.src(1)), GEN_TYPE_D);
+    GenRegister header, offsetx, offsety, blocksizereg;
+    size_t vec_size = insn.extra.elem;
+    uint32_t blocksize = 0x1F | (vec_size-1) << 16; // low bits 0x1F, (vec_size - 1) placed at bit 16
+    if (simdWidth == 8)
+      header = GenRegister::retype(ra->genReg(insn.dst(0)), GEN_TYPE_UD);
+    else
+      header = GenRegister::retype(GenRegister::Qn(ra->genReg(insn.dst(0)),1), GEN_TYPE_UD);
+    // Header dwords 0, 1 and 2 receive the x coordinate, the y coordinate and the block size
+    offsetx = GenRegister::offset(header, 0, 0*4);
+    offsety = GenRegister::offset(header, 0, 1*4);
+    blocksizereg = GenRegister::offset(header, 0, 2*4);
+
+    if (simdWidth == 8)
+    {
+      p->push();
+        // Copy r0 into the header first
+        p->curr.execWidth = 8;
+        p->curr.predicate = GEN_PREDICATE_NONE;
+        p->curr.noMask = 1;
+        p->MOV(header, GenRegister::ud8grf(0,0));
+
+        // Update the header with the coord
+        p->curr.execWidth = 1;
+        p->MOV(offsetx, coordx);
+        p->MOV(offsety, coordy);
+        // Update block width and height
+        p->MOV(blocksizereg, GenRegister::immud(blocksize));
+        p->curr.execWidth = 8;
+        // Copy each value (src 2..) into its payload register (dst 1..)
+        for(uint32_t i = 0; i < vec_size; i++)
+          p->MOV(ra->genReg(insn.dst(1 + i)), ra->genReg(insn.src(2 + i)));
+        // Now write the data
+        p->MBWRITE(header, insn.getbti(), vec_size);
+      p->pop();
+
+    }
+    else
+    {
+      p->push();
+        // Copy r0 into the header first
+        p->curr.execWidth = 8;
+        p->curr.predicate = GEN_PREDICATE_NONE;
+        p->curr.noMask = 1;
+        p->MOV(header, GenRegister::ud8grf(0,0));
+
+        // First half
+        // Update the header with the coord
+        p->curr.execWidth = 1;
+        p->MOV(offsetx, coordx);
+        p->MOV(offsety, coordy);
+        // Update block width and height
+        p->MOV(blocksizereg, GenRegister::immud(blocksize));
+        // The payload copy and the write run with execWidth 8
+        p->curr.execWidth = 8;
+        // Copy each value into the registers that follow the header
+        for(uint32_t i = 0; i < vec_size; i++)
+          p->MOV(GenRegister::offset(header, 1 + i), ra->genReg(insn.src(2 + i)));
+        p->MBWRITE(header, insn.getbti(), vec_size);
+
+        // Second half
+        // Advance the x offset by 32 for the second write
+        p->curr.execWidth = 1;
+        p->ADD(offsetx, offsetx, GenRegister::immud(32));
+
+        p->curr.execWidth = 8;
+        // Copy the Qn(..., 1) part of each value into the registers that follow the header
+        for(uint32_t i = 0; i < vec_size; i++)
+          p->MOV(GenRegister::offset(header, 1 + i), GenRegister::Qn(ra->genReg(insn.src(2 + i)), 1));
+        // Now write the data
+        p->MBWRITE(header, insn.getbti(), vec_size);
+
+      p->pop();
+    }
+  }
 
   BVAR(OCL_OUTPUT_REG_ALLOC, false);
   BVAR(OCL_OUTPUT_ASM, false);
diff --git a/backend/src/backend/gen_context.hpp b/backend/src/backend/gen_context.hpp
index a634338..fb3d4fe 100644
--- a/backend/src/backend/gen_context.hpp
+++ b/backend/src/backend/gen_context.hpp
@@ -189,6 +189,8 @@ namespace gbe
     void afterMessage(const SelectionInstruction &insn, GenRegister bti, GenRegister flagTemp, GenRegister btiTmp, unsigned jip0);
     void emitOBReadInstruction(const SelectionInstruction &insn);
     void emitOBWriteInstruction(const SelectionInstruction &insn);
+    void emitMBReadInstruction(const SelectionInstruction &insn);
+    void emitMBWriteInstruction(const SelectionInstruction &insn);
 
     /*! Implements base class */
     virtual Kernel *allocateKernel(void);
diff --git a/backend/src/backend/gen_defs.hpp b/backend/src/backend/gen_defs.hpp
index 09cb2ba..66ae5b5 100644
--- a/backend/src/backend/gen_defs.hpp
+++ b/backend/src/backend/gen_defs.hpp
@@ -784,6 +784,22 @@ union GenNativeInstruction
         uint32_t jip:32;
       } gen8_branch;
 
+      /*! Data port Media block read / write bit layout (the fields below add up to 32 bits) */
+      struct {
+        uint32_t bti:8; // presumably the binding table index of the surface -- TODO confirm
+        uint32_t ver_line_stride_offset:1;
+        uint32_t ver_line_stride:1;
+        uint32_t ver_line_stride_override:1;
+        uint32_t ignored:3;
+        uint32_t msg_type:4;
+        uint32_t category:1;
+        uint32_t header_present:1;
+        uint32_t response_length:5;
+        uint32_t msg_length:4;
+        uint32_t pad2:2;
+        uint32_t end_of_thread:1;
+      } gen7_mblock_rw;
+
       int d;
       uint32_t ud;
       float f;
diff --git a/backend/src/backend/gen_encoder.cpp b/backend/src/backend/gen_encoder.cpp
index e745b9c..eb9fbeb 100644
--- a/backend/src/backend/gen_encoder.cpp
+++ b/backend/src/backend/gen_encoder.cpp
@@ -276,6 +276,21 @@ namespace gbe
     insn->bits3.gen7_oblock_rw.header_present = 1;
   }
 
+  static void setMBlockRW(GenEncoder *p, // Fill the media block read/write message descriptor (generic encoder variant)
+                          GenNativeInstruction *insn,
+                          uint32_t bti,
+                          uint32_t msg_type,
+                          uint32_t msg_length,
+                          uint32_t response_length)
+  {
+    const GenMessageTarget sfid = GEN_SFID_DATAPORT1_DATA; // this variant sends to data port 1
+    p->setMessageDescriptor(insn, sfid, msg_length, response_length);
+    insn->bits3.gen7_mblock_rw.msg_type = msg_type;
+    insn->bits3.gen7_mblock_rw.bti = bti;
+    insn->bits3.gen7_mblock_rw.header_present = 1; // always set: the message is sent with a header
+  }
+
+
   static void setDWordScatterMessgae(GenEncoder *p,
                                      GenNativeInstruction *insn,
                                      uint32_t bti,
@@ -1277,6 +1292,38 @@ namespace gbe
                 response_length);
   }
 
+  void GenEncoder::MBREAD(GenRegister dst, GenRegister header, uint32_t bti, uint32_t size) { // Emit a SEND performing a media block read into dst
+    GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
+    const uint32_t msg_length = 1; // the payload is the header register only
+    const uint32_t response_length = size; // `size` registers are requested in the response
+    this->setHeader(insn);
+    this->setDst(insn, GenRegister::ud8grf(dst.nr, 0)); // only the GRF number of dst is used
+    this->setSrc0(insn, GenRegister::ud8grf(header.nr, 0)); // only the GRF number of header is used
+    this->setSrc1(insn, GenRegister::immud(0));
+    setMBlockRW(this,
+                insn,
+                bti,
+                GEN75_P1_MEDIA_BREAD,
+                msg_length,
+                response_length);
+  }
+
+  void GenEncoder::MBWRITE(GenRegister header, uint32_t bti, uint32_t size) { // Emit a SEND performing a media block write
+    GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
+    const uint32_t msg_length = 1 + size; // header register plus `size` further payload registers
+    const uint32_t response_length = 0; // no response registers are requested
+    this->setHeader(insn);
+    this->setDst(insn, GenRegister::retype(GenRegister::null(), GEN_TYPE_UW)); // null destination
+    this->setSrc0(insn, GenRegister::ud8grf(header.nr, 0)); // the payload starts at the header's GRF
+    this->setSrc1(insn, GenRegister::immud(0));
+    setMBlockRW(this,
+                insn,
+                bti,
+                GEN75_P1_MEDIA_TYPED_BWRITE,
+                msg_length,
+                response_length);
+  }
+
   void GenEncoder::EOT(uint32_t msg) {
     GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
     this->setDst(insn, GenRegister::retype(GenRegister::null(), GEN_TYPE_UD));
diff --git a/backend/src/backend/gen_encoder.hpp b/backend/src/backend/gen_encoder.hpp
index a53c879..4979305 100644
--- a/backend/src/backend/gen_encoder.hpp
+++ b/backend/src/backend/gen_encoder.hpp
@@ -271,6 +271,10 @@ namespace gbe
     void OBREAD(GenRegister dst, GenRegister header, uint32_t bti, uint32_t elemSize);
     /*! OBlock write */
     void OBWRITE(GenRegister header, uint32_t bti, uint32_t elemSize);
+    /*! MBlock read */
+    virtual void MBREAD(GenRegister dst, GenRegister header, uint32_t bti, uint32_t elemSize);
+    /*! MBlock write */
+    virtual void MBWRITE(GenRegister header, uint32_t bti, uint32_t elemSize);
 
     GBE_CLASS(GenEncoder); //!< Use custom allocators
     virtual void alu3(uint32_t opcode, GenRegister dst,
diff --git a/backend/src/backend/gen_insn_gen7_schedule_info.hxx b/backend/src/backend/gen_insn_gen7_schedule_info.hxx
index d297726..c396626 100644
--- a/backend/src/backend/gen_insn_gen7_schedule_info.hxx
+++ b/backend/src/backend/gen_insn_gen7_schedule_info.hxx
@@ -52,3 +52,5 @@ DECL_GEN7_SCHEDULE(SubGroupOp,      80,        1,        1)
 DECL_GEN7_SCHEDULE(Printf,          80,        1,        1)
 DECL_GEN7_SCHEDULE(OBRead,          80,        1,        1)
 DECL_GEN7_SCHEDULE(OBWrite,         80,        1,        1)
+DECL_GEN7_SCHEDULE(MBRead,          80,        1,        1)
+DECL_GEN7_SCHEDULE(MBWrite,         80,        1,        1)
diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp
index e974e97..d3c5a40c 100644
--- a/backend/src/backend/gen_insn_selection.cpp
+++ b/backend/src/backend/gen_insn_selection.cpp
@@ -189,7 +189,8 @@ namespace gbe
            this->opcode == SEL_OP_SAMPLE ||
            this->opcode == SEL_OP_VME ||
            this->opcode == SEL_OP_DWORD_GATHER ||
-           this->opcode == SEL_OP_OBREAD;
+           this->opcode == SEL_OP_OBREAD ||
+           this->opcode == SEL_OP_MBREAD;
   }
 
   bool SelectionInstruction::modAcc(void) const {
@@ -212,7 +213,8 @@ namespace gbe
            this->opcode == SEL_OP_ATOMIC        ||
            this->opcode == SEL_OP_BYTE_SCATTER  ||
            this->opcode == SEL_OP_TYPED_WRITE ||
-           this->opcode == SEL_OP_OBWRITE;
+           this->opcode == SEL_OP_OBWRITE ||
+           this->opcode == SEL_OP_MBWRITE;
   }
 
   bool SelectionInstruction::isBranch(void) const {
@@ -703,6 +705,10 @@ namespace gbe
     void OBREAD(GenRegister dst, GenRegister addr, GenRegister header, uint32_t bti, uint32_t size);
     /*! Oblock write */
     void OBWRITE(GenRegister addr, GenRegister value, GenRegister header, uint32_t bti, uint32_t size);
+    /*! Media block read */
+    void MBREAD(GenRegister* dsts, GenRegister coordx, GenRegister coordy, GenRegister header, GenRegister* tmp, uint32_t bti, uint32_t vec_size);
+    /*! Media block write */
+    void MBWRITE(GenRegister coordx, GenRegister coordy, GenRegister* values, GenRegister header, GenRegister* tmp, uint32_t bti, uint32_t vec_size);
 
     /* common functions for both binary instruction and sel_cmp and compare instruction.
        It will handle the IMM or normal register assignment, and will try to avoid LOADI
@@ -2055,6 +2061,63 @@ namespace gbe
     vector->isSrc = 1;
   }
 
+  void Selection::Opaque::MBREAD(GenRegister* dsts, // Append a SEL_OP_MBREAD selection instruction
+                                 GenRegister coordx,
+                                 GenRegister coordy,
+                                 GenRegister header,
+                                 GenRegister* tmp,
+                                 uint32_t bti,
+                                 uint32_t vec_size) {
+    SelectionInstruction *insn = this->appendInsn(SEL_OP_MBREAD, vec_size * 2, 3); // 2 * vec_size dsts, 3 srcs
+    SelectionVector *vector = this->appendVector();
+    SelectionVector *vectortmp = this->appendVector();
+    for (uint32_t i = 0; i < vec_size; ++i) {
+      insn->dst(i) = dsts[i]; // results occupy dst[0, vec_size)
+      insn->dst(i + vec_size) = tmp[i]; // temporaries occupy dst[vec_size, 2 * vec_size); both arrays need vec_size entries
+    }
+    insn->src(0) = coordx;
+    insn->src(1) = coordy;
+    insn->src(2) = header;
+    insn->setbti(bti);
+    insn->extra.elem = vec_size; // vector size
+
+    vector->regNum = vec_size; // first vector: the vec_size result registers
+    vector->reg = &insn->dst(0);
+    vector->offsetID = 0;
+    vector->isSrc = 0;
+    vectortmp->regNum = vec_size; // second vector: the vec_size temporaries
+    vectortmp->reg = &insn->dst(vec_size);
+    vectortmp->offsetID = 0;
+    vectortmp->isSrc = 0;
+
+  }
+
+  void Selection::Opaque::MBWRITE(GenRegister coordx, // Append a SEL_OP_MBWRITE selection instruction
+                                  GenRegister coordy,
+                                  GenRegister* values,
+                                  GenRegister header,
+                                  GenRegister* tmp,
+                                  uint32_t bti,
+                                  uint32_t vec_size) {
+    SelectionInstruction *insn = this->appendInsn(SEL_OP_MBWRITE, 1 + vec_size, 2 + vec_size); // dsts: header + temporaries; srcs: x, y + values
+    SelectionVector *vector = this->appendVector();
+    insn->src(0) = coordx;
+    insn->src(1) = coordy;
+    for (uint32_t i = 0; i < vec_size; ++i)
+      insn->src(2 + i) = values[i]; // values to store follow the two coordinates
+    insn->dst(0) = header;
+    for (uint32_t i = 0; i < vec_size; ++i)
+      insn->dst(1 + i) = tmp[i]; // temporaries follow the header
+    insn->state = this->curr;
+    insn->setbti(bti);
+    insn->extra.elem = vec_size; // vector size
+
+    // We need to put the header and the data together: one vector spans dst(0) .. dst(vec_size)
+    vector->regNum = 1 + vec_size;
+    vector->reg = &insn->dst(0);
+    vector->offsetID = 0;
+    vector->isSrc = 0;
+  }
 
   // Boiler plate to initialize the selection library at c++ pre-main
   static SelectionLibrary *selLib = NULL;
@@ -6583,6 +6646,52 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp
     }
   };
 
+  /*! Media Block Read pattern: lowers ir::MediaBlockReadInstruction through sel.MBREAD */
+  DECL_PATTERN(MediaBlockReadInstruction)
+  {
+    bool emitOne(Selection::Opaque &sel, const ir::MediaBlockReadInstruction &insn, bool &markChildren) const
+    {
+      using namespace ir;
+      uint32_t vec_size = insn.getVectorSize();
+      vector<GenRegister> valuesVec;
+      vector<GenRegister> tmpVec;
+      for (uint32_t i = 0; i < vec_size; ++i) {
+        valuesVec.push_back(sel.selReg(insn.getDst(i), TYPE_U32)); // the read results are the IR destinations
+        tmpVec.push_back(sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32)); // one fresh temporary per result
+      }
+      const GenRegister coordx = sel.selReg(insn.getSrc(0), TYPE_U32);
+      const GenRegister coordy = sel.selReg(insn.getSrc(1), TYPE_U32);
+      const GenRegister header = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32); // fresh register for the message header
+      sel.MBREAD(&valuesVec[0], coordx, coordy, header, &tmpVec[0], insn.getImageIndex(), vec_size);
+      return true;
+    }
+    DECL_CTOR(MediaBlockReadInstruction, 1, 1);
+  };
+
+  /*! Media Block Write pattern: lowers ir::MediaBlockWriteInstruction through sel.MBWRITE */
+  DECL_PATTERN(MediaBlockWriteInstruction)
+  {
+    bool emitOne(Selection::Opaque &sel, const ir::MediaBlockWriteInstruction &insn, bool &markChildren) const
+    {
+      using namespace ir;
+      uint32_t vec_size = insn.getVectorSize();
+      const GenRegister coordx = sel.selReg(insn.getSrc(0), TYPE_U32);
+      const GenRegister coordy = sel.selReg(insn.getSrc(1), TYPE_U32);
+      vector<GenRegister> valuesVec;
+      vector<GenRegister> tmpVec;
+      for(uint32_t i = 0; i < vec_size; i++)
+      {
+        valuesVec.push_back(sel.selReg(insn.getSrc(2 + i), TYPE_U32)); // values to store follow the two coordinates
+        tmpVec.push_back(sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32)); // one fresh temporary per value
+      }
+      const GenRegister header = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32); // fresh register for the message header
+      sel.MBWRITE(coordx, coordy, &valuesVec[0], header, &tmpVec[0], insn.getImageIndex(), vec_size);
+      return true;
+    }
+    DECL_CTOR(MediaBlockWriteInstruction, 1, 1);
+  };
+
+
   /*! Sort patterns */
   INLINE bool cmp(const SelectionPattern *p0, const SelectionPattern *p1) {
     if (p0->insnNum != p1->insnNum)
@@ -6624,6 +6733,8 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp
     this->insert<NullaryInstructionPattern>();
     this->insert<WaitInstructionPattern>();
     this->insert<PrintfInstructionPattern>();
+    this->insert<MediaBlockReadInstructionPattern>();
+    this->insert<MediaBlockWriteInstructionPattern>();
 
     // Sort all the patterns with the number of instructions they output
     for (uint32_t op = 0; op < ir::OP_INVALID; ++op)
diff --git a/backend/src/backend/gen_insn_selection.hpp b/backend/src/backend/gen_insn_selection.hpp
index 51af686..b481de8 100644
--- a/backend/src/backend/gen_insn_selection.hpp
+++ b/backend/src/backend/gen_insn_selection.hpp
@@ -177,6 +177,8 @@ namespace gbe
       switch (opcode) {
         case SEL_OP_OBREAD:
         case SEL_OP_OBWRITE:
+        case SEL_OP_MBREAD:
+        case SEL_OP_MBWRITE:
         case SEL_OP_DWORD_GATHER: return extra.function;
         case SEL_OP_SAMPLE: return extra.rdbti;
         case SEL_OP_VME: return extra.vme_bti;
@@ -192,6 +194,8 @@ namespace gbe
       switch (opcode) {
         case SEL_OP_OBREAD:
         case SEL_OP_OBWRITE:
+        case SEL_OP_MBREAD:
+        case SEL_OP_MBWRITE:
         case SEL_OP_DWORD_GATHER: extra.function = bti; return;
         case SEL_OP_SAMPLE: extra.rdbti = bti; return;
         case SEL_OP_VME: extra.vme_bti = bti; return;
diff --git a/backend/src/backend/gen_insn_selection.hxx b/backend/src/backend/gen_insn_selection.hxx
index 4a7caff..ccaf526 100644
--- a/backend/src/backend/gen_insn_selection.hxx
+++ b/backend/src/backend/gen_insn_selection.hxx
@@ -98,3 +98,5 @@ DECL_SELECTION_IR(SUBGROUP_OP, SubGroupOpInstruction)
 DECL_SELECTION_IR(PRINTF, PrintfInstruction)
 DECL_SELECTION_IR(OBREAD, OBReadInstruction)
 DECL_SELECTION_IR(OBWRITE, OBWriteInstruction)
+DECL_SELECTION_IR(MBREAD, MBReadInstruction)
+DECL_SELECTION_IR(MBWRITE, MBWriteInstruction)
diff --git a/backend/src/ir/instruction.cpp b/backend/src/ir/instruction.cpp
index 88491a7..ed64580 100644
--- a/backend/src/ir/instruction.cpp
+++ b/backend/src/ir/instruction.cpp
@@ -1064,6 +1064,78 @@ namespace ir {
         Register dst[1];
     };
 
+    class ALIGNED_INSTRUCTION MediaBlockReadInstruction : // IR instruction OP_MBREAD: media block read from an image
+      public BasePolicy,
+      public TupleSrcPolicy<MediaBlockReadInstruction>,
+      public TupleDstPolicy<MediaBlockReadInstruction>
+    {
+    public:
+      INLINE MediaBlockReadInstruction(uint8_t imageIdx, Tuple dst, uint8_t vec_size, Tuple srcTuple, uint8_t srcNum) {
+        this->opcode = OP_MBREAD;
+        this->dst = dst;
+        this->dstNum = vec_size; // one destination per vector element
+        this->src = srcTuple;
+        this->srcNum = srcNum;
+        this->imageIdx = imageIdx;
+      }
+      INLINE bool wellFormed(const Function &fn, std::string &why) const;
+      INLINE void out(std::ostream &out, const Function &fn) const { // Print: opcode, vector size, {dsts}, surface id, x and y
+        this->outOpcode(out);
+        out << (int)this->getVectorSize();
+        out << " {";
+        for (uint32_t i = 0; i < dstNum; ++i)
+          out << "%" << this->getDst(fn, i) << (i != (dstNum-1u) ? " " : ""); // space-separated, no trailing space
+        out << "}";
+        out << " 2D surface id " << (int)this->getImageIndex()
+            << " byte coord x %" << this->getSrc(fn, 0)
+            << " row coord y %" << this->getSrc(fn, 1);
+      }
+      INLINE uint8_t getImageIndex(void) const { return this->imageIdx; }
+      INLINE uint8_t getVectorSize(void) const { return this->dstNum; } // the vector size is the destination count
+
+      Tuple src; // source tuple; out() prints source 0 as x and source 1 as y
+      Tuple dst; // destination tuple holding dstNum registers
+      uint8_t imageIdx; // image index, printed by out() as the surface id
+      uint8_t srcNum; // number of sources; wellFormed is declared here and defined outside the class
+      uint8_t dstNum; // number of destinations, equal to the vector size
+    };
+
+    class ALIGNED_INSTRUCTION MediaBlockWriteInstruction : // IR instruction OP_MBWRITE: media block write to an image
+      public BasePolicy,
+      public TupleSrcPolicy<MediaBlockWriteInstruction>,
+      public NDstPolicy<MediaBlockWriteInstruction, 0>
+    {
+    public:
+
+      INLINE MediaBlockWriteInstruction(uint8_t imageIdx, Tuple srcTuple, uint8_t srcNum, uint8_t vec_size) {
+        this->opcode = OP_MBWRITE;
+        this->src = srcTuple;
+        this->srcNum = srcNum;
+        this->imageIdx = imageIdx;
+        this->vec_size = vec_size;
+      }
+      INLINE bool wellFormed(const Function &fn, std::string &why) const;
+      INLINE void out(std::ostream &out, const Function &fn) const { // Print: opcode, vector size, surface id, x, y, {values}
+        this->outOpcode(out);
+        out << (int)this->getVectorSize()
+            << " 2D surface id " << (int)this->getImageIndex()
+            << " byte coord x %" << this->getSrc(fn, 0)
+            << " row coord y %" << this->getSrc(fn, 1);
+        out << " {";
+        for (uint32_t i = 0; i < vec_size; ++i)
+          out << "%" << this->getSrc(fn, i + 2) << (i != (vec_size-1u) ? " " : ""); // values start at source 2
+        out << "}";
+      }
+      INLINE uint8_t getImageIndex(void) const { return this->imageIdx; }
+      INLINE uint8_t getVectorSize(void) const { return this->vec_size; }
+
+      Tuple src; // source tuple: x, y, then vec_size values
+      Register dst[0]; // no destination registers (matches NDstPolicy<..., 0>)
+      uint8_t imageIdx; // image index, printed by out() as the surface id
+      uint8_t srcNum; // number of sources in the tuple
+      uint8_t vec_size; // number of values written
+    };
+
 #undef ALIGNED_INSTRUCTION
 
     /////////////////////////////////////////////////////////////////////////
@@ -1591,6 +1663,22 @@ namespace ir {
       return true;
     }
 
+    INLINE bool MediaBlockReadInstruction::wellFormed(const Function &fn, std::string &whyNot) const { // Validate a media block read; on failure sets whyNot and returns false
+      if (this->srcNum != 2) { // a read takes exactly two sources (printed by out() as the x and y coordinates)
+        whyNot = "Wrong number of source.";
+        return false;
+      }
+      return true; // no other property is checked; fn is unused
+    }
+
+    INLINE bool MediaBlockWriteInstruction::wellFormed(const Function &fn, std::string &whyNot) const { // Validate a media block write; on failure sets whyNot and returns false
+      if (this->srcNum != 2 + this->vec_size) { // sources must be the two coordinates plus vec_size data registers
+        whyNot = "Wrong number of source.";
+        return false;
+      }
+      return true; // no other property is checked; fn is unused
+    }
+
 #undef CHECK_TYPE
 
     /////////////////////////////////////////////////////////////////////////
@@ -2058,6 +2146,14 @@ START_INTROSPECTION(PrintfInstruction)
 #include "ir/instruction.hxx"
 END_INTROSPECTION(PrintfInstruction)
 
+START_INTROSPECTION(MediaBlockReadInstruction) // NOTE(review): same stanza as the other instruction classes; presumably generates isClassOf() - macro body not visible here, confirm
+#include "ir/instruction.hxx" // re-includes the DECL_INSN opcode/class list
+END_INTROSPECTION(MediaBlockReadInstruction)
+
+START_INTROSPECTION(MediaBlockWriteInstruction) // NOTE(review): same stanza as the other instruction classes; presumably generates isClassOf() - macro body not visible here, confirm
+#include "ir/instruction.hxx" // re-includes the DECL_INSN opcode/class list
+END_INTROSPECTION(MediaBlockWriteInstruction)
+
 #undef END_INTROSPECTION
 #undef START_INTROSPECTION
 #undef DECL_INSN
@@ -2205,7 +2301,8 @@ END_FUNCTION(Instruction, Register)
            opcode == OP_CALC_TIMESTAMP ||
            opcode == OP_STORE_PROFILING ||
            opcode == OP_WAIT ||
-           opcode == OP_PRINTF;
+           opcode == OP_PRINTF ||
+           opcode == OP_MBWRITE;
   }
 
 #define DECL_MEM_FN(CLASS, RET, PROTOTYPE, CALL) \
@@ -2275,6 +2372,10 @@ DECL_MEM_FN(SubGroupInstruction, WorkGroupOps, getWorkGroupOpcode(void), getWork
 DECL_MEM_FN(PrintfInstruction, uint32_t, getNum(void), getNum())
 DECL_MEM_FN(PrintfInstruction, uint32_t, getBti(void), getBti())
 DECL_MEM_FN(PrintfInstruction, Type, getType(const Function& fn, uint32_t ID), getType(fn, ID))
+DECL_MEM_FN(MediaBlockReadInstruction, uint8_t, getImageIndex(void), getImageIndex()) // defines the public getter declared in instruction.hpp; NOTE(review): presumably forwards to internal::MediaBlockReadInstruction - macro body not visible, confirm
+DECL_MEM_FN(MediaBlockReadInstruction, uint8_t, getVectorSize(void), getVectorSize()) // same pattern for the vector size
+DECL_MEM_FN(MediaBlockWriteInstruction, uint8_t, getImageIndex(void), getImageIndex()) // write-side counterpart of the image index getter
+DECL_MEM_FN(MediaBlockWriteInstruction, uint8_t, getVectorSize(void), getVectorSize()) // write-side counterpart of the vector size getter
 
 #undef DECL_MEM_FN
 
@@ -2582,6 +2683,15 @@ DECL_MEM_FN(MemInstruction, void,     setBtiReg(Register reg), setBtiReg(reg))
     return internal::PrintfInstruction(dst, srcTuple, typeTuple, srcNum, bti, num).convert();
   }
 
+  Instruction MBREAD(uint8_t imageIndex, Tuple dst, uint8_t vec_size, Tuple coord, uint8_t srcNum) { // Factory: build a media block read with vec_size destinations in dst and srcNum sources in coord (wellFormed() requires srcNum == 2)
+    return internal::MediaBlockReadInstruction(imageIndex, dst, vec_size, coord, srcNum).convert(); // construct the internal node, then convert it to the generic Instruction
+  }
+
+  Instruction MBWRITE(uint8_t imageIndex, Tuple srcTuple, uint8_t srcNum, uint8_t vec_size) { // Factory: build a media block write; srcTuple is {x, y, data...} and wellFormed() requires srcNum == 2 + vec_size
+    return internal::MediaBlockWriteInstruction(imageIndex, srcTuple, srcNum, vec_size).convert(); // construct the internal node, then convert it to the generic Instruction
+  }
+
+
   std::ostream &operator<< (std::ostream &out, const Instruction &insn) {
     const Function &fn = insn.getFunction();
     const BasicBlock *bb = insn.getParent();
diff --git a/backend/src/ir/instruction.hpp b/backend/src/ir/instruction.hpp
index 4e7d5b7..b2b0b49 100644
--- a/backend/src/ir/instruction.hpp
+++ b/backend/src/ir/instruction.hpp
@@ -635,6 +635,24 @@ namespace ir {
     static bool isClassOf(const Instruction &insn);
   };
 
+  /*! Media block read: loads getVectorSize() registers from the 2D image surface getImageIndex() (opcode MBREAD). */
+  class MediaBlockReadInstruction : public Instruction {
+  public:
+    /*! Return true if the given instruction is an instance of this class */
+    static bool isClassOf(const Instruction &insn);
+    uint8_t getImageIndex() const; // image (surface) index the read targets
+    uint8_t getVectorSize() const; // number of destination registers
+  };
+
+  /*! Media block write: stores getVectorSize() data registers to the 2D image surface getImageIndex() (opcode MBWRITE). */
+  class MediaBlockWriteInstruction : public Instruction {
+  public:
+    /*! Return true if the given instruction is an instance of this class */
+    static bool isClassOf(const Instruction &insn);
+    uint8_t getImageIndex() const; // image (surface) index the write targets
+    uint8_t getVectorSize() const; // number of data registers written
+  };
+
   /*! Specialize the instruction. Also performs typechecking first based on the
    *  opcode. Crashes if it fails
    */
@@ -867,6 +885,10 @@ namespace ir {
   Instruction SUBGROUP(WorkGroupOps opcode, Register dst, Tuple srcTuple, uint8_t srcNum, Type type);
   /*! printf */
   Instruction PRINTF(Register dst, Tuple srcTuple, Tuple typeTuple, uint8_t srcNum, uint8_t bti, uint16_t num);
+  /*! media block read: dst holds vec_size result registers, coord holds the srcNum (== 2) coordinate registers x, y */
+  Instruction MBREAD(uint8_t imageIndex, Tuple dst, uint8_t vec_size, Tuple coord, uint8_t srcNum);
+  /*! media block write: srcTuple is {x, y, data...} with srcNum == 2 + vec_size entries */
+  Instruction MBWRITE(uint8_t imageIndex, Tuple srcTuple, uint8_t srcNum, uint8_t vec_size);
 } /* namespace ir */
 } /* namespace gbe */
 
diff --git a/backend/src/ir/instruction.hxx b/backend/src/ir/instruction.hxx
index 57e13eb..7d755ae 100644
--- a/backend/src/ir/instruction.hxx
+++ b/backend/src/ir/instruction.hxx
@@ -114,3 +114,5 @@ DECL_INSN(WAIT, WaitInstruction)
 DECL_INSN(WORKGROUP, WorkGroupInstruction)
 DECL_INSN(SUBGROUP, SubGroupInstruction)
 DECL_INSN(PRINTF, PrintfInstruction)
+DECL_INSN(MBREAD, MediaBlockReadInstruction) // pairs opcode name MBREAD (tested as ir::OP_MBREAD in liveness.cpp) with its instruction class
+DECL_INSN(MBWRITE, MediaBlockWriteInstruction) // pairs opcode name MBWRITE (set as OP_MBWRITE by the constructor) with its instruction class
diff --git a/backend/src/ir/liveness.cpp b/backend/src/ir/liveness.cpp
index 3162d13..43d4c87 100644
--- a/backend/src/ir/liveness.cpp
+++ b/backend/src/ir/liveness.cpp
@@ -118,7 +118,8 @@ namespace ir {
           uniform = false;
 
         // do not change dst uniform for block read
-        if (insn.getOpcode() == ir::OP_LOAD && ir::cast<ir::LoadInstruction>(insn).isBlock())
+        if ((insn.getOpcode() == ir::OP_LOAD && ir::cast<ir::LoadInstruction>(insn).isBlock()) ||
+            insn.getOpcode() == ir::OP_MBREAD)
           uniform = false;
 
         for (uint32_t srcID = 0; srcID < srcNum; ++srcID) {
diff --git a/backend/src/libocl/src/ocl_substore.ll b/backend/src/libocl/src/ocl_substore.ll
index 665cdfa..f6c2c70 100644
--- a/backend/src/libocl/src/ocl_substore.ll
+++ b/backend/src/libocl/src/ocl_substore.ll
@@ -1,9 +1,42 @@
 target datalayout = "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024"
 target triple = "spir"
+%opencl.image2d_t = type opaque
 
 declare void @__gen_ocl_sub_group_block_write_mem(i32 addrspace(1)* nocapture, i32) nounwind alwaysinline noduplicate
+declare void @__gen_ocl_sub_group_block_write_image(%opencl.image2d_t addrspace(1)*, i32, i32, i32) nounwind alwaysinline noduplicate ; backend intrinsic: (image, x, y, data); matched by name in llvm_gen_ocl_function.hxx
+declare void @__gen_ocl_sub_group_block_write_image2(%opencl.image2d_t addrspace(1)*, i32, i32, <2 x i32>) nounwind alwaysinline noduplicate ; 2-element data variant
+declare void @__gen_ocl_sub_group_block_write_image4(%opencl.image2d_t addrspace(1)*, i32, i32, <4 x i32>) nounwind alwaysinline noduplicate ; 4-element data variant
+declare void @__gen_ocl_sub_group_block_write_image8(%opencl.image2d_t addrspace(1)*, i32, i32, <8 x i32>) nounwind alwaysinline noduplicate ; 8-element data variant
 
 define void @_Z27intel_sub_group_block_writePKU3AS1jj(i32 addrspace(1)* %p, i32 %data) nounwind alwaysinline noduplicate {
   call void @__gen_ocl_sub_group_block_write_mem(i32 addrspace(1)* %p, i32 %data)
   ret void
 }
+
+define void @_Z27intel_sub_group_block_write11ocl_image2dDv2_ij(%opencl.image2d_t addrspace(1)* %image, <2 x i32> %byte_coord, i32 %data) nounwind alwaysinline noduplicate { ; mangled intel_sub_group_block_write(image2d_t, int2, uint)
+  %1 = extractelement <2 x i32> %byte_coord, i32 0 ; x component of the coordinate
+  %2 = extractelement <2 x i32> %byte_coord, i32 1 ; y component of the coordinate
+  call void @__gen_ocl_sub_group_block_write_image(%opencl.image2d_t addrspace(1)* %image, i32 %1, i32 %2, i32 %data) ; forward to the backend intrinsic with the coordinate split into scalars
+  ret void
+}
+
+define void @_Z28intel_sub_group_block_write211ocl_image2dDv2_iDv2_j(%opencl.image2d_t addrspace(1)* %image, <2 x i32> %byte_coord, <2 x i32> %data) nounwind alwaysinline noduplicate { ; mangled intel_sub_group_block_write2(image2d_t, int2, uint2)
+  %1 = extractelement <2 x i32> %byte_coord, i32 0 ; x component of the coordinate
+  %2 = extractelement <2 x i32> %byte_coord, i32 1 ; y component of the coordinate
+  call void @__gen_ocl_sub_group_block_write_image2(%opencl.image2d_t addrspace(1)* %image, i32 %1, i32 %2, <2 x i32> %data) ; forward to the backend intrinsic; the data vector is passed through unchanged
+  ret void
+}
+
+define void @_Z28intel_sub_group_block_write411ocl_image2dDv2_iDv4_j(%opencl.image2d_t addrspace(1)* %image, <2 x i32> %byte_coord, <4 x i32> %data) nounwind alwaysinline noduplicate { ; mangled intel_sub_group_block_write4(image2d_t, int2, uint4)
+  %1 = extractelement <2 x i32> %byte_coord, i32 0 ; x component of the coordinate
+  %2 = extractelement <2 x i32> %byte_coord, i32 1 ; y component of the coordinate
+  call void @__gen_ocl_sub_group_block_write_image4(%opencl.image2d_t addrspace(1)* %image, i32 %1, i32 %2, <4 x i32> %data) ; forward to the backend intrinsic; the data vector is passed through unchanged
+  ret void
+}
+
+define void @_Z28intel_sub_group_block_write811ocl_image2dDv2_iDv8_j(%opencl.image2d_t addrspace(1)* %image, <2 x i32> %byte_coord, <8 x i32> %data) nounwind alwaysinline noduplicate { ; mangled intel_sub_group_block_write8(image2d_t, int2, uint8)
+  %1 = extractelement <2 x i32> %byte_coord, i32 0 ; x component of the coordinate
+  %2 = extractelement <2 x i32> %byte_coord, i32 1 ; y component of the coordinate
+  call void @__gen_ocl_sub_group_block_write_image8(%opencl.image2d_t addrspace(1)* %image, i32 %1, i32 %2, <8 x i32> %data) ; forward to the backend intrinsic; the data vector is passed through unchanged
+  ret void
+}
diff --git a/backend/src/libocl/tmpl/ocl_simd.tmpl.cl b/backend/src/libocl/tmpl/ocl_simd.tmpl.cl
index 66490cc..753a045 100644
--- a/backend/src/libocl/tmpl/ocl_simd.tmpl.cl
+++ b/backend/src/libocl/tmpl/ocl_simd.tmpl.cl
@@ -187,3 +187,24 @@ OVERLOADABLE void intel_sub_group_block_write8(const global uint* p,uint8 data)
   intel_sub_group_block_write(p + get_simd_size() * 6, data.s6);
   intel_sub_group_block_write(p + get_simd_size() * 7, data.s7);
 }
+
+PURE CONST uint __gen_ocl_sub_group_block_read_image(image2d_t p, int x, int y); // backend intrinsic (no body here); matched by name in llvm_gen_ocl_function.hxx
+PURE CONST uint2 __gen_ocl_sub_group_block_read_image2(image2d_t p, int x, int y); // 2-element result variant
+PURE CONST uint4 __gen_ocl_sub_group_block_read_image4(image2d_t p, int x, int y); // 4-element result variant
+PURE CONST uint8 __gen_ocl_sub_group_block_read_image8(image2d_t p, int x, int y); // 8-element result variant
+OVERLOADABLE uint intel_sub_group_block_read(image2d_t image, int2 byte_coord) // Sub-group block read of one uint from a 2D image; parameter names match the prototype in ocl_simd.tmpl.h
+{
+  return __gen_ocl_sub_group_block_read_image(image, byte_coord.x, byte_coord.y); // split the coordinate into scalar x / y for the backend intrinsic
+}
+OVERLOADABLE uint2 intel_sub_group_block_read2(image2d_t image, int2 byte_coord) // Sub-group block read of a uint2 from a 2D image; parameter names match the prototype in ocl_simd.tmpl.h
+{
+  return __gen_ocl_sub_group_block_read_image2(image, byte_coord.x, byte_coord.y); // split the coordinate into scalar x / y for the backend intrinsic
+}
+OVERLOADABLE uint4 intel_sub_group_block_read4(image2d_t image, int2 byte_coord) // Sub-group block read of a uint4 from a 2D image; parameter names match the prototype in ocl_simd.tmpl.h
+{
+  return __gen_ocl_sub_group_block_read_image4(image, byte_coord.x, byte_coord.y); // split the coordinate into scalar x / y for the backend intrinsic
+}
+OVERLOADABLE uint8 intel_sub_group_block_read8(image2d_t image, int2 byte_coord) // Sub-group block read of a uint8 from a 2D image; parameter names match the prototype in ocl_simd.tmpl.h
+{
+  return __gen_ocl_sub_group_block_read_image8(image, byte_coord.x, byte_coord.y); // split the coordinate into scalar x / y for the backend intrinsic
+}
diff --git a/backend/src/libocl/tmpl/ocl_simd.tmpl.h b/backend/src/libocl/tmpl/ocl_simd.tmpl.h
index d0676be..799f772 100644
--- a/backend/src/libocl/tmpl/ocl_simd.tmpl.h
+++ b/backend/src/libocl/tmpl/ocl_simd.tmpl.h
@@ -143,3 +143,13 @@ OVERLOADABLE void intel_sub_group_block_write(const __global uint* p, uint data)
 OVERLOADABLE void intel_sub_group_block_write2(const __global uint* p, uint2 data);
 OVERLOADABLE void intel_sub_group_block_write4(const __global uint* p, uint4 data);
 OVERLOADABLE void intel_sub_group_block_write8(const __global uint* p, uint8 data);
+
+OVERLOADABLE uint intel_sub_group_block_read(image2d_t image, int2 byte_coord); // image overload; the IR dump labels the coordinate "byte coord x" / "row coord y"
+OVERLOADABLE uint2 intel_sub_group_block_read2(image2d_t image, int2 byte_coord); // returns 2 uints
+OVERLOADABLE uint4 intel_sub_group_block_read4(image2d_t image, int2 byte_coord); // returns 4 uints
+OVERLOADABLE uint8 intel_sub_group_block_read8(image2d_t image, int2 byte_coord); // returns 8 uints
+
+OVERLOADABLE void intel_sub_group_block_write(image2d_t image, int2 byte_coord, uint data); // image overload; body lives in ocl_substore.ll as hand-written LLVM IR
+OVERLOADABLE void intel_sub_group_block_write2(image2d_t image, int2 byte_coord, uint2 data); // writes 2 uints
+OVERLOADABLE void intel_sub_group_block_write4(image2d_t image, int2 byte_coord, uint4 data); // writes 4 uints
+OVERLOADABLE void intel_sub_group_block_write8(image2d_t image, int2 byte_coord, uint8 data); // writes 8 uints
diff --git a/backend/src/llvm/llvm_gen_backend.cpp b/backend/src/llvm/llvm_gen_backend.cpp
index ffa838c..2dcf308 100644
--- a/backend/src/llvm/llvm_gen_backend.cpp
+++ b/backend/src/llvm/llvm_gen_backend.cpp
@@ -699,6 +699,7 @@ namespace gbe
     void emitSubGroupInst(CallInst &I, CallSite &CS, ir::WorkGroupOps opcode);
     // Emit subgroup instructions
     void emitBlockReadWriteMemInst(CallInst &I, CallSite &CS, bool isWrite);
+    void emitBlockReadWriteImageInst(CallInst &I, CallSite &CS, bool isWrite, uint8_t vec_size);
 
     uint8_t appendSampler(CallSite::arg_iterator AI);
     uint8_t getImageID(CallInst &I);
@@ -3744,10 +3745,12 @@ namespace gbe
       case GEN_OCL_SUB_GROUP_SCAN_INCLUSIVE_MAX:
       case GEN_OCL_SUB_GROUP_SCAN_INCLUSIVE_MIN:
       case GEN_OCL_LRP:
-        this->newRegister(&I);
-        break;
       case GEN_OCL_SUB_GROUP_BLOCK_READ_MEM:
-        this->newRegister(&I, NULL, false);
+      case GEN_OCL_SUB_GROUP_BLOCK_READ_IMAGE:
+      case GEN_OCL_SUB_GROUP_BLOCK_READ_IMAGE2:
+      case GEN_OCL_SUB_GROUP_BLOCK_READ_IMAGE4:
+      case GEN_OCL_SUB_GROUP_BLOCK_READ_IMAGE8:
+        this->newRegister(&I);
         break;
       case GEN_OCL_PRINTF:
         this->newRegister(&I);  // fall through
@@ -3764,6 +3767,10 @@ namespace gbe
       case GEN_OCL_STORE_PROFILING:
       case GEN_OCL_DEBUGWAIT:
       case GEN_OCL_SUB_GROUP_BLOCK_WRITE_MEM:
+      case GEN_OCL_SUB_GROUP_BLOCK_WRITE_IMAGE:
+      case GEN_OCL_SUB_GROUP_BLOCK_WRITE_IMAGE2:
+      case GEN_OCL_SUB_GROUP_BLOCK_WRITE_IMAGE4:
+      case GEN_OCL_SUB_GROUP_BLOCK_WRITE_IMAGE8:
         break;
       case GEN_OCL_NOT_FOUND:
       default:
@@ -4013,6 +4020,39 @@ namespace gbe
     GBE_ASSERT(AI == AE);
   }
 
+  void GenWriter::emitBlockReadWriteImageInst(CallInst &I, CallSite &CS, bool isWrite, uint8_t vec_size) { // Lower a __gen_ocl_sub_group_block_{read,write}_image* call into an MBWRITE (isWrite) or MBREAD IR instruction with vec_size data registers
+    CallSite::arg_iterator AI = CS.arg_begin();
+    CallSite::arg_iterator AE = CS.arg_end();
+    GBE_ASSERT(AI != AE); // the call must have at least one argument
+
+    const uint8_t imageID = getImageID(I); // image index is resolved from the call instruction, not through AI
+    AI++; // step past the first (image) argument
+
+    if(isWrite){
+      vector<ir::Register> srcTupleData; // sources in the order MBWRITE expects: x, y, then the data registers
+      srcTupleData.push_back(getRegister(*(AI++))); // x coordinate
+      srcTupleData.push_back(getRegister(*(AI++))); // y coordinate
+      for(int i = 0;i < vec_size; i++)
+        srcTupleData.push_back(getRegister(*(AI), i)); // NOTE(review): presumably the register for element i of the data argument - confirm
+      AI++; // step past the data argument
+      const ir::Tuple srctuple = ctx.arrayTuple(&srcTupleData[0], 2 + vec_size);
+      ctx.MBWRITE(imageID, srctuple, 2 + vec_size, vec_size); // srcNum = 2 coordinates + vec_size data registers
+    } else {
+      ir::Register src[2]; // the two coordinate sources
+      src[0] = getRegister(*(AI++)); // x coordinate
+      src[1] = getRegister(*(AI++)); // y coordinate
+      vector<ir::Register> dstTupleData; // one destination register per result element
+      for(int i = 0;i < vec_size; i++)
+        dstTupleData.push_back(getRegister(&I, i)); // NOTE(review): presumably the register for element i of the call's result - confirm
+      const ir::Tuple srctuple = ctx.arrayTuple(src, 2);
+      const ir::Tuple dsttuple = ctx.arrayTuple(&dstTupleData[0], vec_size);
+      ctx.MBREAD(imageID, dsttuple, vec_size, srctuple, 2); // srcNum is always 2 for a read
+    }
+
+    GBE_ASSERT(AI == AE); // every call argument must have been consumed
+  }
+
+
   /* append a new sampler. should be called before any reference to
    * a sampler_t value. */
   uint8_t GenWriter::appendSampler(CallSite::arg_iterator AI) {
@@ -4841,6 +4881,22 @@ namespace gbe
             this->emitBlockReadWriteMemInst(I, CS, false); break;
           case GEN_OCL_SUB_GROUP_BLOCK_WRITE_MEM:
             this->emitBlockReadWriteMemInst(I, CS, true); break;
+          case GEN_OCL_SUB_GROUP_BLOCK_READ_IMAGE:
+            this->emitBlockReadWriteImageInst(I, CS, false, 1); break;
+          case GEN_OCL_SUB_GROUP_BLOCK_READ_IMAGE2:
+            this->emitBlockReadWriteImageInst(I, CS, false, 2); break;
+          case GEN_OCL_SUB_GROUP_BLOCK_READ_IMAGE4:
+            this->emitBlockReadWriteImageInst(I, CS, false, 4); break;
+          case GEN_OCL_SUB_GROUP_BLOCK_READ_IMAGE8:
+            this->emitBlockReadWriteImageInst(I, CS, false, 8); break;
+          case GEN_OCL_SUB_GROUP_BLOCK_WRITE_IMAGE:
+            this->emitBlockReadWriteImageInst(I, CS, true, 1); break;
+          case GEN_OCL_SUB_GROUP_BLOCK_WRITE_IMAGE2:
+            this->emitBlockReadWriteImageInst(I, CS, true, 2); break;
+          case GEN_OCL_SUB_GROUP_BLOCK_WRITE_IMAGE4:
+            this->emitBlockReadWriteImageInst(I, CS, true, 4); break;
+          case GEN_OCL_SUB_GROUP_BLOCK_WRITE_IMAGE8:
+            this->emitBlockReadWriteImageInst(I, CS, true, 8); break;
           default: break;
         }
       }
diff --git a/backend/src/llvm/llvm_gen_ocl_function.hxx b/backend/src/llvm/llvm_gen_ocl_function.hxx
index 003be91..456ab58 100644
--- a/backend/src/llvm/llvm_gen_ocl_function.hxx
+++ b/backend/src/llvm/llvm_gen_ocl_function.hxx
@@ -219,6 +219,14 @@ DECL_LLVM_GEN_FUNCTION(SUB_GROUP_SCAN_INCLUSIVE_MIN, __gen_ocl_sub_group_scan_in
 
 DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_MEM, __gen_ocl_sub_group_block_read_mem)
 DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_MEM, __gen_ocl_sub_group_block_write_mem)
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_IMAGE, __gen_ocl_sub_group_block_read_image)
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_IMAGE2, __gen_ocl_sub_group_block_read_image2)
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_IMAGE4, __gen_ocl_sub_group_block_read_image4)
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_IMAGE8, __gen_ocl_sub_group_block_read_image8)
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_IMAGE, __gen_ocl_sub_group_block_write_image)
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_IMAGE2, __gen_ocl_sub_group_block_write_image2)
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_IMAGE4, __gen_ocl_sub_group_block_write_image4)
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_IMAGE8, __gen_ocl_sub_group_block_write_image8)
 
 // common function
 DECL_LLVM_GEN_FUNCTION(LRP, __gen_ocl_lrp)
diff --git a/backend/src/llvm/llvm_scalarize.cpp b/backend/src/llvm/llvm_scalarize.cpp
index 53fd320..e60bf4b 100644
--- a/backend/src/llvm/llvm_scalarize.cpp
+++ b/backend/src/llvm/llvm_scalarize.cpp
@@ -682,7 +682,21 @@ namespace gbe {
             *CI = InsertToVector(call, *CI);
             break;
           }
+          case GEN_OCL_SUB_GROUP_BLOCK_WRITE_IMAGE:
+          case GEN_OCL_SUB_GROUP_BLOCK_WRITE_IMAGE2:
+          case GEN_OCL_SUB_GROUP_BLOCK_WRITE_IMAGE4:
+          case GEN_OCL_SUB_GROUP_BLOCK_WRITE_IMAGE8:
+          {
+            ++CI;
+            ++CI;
+            if ((*CI)->getType()->isVectorTy())
+              *CI = InsertToVector(call, *CI);
+            break;
+          }
           case GEN_OCL_VME:
+          case GEN_OCL_SUB_GROUP_BLOCK_READ_IMAGE2:
+          case GEN_OCL_SUB_GROUP_BLOCK_READ_IMAGE4:
+          case GEN_OCL_SUB_GROUP_BLOCK_READ_IMAGE8:
             setAppendPoint(call);
             extractFromVector(call);
             break;
-- 
2.7.4



More information about the Beignet mailing list