[Beignet] [PATCH v2 1/5] add extension cl_intel_media_block_io READ related function

xionghu.luo at intel.com xionghu.luo at intel.com
Wed Mar 8 15:00:37 UTC 2017


From: Luo Xionghu <xionghu.luo at intel.com>

v2: add #define cl_intel_media_block_io in libocl; move the extension check
code to this patch.

Signed-off-by: Luo Xionghu <xionghu.luo at intel.com>
---
 backend/src/backend/gen_insn_selection.cpp |  52 +++++++++++--
 backend/src/ir/instruction.cpp             |  14 +++-
 backend/src/ir/instruction.hpp             |   4 +-
 backend/src/libocl/include/ocl.h           |   1 +
 backend/src/libocl/tmpl/ocl_simd.tmpl.cl   | 117 ++++++++++++++++++++++++-----
 backend/src/libocl/tmpl/ocl_simd.tmpl.h    |  17 +++++
 backend/src/llvm/llvm_gen_backend.cpp      |  89 +++++++++++++++++++++-
 backend/src/llvm/llvm_gen_ocl_function.hxx |   6 ++
 backend/src/llvm/llvm_scalarize.cpp        |   5 ++
 src/cl_extensions.h                        |   1 +
 utests/utest_helper.cpp                    |  20 +++++
 utests/utest_helper.hpp                    |   3 +
 12 files changed, 296 insertions(+), 33 deletions(-)

diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp
index 1cab40c..0ee8d13 100644
--- a/backend/src/backend/gen_insn_selection.cpp
+++ b/backend/src/backend/gen_insn_selection.cpp
@@ -7808,6 +7808,27 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp
     }
   };
 
+  static uint32_t fixBlockSize(uint8_t width, uint8_t height, uint32_t vec_size,
+                               uint32_t typeSize, uint32_t simdWidth,
+                               uint32_t &block_width) { // block_width is an output: typeSize * element count of one row
+    uint32_t blocksize = 0;
+    if (width && height) { // an explicit width x height region was requested
+      if (width * height * typeSize > vec_size * simdWidth * typeSize) { // region exceeds vec_size * simdWidth elements: shrink it
+        if (width <= simdWidth * vec_size) {
+          height = vec_size * simdWidth / width; // keep the width, reduce the number of rows
+        } else {
+          height = 1; // fall back to a single row ...
+          width = vec_size * simdWidth / height; // ... of vec_size * simdWidth elements
+        }
+      }
+    } else { // width or height is 0: default to simdWidth columns by vec_size rows
+      width = simdWidth;
+      height = vec_size;
+    }
+    block_width = typeSize * (width < simdWidth ? width : simdWidth); // a row never spans more than simdWidth elements
+    blocksize = (block_width - 1) % 32 | (height - 1) << 16; // low part: (block_width - 1) mod 32; bits 16 and up: height - 1
+    return blocksize;
+  }
   /*! Media Block Read pattern */
   DECL_PATTERN(MediaBlockReadInstruction)
   {
@@ -7817,19 +7838,26 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp
       uint32_t vec_size = insn.getVectorSize();
       uint32_t simdWidth = sel.curr.execWidth;
       const Type type = insn.getType();
-      const uint32_t typeSize = type == TYPE_U32 ? 4 : 2;
+      uint32_t typeSize = 0;
+      if(type == TYPE_U32) {
+        typeSize = 4;
+      }else if(type == TYPE_U16) {
+        typeSize = 2;
+      }else if(type == TYPE_U8) {
+        typeSize = 1;
+      }else
+        NOT_IMPLEMENTED;
       uint32_t response_size = simdWidth * vec_size * typeSize / 32;
       // ushort in simd8 will have half reg thus 0.5 reg size, but response lenght is still 1
       response_size = response_size ? response_size : 1;
-      uint32_t block_width = typeSize * simdWidth;
-      uint32_t blocksize = (block_width - 1) % 32 | (vec_size - 1) << 16;
-
+      uint32_t block_width = 0;
+      uint32_t blocksize = fixBlockSize(insn.getWidth(), insn.getHeight(), vec_size, typeSize, simdWidth, block_width);
 
       vector<GenRegister> valuesVec;
       vector<GenRegister> tmpVec;
       for (uint32_t i = 0; i < vec_size; ++i) {
         valuesVec.push_back(sel.selReg(insn.getDst(i), type));
-        if(simdWidth == 16 && typeSize == 4)
+        if((simdWidth == 16 && typeSize == 4) || typeSize == 1)
           tmpVec.push_back(GenRegister::ud8grf(sel.reg(FAMILY_REG)));
       }
       const GenRegister coordx = GenRegister::toUniform(sel.selReg(insn.getSrc(0), TYPE_U32), GEN_TYPE_UD);
@@ -7855,15 +7883,23 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp
         sel.MOV(blocksizereg, GenRegister::immud(blocksize));
       sel.pop();
 
-      if (simdWidth * typeSize < 64) {
+      if (block_width < 64) {
         sel.push();
           sel.curr.execWidth = 8;
           sel.curr.predicate = GEN_PREDICATE_NONE;
           sel.curr.noMask = 1;
           // Now read the data
-          sel.MBREAD(&valuesVec[0], vec_size, header, insn.getImageIndex(), response_size);
+          if(typeSize == 1) {
+            sel.MBREAD(&tmpVec[0], vec_size, header, insn.getImageIndex(), response_size);
+            for (uint32_t i = 0; i < vec_size; i++) {
+              sel.MOV(valuesVec[i], sel.getOffsetReg(GenRegister::retype(tmpVec[0], GEN_TYPE_UB), 0, i*simdWidth));
+              sel.MOV(sel.getOffsetReg(valuesVec[i], 0, 16), sel.getOffsetReg(GenRegister::retype(tmpVec[0], GEN_TYPE_UB), 0, i*simdWidth + 8));
+            }
+          }else
+            sel.MBREAD(&valuesVec[0], vec_size, header, insn.getImageIndex(), response_size);
+
         sel.pop();
-      } else if (simdWidth * typeSize == 64) {
+      } else if (block_width == 64) {
         sel.push();
           sel.curr.execWidth = 8;
           sel.curr.predicate = GEN_PREDICATE_NONE;
diff --git a/backend/src/ir/instruction.cpp b/backend/src/ir/instruction.cpp
index f0c3957..72d914a 100644
--- a/backend/src/ir/instruction.cpp
+++ b/backend/src/ir/instruction.cpp
@@ -1070,7 +1070,7 @@ namespace ir {
       public TupleDstPolicy<MediaBlockReadInstruction>
     {
     public:
-      INLINE MediaBlockReadInstruction(uint8_t imageIdx, Tuple dst, uint8_t vec_size, Tuple srcTuple, uint8_t srcNum, Type type) {
+      INLINE MediaBlockReadInstruction(uint8_t imageIdx, Tuple dst, uint8_t vec_size, Tuple srcTuple, uint8_t srcNum, Type type, uint8_t width, uint8_t height) {
         this->opcode = OP_MBREAD;
         this->dst = dst;
         this->dstNum = vec_size;
@@ -1078,6 +1078,8 @@ namespace ir {
         this->srcNum = srcNum;
         this->imageIdx = imageIdx;
         this->type = type;
+        this->width = width;
+        this->height = height;
       }
       INLINE bool wellFormed(const Function &fn, std::string &why) const;
       INLINE void out(std::ostream &out, const Function &fn) const {
@@ -1095,6 +1097,8 @@ namespace ir {
       INLINE uint8_t getImageIndex(void) const { return this->imageIdx; }
       INLINE uint8_t getVectorSize(void) const { return this->dstNum; }
       INLINE Type getType(void) const { return this->type; }
+      INLINE uint8_t getWidth(void) const { return this->width; }
+      INLINE uint8_t getHeight(void) const { return this->height; }
 
       Tuple src;
       Tuple dst;
@@ -1102,6 +1106,8 @@ namespace ir {
       uint8_t srcNum;
       uint8_t dstNum;
       Type type;
+      uint8_t width;
+      uint8_t height;
     };
 
     class ALIGNED_INSTRUCTION MediaBlockWriteInstruction :
@@ -2409,6 +2415,8 @@ DECL_MEM_FN(PrintfInstruction, Type, getType(const Function& fn, uint32_t ID), g
 DECL_MEM_FN(MediaBlockReadInstruction, uint8_t, getImageIndex(void), getImageIndex())
 DECL_MEM_FN(MediaBlockReadInstruction, uint8_t, getVectorSize(void), getVectorSize())
 DECL_MEM_FN(MediaBlockReadInstruction, Type, getType(void), getType())
+DECL_MEM_FN(MediaBlockReadInstruction, uint8_t, getWidth(void), getWidth())
+DECL_MEM_FN(MediaBlockReadInstruction, uint8_t, getHeight(void), getHeight())
 DECL_MEM_FN(MediaBlockWriteInstruction, uint8_t, getImageIndex(void), getImageIndex())
 DECL_MEM_FN(MediaBlockWriteInstruction, uint8_t, getVectorSize(void), getVectorSize())
 DECL_MEM_FN(MediaBlockWriteInstruction, Type, getType(void), getType())
@@ -2720,8 +2728,8 @@ DECL_MEM_FN(MemInstruction, void,     setBtiReg(Register reg), setBtiReg(reg))
     return internal::PrintfInstruction(dst, srcTuple, typeTuple, srcNum, bti, num).convert();
   }
 
-  Instruction MBREAD(uint8_t imageIndex, Tuple dst, uint8_t vec_size, Tuple coord, uint8_t srcNum, Type type) {
-    return internal::MediaBlockReadInstruction(imageIndex, dst, vec_size, coord, srcNum, type).convert();
+  Instruction MBREAD(uint8_t imageIndex, Tuple dst, uint8_t vec_size, Tuple coord, uint8_t srcNum, Type type, uint8_t width, uint8_t height) {
+    return internal::MediaBlockReadInstruction(imageIndex, dst, vec_size, coord, srcNum, type, width, height).convert();
   }
 
   Instruction MBWRITE(uint8_t imageIndex, Tuple srcTuple, uint8_t srcNum, uint8_t vec_size, Type type) {
diff --git a/backend/src/ir/instruction.hpp b/backend/src/ir/instruction.hpp
index 16c2045..7e90576 100644
--- a/backend/src/ir/instruction.hpp
+++ b/backend/src/ir/instruction.hpp
@@ -646,6 +646,8 @@ namespace ir {
     uint8_t getImageIndex() const;
     uint8_t getVectorSize() const;
     Type getType(void) const;
+    uint8_t getWidth() const;
+    uint8_t getHeight() const;
   };
 
   /*! Media Block Write.  */
@@ -893,7 +895,7 @@ namespace ir {
   /*! printf */
   Instruction PRINTF(Register dst, Tuple srcTuple, Tuple typeTuple, uint8_t srcNum, uint8_t bti, uint16_t num);
   /*! media block read */
-  Instruction MBREAD(uint8_t imageIndex, Tuple dst, uint8_t vec_size, Tuple coord, uint8_t srcNum, Type type);
+  Instruction MBREAD(uint8_t imageIndex, Tuple dst, uint8_t vec_size, Tuple coord, uint8_t srcNum, Type type, uint8_t width, uint8_t height);
   /*! media block write */
   Instruction MBWRITE(uint8_t imageIndex, Tuple srcTuple, uint8_t srcNum, uint8_t vec_size, Type type);
 } /* namespace ir */
diff --git a/backend/src/libocl/include/ocl.h b/backend/src/libocl/include/ocl.h
index 2548cb7..8ed878b 100644
--- a/backend/src/libocl/include/ocl.h
+++ b/backend/src/libocl/include/ocl.h
@@ -123,6 +123,7 @@
 #define cl_khr_3d_image_writes
 #define cl_intel_subgroups
 #define cl_intel_subgroups_short
+#define cl_intel_media_block_io
 
 #pragma OPENCL EXTENSION cl_khr_fp64 : disable
 #pragma OPENCL EXTENSION cl_khr_fp16 : disable
diff --git a/backend/src/libocl/tmpl/ocl_simd.tmpl.cl b/backend/src/libocl/tmpl/ocl_simd.tmpl.cl
index 97e33fe..55bf6f0 100644
--- a/backend/src/libocl/tmpl/ocl_simd.tmpl.cl
+++ b/backend/src/libocl/tmpl/ocl_simd.tmpl.cl
@@ -262,41 +262,61 @@ OVERLOADABLE void intel_sub_group_block_write_ui8(global uint* p,uint8 data)
   __gen_ocl_sub_group_block_write_ui_mem8(p, data);
 }
 
-PURE CONST uint __gen_ocl_sub_group_block_read_ui_image(image2d_t p, int x, int y);
-PURE CONST uint2 __gen_ocl_sub_group_block_read_ui_image2(image2d_t p, int x, int y);
-PURE CONST uint4 __gen_ocl_sub_group_block_read_ui_image4(image2d_t p, int x, int y);
-PURE CONST uint8 __gen_ocl_sub_group_block_read_ui_image8(image2d_t p, int x, int y);
+PURE CONST uint __gen_ocl_sub_group_block_read_ui_image(image2d_t p, int x, int y, int w, int h);
+PURE CONST uint2 __gen_ocl_sub_group_block_read_ui_image2(image2d_t p, int x, int y, int w, int h);
+PURE CONST uint4 __gen_ocl_sub_group_block_read_ui_image4(image2d_t p, int x, int y, int w, int h);
+PURE CONST uint8 __gen_ocl_sub_group_block_read_ui_image8(image2d_t p, int x, int y, int w, int h);
 OVERLOADABLE uint intel_sub_group_block_read(image2d_t p, int2 cord)
 {
-  return __gen_ocl_sub_group_block_read_ui_image(p, cord.x, cord.y);
+  return __gen_ocl_sub_group_block_read_ui_image(p, cord.x, cord.y, 0, 0);
 }
 OVERLOADABLE uint2 intel_sub_group_block_read2(image2d_t p, int2 cord)
 {
-  return __gen_ocl_sub_group_block_read_ui_image2(p, cord.x, cord.y);
+  return __gen_ocl_sub_group_block_read_ui_image2(p, cord.x, cord.y, 0, 0);
 }
 OVERLOADABLE uint4 intel_sub_group_block_read4(image2d_t p, int2 cord)
 {
-  return __gen_ocl_sub_group_block_read_ui_image4(p, cord.x, cord.y);
+  return __gen_ocl_sub_group_block_read_ui_image4(p, cord.x, cord.y, 0, 0);
 }
 OVERLOADABLE uint8 intel_sub_group_block_read8(image2d_t p, int2 cord)
 {
-  return __gen_ocl_sub_group_block_read_ui_image8(p, cord.x, cord.y);
+  return __gen_ocl_sub_group_block_read_ui_image8(p, cord.x, cord.y, 0, 0);
 }
 OVERLOADABLE uint intel_sub_group_block_read_ui(image2d_t p, int2 cord)
 {
-  return __gen_ocl_sub_group_block_read_ui_image(p, cord.x, cord.y);
+  return __gen_ocl_sub_group_block_read_ui_image(p, cord.x, cord.y, 0, 0);
 }
 OVERLOADABLE uint2 intel_sub_group_block_read_ui2(image2d_t p, int2 cord)
 {
-  return __gen_ocl_sub_group_block_read_ui_image2(p, cord.x, cord.y);
+  return __gen_ocl_sub_group_block_read_ui_image2(p, cord.x, cord.y, 0, 0);
 }
 OVERLOADABLE uint4 intel_sub_group_block_read_ui4(image2d_t p, int2 cord)
 {
-  return __gen_ocl_sub_group_block_read_ui_image4(p, cord.x, cord.y);
+  return __gen_ocl_sub_group_block_read_ui_image4(p, cord.x, cord.y, 0, 0);
 }
 OVERLOADABLE uint8 intel_sub_group_block_read_ui8(image2d_t p, int2 cord)
 {
-  return __gen_ocl_sub_group_block_read_ui_image8(p, cord.x, cord.y);
+  return __gen_ocl_sub_group_block_read_ui_image8(p, cord.x, cord.y, 0, 0);
+}
+
+OVERLOADABLE uint intel_sub_group_media_block_read_ui(int2 src_byte_offset, int width, int height, read_only image2d_t image)
+{
+  return __gen_ocl_sub_group_block_read_ui_image(image, src_byte_offset.x, src_byte_offset.y, width, height);
+}
+
+OVERLOADABLE uint2 intel_sub_group_media_block_read_ui2(int2 src_byte_offset, int width, int height, read_only image2d_t image)
+{
+  return __gen_ocl_sub_group_block_read_ui_image2(image, src_byte_offset.x, src_byte_offset.y, width, height);
+}
+
+OVERLOADABLE uint4 intel_sub_group_media_block_read_ui4(int2 src_byte_offset, int width, int height, read_only image2d_t image)
+{
+  return __gen_ocl_sub_group_block_read_ui_image4(image, src_byte_offset.x, src_byte_offset.y, width, height);
+}
+
+OVERLOADABLE uint8 intel_sub_group_media_block_read_ui8(int2 src_byte_offset, int width, int height, read_only image2d_t image)
+{
+  return __gen_ocl_sub_group_block_read_ui_image8(image, src_byte_offset.x, src_byte_offset.y, width, height);
 }
 
 void __gen_ocl_sub_group_block_write_ui_image(image2d_t p, int x, int y, uint data);
@@ -378,25 +398,51 @@ OVERLOADABLE void intel_sub_group_block_write_us8(global ushort* p,ushort8 data)
   __gen_ocl_sub_group_block_write_us_mem8(p, data);
 }
 
-PURE CONST ushort __gen_ocl_sub_group_block_read_us_image(image2d_t p, int x, int y);
-PURE CONST ushort2 __gen_ocl_sub_group_block_read_us_image2(image2d_t p, int x, int y);
-PURE CONST ushort4 __gen_ocl_sub_group_block_read_us_image4(image2d_t p, int x, int y);
-PURE CONST ushort8 __gen_ocl_sub_group_block_read_us_image8(image2d_t p, int x, int y);
+PURE CONST ushort __gen_ocl_sub_group_block_read_us_image(image2d_t p, int x, int y, int w, int h);
+PURE CONST ushort2 __gen_ocl_sub_group_block_read_us_image2(image2d_t p, int x, int y, int w, int h);
+PURE CONST ushort4 __gen_ocl_sub_group_block_read_us_image4(image2d_t p, int x, int y, int w, int h);
+PURE CONST ushort8 __gen_ocl_sub_group_block_read_us_image8(image2d_t p, int x, int y, int w, int h);
+PURE CONST ushort16 __gen_ocl_sub_group_block_read_us_image16(image2d_t p, int x, int y, int w, int h);
 OVERLOADABLE ushort intel_sub_group_block_read_us(image2d_t p, int2 cord)
 {
-  return __gen_ocl_sub_group_block_read_us_image(p, cord.x, cord.y);
+  return __gen_ocl_sub_group_block_read_us_image(p, cord.x, cord.y, 0, 0);
 }
 OVERLOADABLE ushort2 intel_sub_group_block_read_us2(image2d_t p, int2 cord)
 {
-  return __gen_ocl_sub_group_block_read_us_image2(p, cord.x, cord.y);
+  return __gen_ocl_sub_group_block_read_us_image2(p, cord.x, cord.y, 0, 0);
 }
 OVERLOADABLE ushort4 intel_sub_group_block_read_us4(image2d_t p, int2 cord)
 {
-  return __gen_ocl_sub_group_block_read_us_image4(p, cord.x, cord.y);
+  return __gen_ocl_sub_group_block_read_us_image4(p, cord.x, cord.y, 0, 0);
 }
 OVERLOADABLE ushort8 intel_sub_group_block_read_us8(image2d_t p, int2 cord)
 {
-  return __gen_ocl_sub_group_block_read_us_image8(p, cord.x, cord.y);
+  return __gen_ocl_sub_group_block_read_us_image8(p, cord.x, cord.y, 0, 0);
+}
+
+OVERLOADABLE ushort intel_sub_group_media_block_read_us(int2 src_byte_offset, int width, int height, read_only image2d_t image)
+{
+  return __gen_ocl_sub_group_block_read_us_image(image, src_byte_offset.x, src_byte_offset.y, width, height);
+}
+
+OVERLOADABLE ushort2 intel_sub_group_media_block_read_us2(int2 src_byte_offset, int width, int height, read_only image2d_t image)
+{
+  return __gen_ocl_sub_group_block_read_us_image2(image, src_byte_offset.x, src_byte_offset.y, width, height);
+}
+
+OVERLOADABLE ushort4 intel_sub_group_media_block_read_us4(int2 src_byte_offset, int width, int height, read_only image2d_t image)
+{
+  return __gen_ocl_sub_group_block_read_us_image4(image, src_byte_offset.x, src_byte_offset.y, width, height);
+}
+
+OVERLOADABLE ushort8 intel_sub_group_media_block_read_us8(int2 src_byte_offset, int width, int height, read_only image2d_t image)
+{
+  return __gen_ocl_sub_group_block_read_us_image8(image, src_byte_offset.x, src_byte_offset.y, width, height);
+}
+
+OVERLOADABLE ushort16 intel_sub_group_media_block_read_us16(int2 src_byte_offset, int width, int height, read_only image2d_t image)
+{
+  return __gen_ocl_sub_group_block_read_us_image16(image, src_byte_offset.x, src_byte_offset.y, width, height);
 }
 
 void __gen_ocl_sub_group_block_write_us_image(image2d_t p, int x, int y, ushort data);
@@ -419,6 +465,37 @@ OVERLOADABLE void intel_sub_group_block_write_us8(image2d_t p, int2 cord, ushort
 {
   __gen_ocl_sub_group_block_write_us_image8(p, cord.x, cord.y, data);
 }
+
+PURE CONST uchar __gen_ocl_sub_group_block_read_uc_image(image2d_t p, int x, int y, int w, int h); // declarations only: no bodies appear in this section
+PURE CONST uchar2 __gen_ocl_sub_group_block_read_uc_image2(image2d_t p, int x, int y, int w, int h);
+PURE CONST uchar4 __gen_ocl_sub_group_block_read_uc_image4(image2d_t p, int x, int y, int w, int h);
+PURE CONST uchar8 __gen_ocl_sub_group_block_read_uc_image8(image2d_t p, int x, int y, int w, int h);
+PURE CONST uchar16 __gen_ocl_sub_group_block_read_uc_image16(image2d_t p, int x, int y, int w, int h);
+OVERLOADABLE uchar intel_sub_group_media_block_read_uc(int2 src_byte_offset, int width, int height, read_only image2d_t image) // each wrapper below only reorders its arguments for the matching built-in
+{
+  return __gen_ocl_sub_group_block_read_uc_image(image, src_byte_offset.x, src_byte_offset.y, width, height); // offset.x/.y, width and height are forwarded unchanged
+}
+
+OVERLOADABLE uchar2 intel_sub_group_media_block_read_uc2(int2 src_byte_offset, int width, int height, read_only image2d_t image)
+{
+  return __gen_ocl_sub_group_block_read_uc_image2(image, src_byte_offset.x, src_byte_offset.y, width, height);
+}
+
+OVERLOADABLE uchar4 intel_sub_group_media_block_read_uc4(int2 src_byte_offset, int width, int height, read_only image2d_t image)
+{
+  return __gen_ocl_sub_group_block_read_uc_image4(image, src_byte_offset.x, src_byte_offset.y, width, height);
+}
+
+OVERLOADABLE uchar8 intel_sub_group_media_block_read_uc8(int2 src_byte_offset, int width, int height, read_only image2d_t image)
+{
+  return __gen_ocl_sub_group_block_read_uc_image8(image, src_byte_offset.x, src_byte_offset.y, width, height);
+}
+
+OVERLOADABLE uchar16 intel_sub_group_media_block_read_uc16(int2 src_byte_offset, int width, int height, read_only image2d_t image)
+{
+  return __gen_ocl_sub_group_block_read_uc_image16(image, src_byte_offset.x, src_byte_offset.y, width, height);
+}
+
 #define SHUFFLE_DOWN(TYPE) \
 OVERLOADABLE TYPE intel_sub_group_shuffle_down(TYPE x, TYPE y, uint c) { \
   TYPE res0, res1; \
diff --git a/backend/src/libocl/tmpl/ocl_simd.tmpl.h b/backend/src/libocl/tmpl/ocl_simd.tmpl.h
index 608551b..2592d10 100644
--- a/backend/src/libocl/tmpl/ocl_simd.tmpl.h
+++ b/backend/src/libocl/tmpl/ocl_simd.tmpl.h
@@ -250,3 +250,20 @@ OVERLOADABLE void intel_sub_group_block_write_us(image2d_t image, int2 byte_coor
 OVERLOADABLE void intel_sub_group_block_write_us2(image2d_t image, int2 byte_coord, ushort2 data);
 OVERLOADABLE void intel_sub_group_block_write_us4(image2d_t image, int2 byte_coord, ushort4 data);
 OVERLOADABLE void intel_sub_group_block_write_us8(image2d_t image, int2 byte_coord, ushort8 data);
+
+OVERLOADABLE uchar intel_sub_group_media_block_read_uc(int2 src_byte_offset, int width, int height, read_only image2d_t image);
+OVERLOADABLE uchar2 intel_sub_group_media_block_read_uc2(int2 src_byte_offset, int width, int height, read_only image2d_t image);
+OVERLOADABLE uchar4 intel_sub_group_media_block_read_uc4(int2 src_byte_offset, int width, int height, read_only image2d_t image);
+OVERLOADABLE uchar8 intel_sub_group_media_block_read_uc8(int2 src_byte_offset, int width, int height, read_only image2d_t image);
+OVERLOADABLE uchar16 intel_sub_group_media_block_read_uc16(int2 src_byte_offset, int width, int height, read_only image2d_t image);
+
+OVERLOADABLE ushort intel_sub_group_media_block_read_us(int2 src_byte_offset, int width, int height, read_only image2d_t image);
+OVERLOADABLE ushort2 intel_sub_group_media_block_read_us2(int2 src_byte_offset, int width, int height, read_only image2d_t image);
+OVERLOADABLE ushort4 intel_sub_group_media_block_read_us4(int2 src_byte_offset, int width, int height, read_only image2d_t image);
+OVERLOADABLE ushort8 intel_sub_group_media_block_read_us8(int2 src_byte_offset, int width, int height, read_only image2d_t image);
+OVERLOADABLE ushort16 intel_sub_group_media_block_read_us16(int2 src_byte_offset, int width, int height, read_only image2d_t image);
+
+OVERLOADABLE uint intel_sub_group_media_block_read_ui(int2 src_byte_offset, int width, int height, read_only image2d_t image);
+OVERLOADABLE uint2 intel_sub_group_media_block_read_ui2(int2 src_byte_offset, int width, int height, read_only image2d_t image);
+OVERLOADABLE uint4 intel_sub_group_media_block_read_ui4(int2 src_byte_offset, int width, int height, read_only image2d_t image);
+OVERLOADABLE uint8 intel_sub_group_media_block_read_ui8(int2 src_byte_offset, int width, int height, read_only image2d_t image);
diff --git a/backend/src/llvm/llvm_gen_backend.cpp b/backend/src/llvm/llvm_gen_backend.cpp
index 3fefa92..faa9c37 100644
--- a/backend/src/llvm/llvm_gen_backend.cpp
+++ b/backend/src/llvm/llvm_gen_backend.cpp
@@ -737,6 +737,7 @@ namespace gbe
     // Emit subgroup instructions
     void emitBlockReadWriteMemInst(CallInst &I, CallSite &CS, bool isWrite, uint8_t vec_size, ir::Type = ir::TYPE_U32);
     void emitBlockReadWriteImageInst(CallInst &I, CallSite &CS, bool isWrite, uint8_t vec_size, ir::Type = ir::TYPE_U32);
+    void checkMediaBlockWidthandHeight(CallInst &I, uint8_t width, uint8_t height, uint8_t vec_size, ir::Type type);
 
     uint8_t appendSampler(CallSite::arg_iterator AI);
     uint8_t getImageID(CallInst &I);
@@ -4059,6 +4060,12 @@ namespace gbe
       case GEN_OCL_SUB_GROUP_BLOCK_READ_US_IMAGE2:
       case GEN_OCL_SUB_GROUP_BLOCK_READ_US_IMAGE4:
       case GEN_OCL_SUB_GROUP_BLOCK_READ_US_IMAGE8:
+      case GEN_OCL_SUB_GROUP_BLOCK_READ_US_IMAGE16:
+      case GEN_OCL_SUB_GROUP_BLOCK_READ_UC_IMAGE:
+      case GEN_OCL_SUB_GROUP_BLOCK_READ_UC_IMAGE2:
+      case GEN_OCL_SUB_GROUP_BLOCK_READ_UC_IMAGE4:
+      case GEN_OCL_SUB_GROUP_BLOCK_READ_UC_IMAGE8:
+      case GEN_OCL_SUB_GROUP_BLOCK_READ_UC_IMAGE16:
       case GEN_OCL_ENQUEUE_SET_NDRANGE_INFO:
       case GEN_OCL_ENQUEUE_GET_NDRANGE_INFO:
         this->newRegister(&I);
@@ -4463,6 +4470,61 @@ namespace gbe
     GBE_ASSERT(AI == AE);
   }
 
+  /*! Validate the constant width/height of a media block read. On failure an LLVM
+   *  error is emitted on \p I, has_errors is set and the unit is marked invalid.
+   *  width is counted in elements of \p type; vec_size is not consulted here. */
+  void GenWriter::checkMediaBlockWidthandHeight(CallInst& I, uint8_t width, uint8_t height, uint8_t vec_size, ir::Type type) {
+    if (width == 0) {
+      has_errors = true;
+      Func->getContext().emitError(&I, llvm::Twine("Media Block width value illegal, width is:") + llvm::Twine(uint32_t(width)));
+      ctx.getUnit().setValid(false);
+      return;
+    }
+    if (height == 0) {
+      has_errors = true;
+      Func->getContext().emitError(&I, llvm::Twine("Media Block height value illegal, height is:") + llvm::Twine(uint32_t(height)));
+      ctx.getUnit().setValid(false);
+      return;
+    }
+    // Element size in bytes: 1 for U8, 2 for U16, 4 for every other type.
+    const uint32_t typeSize = type == ir::TYPE_U8 ? 1 : (type == ir::TYPE_U16 ? 2 : 4);
+    const uint32_t widthBytes = width * typeSize;
+
+    // The wider a row is, the fewer rows are accepted.
+    uint32_t maxRows;
+    if (widthBytes <= 4)
+      maxRows = 64;
+    else if (widthBytes <= 8)
+      maxRows = 32;
+    else if (widthBytes <= 16)
+      maxRows = 16;
+    else
+      maxRows = 8;
+
+    // A row must be a whole number of 4-byte units.
+    if (widthBytes % 4 != 0) {
+      has_errors = true;
+      Func->getContext().emitError(&I, llvm::Twine("Media Block widthBytes value illegal, widthBytes is:") + llvm::Twine(widthBytes));
+      ctx.getUnit().setValid(false);
+      return;
+    }
+
+    // Upper bound on the row size: 64 bytes for 4-byte elements, 32 bytes otherwise.
+    if ((typeSize == 4 && widthBytes > 64) || (typeSize != 4 && widthBytes > 32)) {
+      has_errors = true;
+      Func->getContext().emitError(&I, llvm::Twine("Media Block widthBytes value illegal, widthBytes is:") + llvm::Twine(widthBytes));
+      ctx.getUnit().setValid(false);
+      return;
+    }
+
+    if (height > maxRows) {
+      has_errors = true;
+      Func->getContext().emitError(&I, llvm::Twine("Media Block height value illegal, height is larger than: ") + llvm::Twine(maxRows));
+      ctx.getUnit().setValid(false);
+      return;
+    }
+  }
+
   void GenWriter::emitBlockReadWriteImageInst(CallInst &I, CallSite &CS, bool isWrite, uint8_t vec_size, ir::Type type) {
     CallSite::arg_iterator AI = CS.arg_begin();
     CallSite::arg_iterator AE = CS.arg_end();
@@ -4489,7 +4551,20 @@ namespace gbe
         dstTupleData.push_back(getRegister(&I, i));
       const ir::Tuple srctuple = ctx.arrayTuple(src, 2);
       const ir::Tuple dsttuple = ctx.arrayTuple(&dstTupleData[0], vec_size);
-      ctx.MBREAD(imageID, dsttuple, vec_size, srctuple, 2, type);
+      Constant *CWidth = dyn_cast<Constant>(*AI++);
+      GBE_ASSERT(CWidth != NULL);
+      const ir::Immediate &width = processConstantImm(CWidth);
+      Constant *CHeight = dyn_cast<Constant>(*AI++);
+      GBE_ASSERT(CHeight != NULL);
+      const ir::Immediate &height = processConstantImm(CHeight);
+      // check width and height legality.
+      if (width.getIntegerValue() != 0 || height.getIntegerValue() != 0) {
+        checkMediaBlockWidthandHeight(I, width.getIntegerValue(), height.getIntegerValue(), vec_size, type);
+        if(!ctx.getUnit().getValid())
+          return;
+      }
+      //map w * h region to simd_size
+      ctx.MBREAD(imageID, dsttuple, vec_size, srctuple, 2, type, width.getIntegerValue(), height.getIntegerValue());
     }
 
     GBE_ASSERT(AI == AE);
@@ -5473,6 +5548,18 @@ namespace gbe
             this->emitBlockReadWriteImageInst(I, CS, false, 4, ir::TYPE_U16); break;
           case GEN_OCL_SUB_GROUP_BLOCK_READ_US_IMAGE8:
             this->emitBlockReadWriteImageInst(I, CS, false, 8, ir::TYPE_U16); break;
+          case GEN_OCL_SUB_GROUP_BLOCK_READ_US_IMAGE16:
+            this->emitBlockReadWriteImageInst(I, CS, false, 16, ir::TYPE_U16); break;
+          case GEN_OCL_SUB_GROUP_BLOCK_READ_UC_IMAGE:
+            this->emitBlockReadWriteImageInst(I, CS, false, 1, ir::TYPE_U8); break;
+          case GEN_OCL_SUB_GROUP_BLOCK_READ_UC_IMAGE2:
+            this->emitBlockReadWriteImageInst(I, CS, false, 2, ir::TYPE_U8); break;
+          case GEN_OCL_SUB_GROUP_BLOCK_READ_UC_IMAGE4:
+            this->emitBlockReadWriteImageInst(I, CS, false, 4, ir::TYPE_U8); break;
+          case GEN_OCL_SUB_GROUP_BLOCK_READ_UC_IMAGE8:
+            this->emitBlockReadWriteImageInst(I, CS, false, 8, ir::TYPE_U8); break;
+          case GEN_OCL_SUB_GROUP_BLOCK_READ_UC_IMAGE16:
+            this->emitBlockReadWriteImageInst(I, CS, false, 16, ir::TYPE_U8); break;
           case GEN_OCL_SUB_GROUP_BLOCK_WRITE_US_IMAGE:
             this->emitBlockReadWriteImageInst(I, CS, true, 1, ir::TYPE_U16); break;
           case GEN_OCL_SUB_GROUP_BLOCK_WRITE_US_IMAGE2:
diff --git a/backend/src/llvm/llvm_gen_ocl_function.hxx b/backend/src/llvm/llvm_gen_ocl_function.hxx
index 86485da..0243f05 100644
--- a/backend/src/llvm/llvm_gen_ocl_function.hxx
+++ b/backend/src/llvm/llvm_gen_ocl_function.hxx
@@ -250,10 +250,16 @@ DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_US_IMAGE, __gen_ocl_sub_group_block_
 DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_US_IMAGE2, __gen_ocl_sub_group_block_read_us_image2)
 DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_US_IMAGE4, __gen_ocl_sub_group_block_read_us_image4)
 DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_US_IMAGE8, __gen_ocl_sub_group_block_read_us_image8)
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_US_IMAGE16, __gen_ocl_sub_group_block_read_us_image16)
 DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_US_IMAGE, __gen_ocl_sub_group_block_write_us_image)
 DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_US_IMAGE2, __gen_ocl_sub_group_block_write_us_image2)
 DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_US_IMAGE4, __gen_ocl_sub_group_block_write_us_image4)
 DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_US_IMAGE8, __gen_ocl_sub_group_block_write_us_image8)
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_UC_IMAGE, __gen_ocl_sub_group_block_read_uc_image)
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_UC_IMAGE2, __gen_ocl_sub_group_block_read_uc_image2)
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_UC_IMAGE4, __gen_ocl_sub_group_block_read_uc_image4)
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_UC_IMAGE8, __gen_ocl_sub_group_block_read_uc_image8)
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_UC_IMAGE16, __gen_ocl_sub_group_block_read_uc_image16)
 // common function
 DECL_LLVM_GEN_FUNCTION(LRP, __gen_ocl_lrp)
 
diff --git a/backend/src/llvm/llvm_scalarize.cpp b/backend/src/llvm/llvm_scalarize.cpp
index 8850abb..c413ab4 100644
--- a/backend/src/llvm/llvm_scalarize.cpp
+++ b/backend/src/llvm/llvm_scalarize.cpp
@@ -723,6 +723,11 @@ namespace gbe {
           case GEN_OCL_SUB_GROUP_BLOCK_READ_US_IMAGE2:
           case GEN_OCL_SUB_GROUP_BLOCK_READ_US_IMAGE4:
           case GEN_OCL_SUB_GROUP_BLOCK_READ_US_IMAGE8:
+          case GEN_OCL_SUB_GROUP_BLOCK_READ_US_IMAGE16:
+          case GEN_OCL_SUB_GROUP_BLOCK_READ_UC_IMAGE2:
+          case GEN_OCL_SUB_GROUP_BLOCK_READ_UC_IMAGE4:
+          case GEN_OCL_SUB_GROUP_BLOCK_READ_UC_IMAGE8:
+          case GEN_OCL_SUB_GROUP_BLOCK_READ_UC_IMAGE16:
             setAppendPoint(call);
             extractFromVector(call);
             break;
diff --git a/src/cl_extensions.h b/src/cl_extensions.h
index 55747a7..af0855e 100644
--- a/src/cl_extensions.h
+++ b/src/cl_extensions.h
@@ -30,6 +30,7 @@
   DECL_EXT(intel_motion_estimation) \
   DECL_EXT(intel_subgroups) \
   DECL_EXT(intel_subgroups_short) \
+  DECL_EXT(intel_media_block_io) \
   DECL_EXT(intel_planar_yuv)
 
 #define DECL_GL_EXTENSIONS \
diff --git a/utests/utest_helper.cpp b/utests/utest_helper.cpp
index 7052a14..f4487c1 100644
--- a/utests/utest_helper.cpp
+++ b/utests/utest_helper.cpp
@@ -939,6 +939,26 @@ int cl_check_subgroups_short(void)
   return 1;
 }
 
+int cl_check_media_block_io(void) // returns 1 when the device advertises cl_intel_media_block_io, 0 otherwise
+{
+  if (!cl_check_subgroups()) // bail out early when the subgroup check fails
+    return 0;
+  std::string extStr;
+  size_t param_value_size;
+  OCL_CALL(clGetDeviceInfo, device, CL_DEVICE_EXTENSIONS, 0, 0, &param_value_size); // first call: only query the size of the extension string
+  std::vector<char> param_value(param_value_size);
+  OCL_CALL(clGetDeviceInfo, device, CL_DEVICE_EXTENSIONS, param_value_size,
+           param_value.empty() ? NULL : &param_value.front(), &param_value_size); // second call: fetch the string into the buffer
+  if (!param_value.empty())
+    extStr = std::string(&param_value.front(), param_value_size-1); // size - 1: presumably drops a trailing NUL — TODO confirm
+
+  if (std::strstr(extStr.c_str(), "cl_intel_media_block_io") == NULL) { // plain substring search over the extension list
+    printf("No cl_intel_media_block_io, Skip!");
+    return 0;
+  }
+  return 1;
+}
+
 int cl_check_ocl20(bool or_beignet)
 {
   size_t param_value_size;
diff --git a/utests/utest_helper.hpp b/utests/utest_helper.hpp
index e2a6a88..5dc381e 100644
--- a/utests/utest_helper.hpp
+++ b/utests/utest_helper.hpp
@@ -303,6 +303,9 @@ extern int cl_check_beignet(void);
 /* Check is intel subgroups enabled. */
 extern int cl_check_subgroups(void);
 
+/* Check whether the intel_media_block_io extension is enabled. */
+extern int cl_check_media_block_io(void);
+
 typedef cl_int(clGetKernelSubGroupInfoKHR_cb)(cl_kernel, cl_device_id,
                                               cl_kernel_sub_group_info, size_t,
                                               const void *, size_t, void *,
-- 
2.5.0



More information about the Beignet mailing list