[Beignet] [PATCH v2 4/5] implement extension cl_intel_media_block_io WRITE related function

xionghu.luo at intel.com xionghu.luo at intel.com
Wed Mar 8 15:00:40 UTC 2017


From: Luo Xionghu <xionghu.luo at intel.com>

v2: use static fixBlockSize; no need to set default width/height at the IR
level.

Signed-off-by: Luo Xionghu <xionghu.luo at intel.com>
---
 backend/src/backend/gen_insn_selection.cpp |  27 +++++--
 backend/src/ir/instruction.cpp             |  14 +++-
 backend/src/ir/instruction.hpp             |   4 +-
 backend/src/libocl/tmpl/ocl_simd.tmpl.cl   | 116 ++++++++++++++++++++++++-----
 backend/src/libocl/tmpl/ocl_simd.tmpl.h    |  17 +++++
 backend/src/llvm/llvm_gen_backend.cpp      |  34 ++++++++-
 backend/src/llvm/llvm_gen_ocl_function.hxx |   7 ++
 backend/src/llvm/llvm_scalarize.cpp        |   6 ++
 8 files changed, 194 insertions(+), 31 deletions(-)

diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp
index 0ee8d13..9178448 100644
--- a/backend/src/backend/gen_insn_selection.cpp
+++ b/backend/src/backend/gen_insn_selection.cpp
@@ -7940,12 +7940,20 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp
       uint32_t simdWidth = sel.curr.execWidth;
       const uint32_t genType = type == TYPE_U32 ? GEN_TYPE_UD : GEN_TYPE_UW;
       const RegisterFamily family = getFamily(type);
-      const uint32_t typeSize = type == TYPE_U32 ? 4 : 2;
+      uint32_t typeSize = 0;
+      if(type == TYPE_U32) {
+        typeSize = 4;
+      }else if(type == TYPE_U16) {
+        typeSize = 2;
+      }else if(type == TYPE_U8) {
+        typeSize = 1;
+      }else
+        NOT_IMPLEMENTED;
       // ushort in simd8 will have half reg, but data lenght is still 1
       uint32_t data_size = simdWidth * vec_size * typeSize / 32;
       data_size = data_size? data_size : 1;
-      uint32_t block_width = typeSize * simdWidth;
-      uint32_t blocksize = (block_width - 1) % 32 | (vec_size - 1) << 16;
+      uint32_t block_width = 0;
+      uint32_t blocksize = fixBlockSize(insn.getWidth(), insn.getHeight(), vec_size, typeSize, simdWidth, block_width);
 
 
       vector<GenRegister> valuesVec;
@@ -7980,7 +7988,7 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp
         sel.MOV(blocksizereg, GenRegister::immud(blocksize));
       sel.pop();
 
-      if (simdWidth * typeSize < 64) {
+      if (block_width < 64) {
         for (uint32_t i = 0; i < vec_size; ++i) {
             sel.MOV(tmpVec[i], valuesVec[i]);
         }
@@ -7989,9 +7997,16 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp
           sel.curr.predicate = GEN_PREDICATE_NONE;
           sel.curr.noMask = 1;
           // Now write the data
-          sel.MBWRITE(header, &tmpVec[0], vec_size, insn.getImageIndex(), data_size);
+          if(typeSize == 1) {
+            for (uint32_t i = 0; i < vec_size; i++) {
+                sel.MOV(sel.getOffsetReg(GenRegister::retype(tmpVec[0], GEN_TYPE_UB), 0, i*simdWidth), valuesVec[i]);
+                sel.MOV(sel.getOffsetReg(GenRegister::retype(tmpVec[0], GEN_TYPE_UB), 0, i*simdWidth + 8), sel.getOffsetReg(valuesVec[i], 0, 16) );
+            }
+            sel.MBWRITE(header, &tmpVec[0], vec_size, insn.getImageIndex(), data_size);
+          } else
+            sel.MBWRITE(header, &tmpVec[0], vec_size, insn.getImageIndex(), data_size);
         sel.pop();
-      } else if (simdWidth * typeSize == 64) {
+      } else if (block_width == 64) {
         sel.push();
           sel.curr.execWidth = 8;
           sel.curr.predicate = GEN_PREDICATE_NONE;
diff --git a/backend/src/ir/instruction.cpp b/backend/src/ir/instruction.cpp
index 72d914a..a9156ff 100644
--- a/backend/src/ir/instruction.cpp
+++ b/backend/src/ir/instruction.cpp
@@ -1117,13 +1117,15 @@ namespace ir {
     {
     public:
 
-      INLINE MediaBlockWriteInstruction(uint8_t imageIdx, Tuple srcTuple, uint8_t srcNum, uint8_t vec_size, Type type) {
+      INLINE MediaBlockWriteInstruction(uint8_t imageIdx, Tuple srcTuple, uint8_t srcNum, uint8_t vec_size, Type type, uint8_t width, uint8_t height) {
         this->opcode = OP_MBWRITE;
         this->src = srcTuple;
         this->srcNum = srcNum;
         this->imageIdx = imageIdx;
         this->vec_size = vec_size;
         this->type = type;
+        this->width = width;
+        this->height = height;
       }
       INLINE bool wellFormed(const Function &fn, std::string &why) const;
       INLINE void out(std::ostream &out, const Function &fn) const {
@@ -1141,6 +1143,8 @@ namespace ir {
       INLINE uint8_t getImageIndex(void) const { return this->imageIdx; }
       INLINE uint8_t getVectorSize(void) const { return this->vec_size; }
       INLINE Type getType(void) const { return this->type; }
+      INLINE uint8_t getWidth(void) const { return this->width; }
+      INLINE uint8_t getHeight(void) const { return this->height; }
 
       Tuple src;
       Register dst[0];
@@ -1148,6 +1152,8 @@ namespace ir {
       uint8_t srcNum;
       uint8_t vec_size;
       Type type;
+      uint8_t width;
+      uint8_t height;
     };
 
 #undef ALIGNED_INSTRUCTION
@@ -2420,6 +2426,8 @@ DECL_MEM_FN(MediaBlockReadInstruction, uint8_t, getHeight(void), getHeight())
 DECL_MEM_FN(MediaBlockWriteInstruction, uint8_t, getImageIndex(void), getImageIndex())
 DECL_MEM_FN(MediaBlockWriteInstruction, uint8_t, getVectorSize(void), getVectorSize())
 DECL_MEM_FN(MediaBlockWriteInstruction, Type, getType(void), getType())
+DECL_MEM_FN(MediaBlockWriteInstruction, uint8_t, getWidth(void), getWidth())
+DECL_MEM_FN(MediaBlockWriteInstruction, uint8_t, getHeight(void), getHeight())
 
 #undef DECL_MEM_FN
 
@@ -2732,8 +2740,8 @@ DECL_MEM_FN(MemInstruction, void,     setBtiReg(Register reg), setBtiReg(reg))
     return internal::MediaBlockReadInstruction(imageIndex, dst, vec_size, coord, srcNum, type, width, height).convert();
   }
 
-  Instruction MBWRITE(uint8_t imageIndex, Tuple srcTuple, uint8_t srcNum, uint8_t vec_size, Type type) {
-    return internal::MediaBlockWriteInstruction(imageIndex, srcTuple, srcNum, vec_size, type).convert();
+  Instruction MBWRITE(uint8_t imageIndex, Tuple srcTuple, uint8_t srcNum, uint8_t vec_size, Type type, uint8_t width, uint8_t height) {
+    return internal::MediaBlockWriteInstruction(imageIndex, srcTuple, srcNum, vec_size, type, width, height).convert();
   }
 
 
diff --git a/backend/src/ir/instruction.hpp b/backend/src/ir/instruction.hpp
index 7e90576..8685dd4 100644
--- a/backend/src/ir/instruction.hpp
+++ b/backend/src/ir/instruction.hpp
@@ -658,6 +658,8 @@ namespace ir {
     uint8_t getImageIndex() const;
     uint8_t getVectorSize() const;
     Type getType(void) const;
+    uint8_t getWidth() const;
+    uint8_t getHeight() const;
   };
 
   /*! Specialize the instruction. Also performs typechecking first based on the
@@ -897,7 +899,7 @@ namespace ir {
   /*! media block read */
   Instruction MBREAD(uint8_t imageIndex, Tuple dst, uint8_t vec_size, Tuple coord, uint8_t srcNum, Type type, uint8_t width, uint8_t height);
   /*! media block write */
-  Instruction MBWRITE(uint8_t imageIndex, Tuple srcTuple, uint8_t srcNum, uint8_t vec_size, Type type);
+  Instruction MBWRITE(uint8_t imageIndex, Tuple srcTuple, uint8_t srcNum, uint8_t vec_size, Type type, uint8_t width, uint8_t height);
 } /* namespace ir */
 } /* namespace gbe */
 
diff --git a/backend/src/libocl/tmpl/ocl_simd.tmpl.cl b/backend/src/libocl/tmpl/ocl_simd.tmpl.cl
index 55bf6f0..002378a 100644
--- a/backend/src/libocl/tmpl/ocl_simd.tmpl.cl
+++ b/backend/src/libocl/tmpl/ocl_simd.tmpl.cl
@@ -319,41 +319,61 @@ OVERLOADABLE uint8 intel_sub_group_media_block_read_ui8(int2 src_byte_offset, in
   return __gen_ocl_sub_group_block_read_ui_image8(image, src_byte_offset.x, src_byte_offset.y, width, height);
 }
 
-void __gen_ocl_sub_group_block_write_ui_image(image2d_t p, int x, int y, uint data);
-void __gen_ocl_sub_group_block_write_ui_image2(image2d_t p, int x, int y, uint2 data);
-void __gen_ocl_sub_group_block_write_ui_image4(image2d_t p, int x, int y, uint4 data);
-void __gen_ocl_sub_group_block_write_ui_image8(image2d_t p, int x, int y, uint8 data);
+void __gen_ocl_sub_group_block_write_ui_image(image2d_t p, int x, int y, int w, int h, uint data);
+void __gen_ocl_sub_group_block_write_ui_image2(image2d_t p, int x, int y, int w, int h, uint2 data);
+void __gen_ocl_sub_group_block_write_ui_image4(image2d_t p, int x, int y, int w, int h, uint4 data);
+void __gen_ocl_sub_group_block_write_ui_image8(image2d_t p, int x, int y, int w, int h, uint8 data);
 OVERLOADABLE void intel_sub_group_block_write(image2d_t p, int2 cord, uint data)
 {
-  __gen_ocl_sub_group_block_write_ui_image(p, cord.x, cord.y, data);
+  __gen_ocl_sub_group_block_write_ui_image(p, cord.x, cord.y, 0, 0, data);
 }
 OVERLOADABLE void intel_sub_group_block_write2(image2d_t p, int2 cord, uint2 data)
 {
-  __gen_ocl_sub_group_block_write_ui_image2(p, cord.x, cord.y, data);
+  __gen_ocl_sub_group_block_write_ui_image2(p, cord.x, cord.y, 0, 0, data);
 }
 OVERLOADABLE void intel_sub_group_block_write4(image2d_t p, int2 cord, uint4 data)
 {
-  __gen_ocl_sub_group_block_write_ui_image4(p, cord.x, cord.y, data);
+  __gen_ocl_sub_group_block_write_ui_image4(p, cord.x, cord.y, 0, 0, data);
 }
 OVERLOADABLE void intel_sub_group_block_write8(image2d_t p, int2 cord, uint8 data)
 {
-  __gen_ocl_sub_group_block_write_ui_image8(p, cord.x, cord.y, data);
+  __gen_ocl_sub_group_block_write_ui_image8(p, cord.x, cord.y, 0, 0, data);
 }
 OVERLOADABLE void intel_sub_group_block_write_ui(image2d_t p, int2 cord, uint data)
 {
-  __gen_ocl_sub_group_block_write_ui_image(p, cord.x, cord.y, data);
+  __gen_ocl_sub_group_block_write_ui_image(p, cord.x, cord.y, 0, 0, data);
 }
 OVERLOADABLE void intel_sub_group_block_write_ui2(image2d_t p, int2 cord, uint2 data)
 {
-  __gen_ocl_sub_group_block_write_ui_image2(p, cord.x, cord.y, data);
+  __gen_ocl_sub_group_block_write_ui_image2(p, cord.x, cord.y, 0, 0, data);
 }
 OVERLOADABLE void intel_sub_group_block_write_ui4(image2d_t p, int2 cord, uint4 data)
 {
-  __gen_ocl_sub_group_block_write_ui_image4(p, cord.x, cord.y, data);
+  __gen_ocl_sub_group_block_write_ui_image4(p, cord.x, cord.y, 0, 0, data);
 }
 OVERLOADABLE void intel_sub_group_block_write_ui8(image2d_t p, int2 cord, uint8 data)
 {
-  __gen_ocl_sub_group_block_write_ui_image8(p, cord.x, cord.y, data);
+  __gen_ocl_sub_group_block_write_ui_image8(p, cord.x, cord.y, 0, 0, data);
+}
+
+OVERLOADABLE void intel_sub_group_media_block_write_ui(int2 src_byte_offset, int width, int height, uint texels, image2d_t image)
+{
+  __gen_ocl_sub_group_block_write_ui_image(image, src_byte_offset.x, src_byte_offset.y, width, height, texels);
+}
+
+OVERLOADABLE void intel_sub_group_media_block_write_ui2(int2 src_byte_offset, int width, int height, uint2 texels, image2d_t image)
+{
+  __gen_ocl_sub_group_block_write_ui_image2(image, src_byte_offset.x, src_byte_offset.y, width, height, texels);
+}
+
+OVERLOADABLE void intel_sub_group_media_block_write_ui4(int2 src_byte_offset, int width, int height, uint4 texels, image2d_t image)
+{
+  __gen_ocl_sub_group_block_write_ui_image4(image, src_byte_offset.x, src_byte_offset.y, width, height, texels);
+}
+
+OVERLOADABLE void intel_sub_group_media_block_write_ui8(int2 src_byte_offset, int width, int height, uint8 texels, image2d_t image)
+{
+  __gen_ocl_sub_group_block_write_ui_image8(image, src_byte_offset.x, src_byte_offset.y, width, height, texels);
 }
 
 PURE CONST ushort __gen_ocl_sub_group_block_read_us_mem(const global ushort* p);
@@ -445,25 +465,51 @@ OVERLOADABLE ushort16 intel_sub_group_media_block_read_us16(int2 src_byte_offset
   return __gen_ocl_sub_group_block_read_us_image16(image, src_byte_offset.x, src_byte_offset.y, width, height);
 }
 
-void __gen_ocl_sub_group_block_write_us_image(image2d_t p, int x, int y, ushort data);
-void __gen_ocl_sub_group_block_write_us_image2(image2d_t p, int x, int y, ushort2 data);
-void __gen_ocl_sub_group_block_write_us_image4(image2d_t p, int x, int y, ushort4 data);
-void __gen_ocl_sub_group_block_write_us_image8(image2d_t p, int x, int y, ushort8 data);
+void __gen_ocl_sub_group_block_write_us_image(image2d_t p, int x, int y, int w, int h, ushort data);
+void __gen_ocl_sub_group_block_write_us_image2(image2d_t p, int x, int y, int w, int h, ushort2 data);
+void __gen_ocl_sub_group_block_write_us_image4(image2d_t p, int x, int y, int w, int h, ushort4 data);
+void __gen_ocl_sub_group_block_write_us_image8(image2d_t p, int x, int y, int w, int h, ushort8 data);
+void __gen_ocl_sub_group_block_write_us_image16(image2d_t p, int x, int y, int w, int h, ushort16 data);
 OVERLOADABLE void intel_sub_group_block_write_us(image2d_t p, int2 cord, ushort data)
 {
-  __gen_ocl_sub_group_block_write_us_image(p, cord.x, cord.y, data);
+  __gen_ocl_sub_group_block_write_us_image(p, cord.x, cord.y, 0, 0, data);
 }
 OVERLOADABLE void intel_sub_group_block_write_us2(image2d_t p, int2 cord, ushort2 data)
 {
-  __gen_ocl_sub_group_block_write_us_image2(p, cord.x, cord.y, data);
+  __gen_ocl_sub_group_block_write_us_image2(p, cord.x, cord.y, 0, 0, data);
 }
 OVERLOADABLE void intel_sub_group_block_write_us4(image2d_t p, int2 cord, ushort4 data)
 {
-  __gen_ocl_sub_group_block_write_us_image4(p, cord.x, cord.y, data);
+  __gen_ocl_sub_group_block_write_us_image4(p, cord.x, cord.y, 0, 0, data);
 }
 OVERLOADABLE void intel_sub_group_block_write_us8(image2d_t p, int2 cord, ushort8 data)
 {
-  __gen_ocl_sub_group_block_write_us_image8(p, cord.x, cord.y, data);
+  __gen_ocl_sub_group_block_write_us_image8(p, cord.x, cord.y, 0, 0, data);
+}
+
+OVERLOADABLE void intel_sub_group_media_block_write_us(int2 src_byte_offset, int width, int height, ushort texels, image2d_t image)
+{
+  __gen_ocl_sub_group_block_write_us_image(image, src_byte_offset.x, src_byte_offset.y, width, height, texels);
+}
+
+OVERLOADABLE void intel_sub_group_media_block_write_us2(int2 src_byte_offset, int width, int height, ushort2 texels, image2d_t image)
+{
+  __gen_ocl_sub_group_block_write_us_image2(image, src_byte_offset.x, src_byte_offset.y, width, height, texels);
+}
+
+OVERLOADABLE void intel_sub_group_media_block_write_us4(int2 src_byte_offset, int width, int height, ushort4 texels, image2d_t image)
+{
+  __gen_ocl_sub_group_block_write_us_image4(image, src_byte_offset.x, src_byte_offset.y, width, height, texels);
+}
+
+OVERLOADABLE void intel_sub_group_media_block_write_us8(int2 src_byte_offset, int width, int height, ushort8 texels, image2d_t image)
+{
+  __gen_ocl_sub_group_block_write_us_image8(image, src_byte_offset.x, src_byte_offset.y, width, height, texels);
+}
+
+OVERLOADABLE void intel_sub_group_media_block_write_us16(int2 src_byte_offset, int width, int height, ushort16 texels, image2d_t image)
+{
+  __gen_ocl_sub_group_block_write_us_image16(image, src_byte_offset.x, src_byte_offset.y, width, height, texels);
 }
 
 PURE CONST uchar __gen_ocl_sub_group_block_read_uc_image(image2d_t p, int x, int y, int w, int h);
@@ -496,6 +542,36 @@ OVERLOADABLE uchar16 intel_sub_group_media_block_read_uc16(int2 src_byte_offset,
   return __gen_ocl_sub_group_block_read_uc_image16(image, src_byte_offset.x, src_byte_offset.y, width, height);
 }
 
+void __gen_ocl_sub_group_block_write_uc_image(image2d_t p, int x, int y, int w, int h, uchar data);
+void __gen_ocl_sub_group_block_write_uc_image2(image2d_t p, int x, int y, int w, int h, uchar2 data);
+void __gen_ocl_sub_group_block_write_uc_image4(image2d_t p, int x, int y, int w, int h, uchar4 data);
+void __gen_ocl_sub_group_block_write_uc_image8(image2d_t p, int x, int y, int w, int h, uchar8 data);
+void __gen_ocl_sub_group_block_write_uc_image16(image2d_t p, int x, int y, int w, int h, uchar16 data);
+OVERLOADABLE void intel_sub_group_media_block_write_uc(int2 src_byte_offset, int width, int height, uchar texels, image2d_t image)
+{
+  __gen_ocl_sub_group_block_write_uc_image(image, src_byte_offset.x, src_byte_offset.y, width, height, texels);
+}
+
+OVERLOADABLE void intel_sub_group_media_block_write_uc2(int2 src_byte_offset, int width, int height, uchar2 texels, image2d_t image)
+{
+  __gen_ocl_sub_group_block_write_uc_image2(image, src_byte_offset.x, src_byte_offset.y, width, height, texels);
+}
+
+OVERLOADABLE void intel_sub_group_media_block_write_uc4(int2 src_byte_offset, int width, int height, uchar4 texels, image2d_t image)
+{
+  __gen_ocl_sub_group_block_write_uc_image4(image, src_byte_offset.x, src_byte_offset.y, width, height, texels);
+}
+
+OVERLOADABLE void intel_sub_group_media_block_write_uc8(int2 src_byte_offset, int width, int height, uchar8 texels, image2d_t image)
+{
+  __gen_ocl_sub_group_block_write_uc_image8(image, src_byte_offset.x, src_byte_offset.y, width, height, texels);
+}
+
+OVERLOADABLE void intel_sub_group_media_block_write_uc16(int2 src_byte_offset, int width, int height, uchar16 texels, image2d_t image)
+{
+  __gen_ocl_sub_group_block_write_uc_image16(image, src_byte_offset.x, src_byte_offset.y, width, height, texels);
+}
+
 #define SHUFFLE_DOWN(TYPE) \
 OVERLOADABLE TYPE intel_sub_group_shuffle_down(TYPE x, TYPE y, uint c) { \
   TYPE res0, res1; \
diff --git a/backend/src/libocl/tmpl/ocl_simd.tmpl.h b/backend/src/libocl/tmpl/ocl_simd.tmpl.h
index 2592d10..b64bf49 100644
--- a/backend/src/libocl/tmpl/ocl_simd.tmpl.h
+++ b/backend/src/libocl/tmpl/ocl_simd.tmpl.h
@@ -231,6 +231,11 @@ OVERLOADABLE void intel_sub_group_block_write_ui2(image2d_t image, int2 byte_coo
 OVERLOADABLE void intel_sub_group_block_write_ui4(image2d_t image, int2 byte_coord, uint4 data);
 OVERLOADABLE void intel_sub_group_block_write_ui8(image2d_t image, int2 byte_coord, uint8 data);
 
+OVERLOADABLE void intel_sub_group_media_block_write_ui(int2 src_byte_offset, int width, int height, uint texels, image2d_t image);
+OVERLOADABLE void intel_sub_group_media_block_write_ui2(int2 src_byte_offset, int width, int height, uint2 texels, image2d_t image);
+OVERLOADABLE void intel_sub_group_media_block_write_ui4(int2 src_byte_offset, int width, int height, uint4 texels, image2d_t image);
+OVERLOADABLE void intel_sub_group_media_block_write_ui8(int2 src_byte_offset, int width, int height, uint8 texels, image2d_t image);
+
 OVERLOADABLE ushort intel_sub_group_block_read_us(const global ushort* p);
 OVERLOADABLE ushort2 intel_sub_group_block_read_us2(const global ushort* p);
 OVERLOADABLE ushort4 intel_sub_group_block_read_us4(const global ushort* p);
@@ -251,6 +256,18 @@ OVERLOADABLE void intel_sub_group_block_write_us2(image2d_t image, int2 byte_coo
 OVERLOADABLE void intel_sub_group_block_write_us4(image2d_t image, int2 byte_coord, ushort4 data);
 OVERLOADABLE void intel_sub_group_block_write_us8(image2d_t image, int2 byte_coord, ushort8 data);
 
+OVERLOADABLE void intel_sub_group_media_block_write_uc(int2 src_byte_offset, int width, int height, uchar texels, image2d_t image);
+OVERLOADABLE void intel_sub_group_media_block_write_uc2(int2 src_byte_offset, int width, int height, uchar2 texels, image2d_t image);
+OVERLOADABLE void intel_sub_group_media_block_write_uc4(int2 src_byte_offset, int width, int height, uchar4 texels, image2d_t image);
+OVERLOADABLE void intel_sub_group_media_block_write_uc8(int2 src_byte_offset, int width, int height, uchar8 texels, image2d_t image);
+OVERLOADABLE void intel_sub_group_media_block_write_uc16(int2 src_byte_offset, int width, int height, uchar16 texels, image2d_t image);
+
+OVERLOADABLE void intel_sub_group_media_block_write_us(int2 src_byte_offset, int width, int height, ushort texels, image2d_t image);
+OVERLOADABLE void intel_sub_group_media_block_write_us2(int2 src_byte_offset, int width, int height, ushort2 texels, image2d_t image);
+OVERLOADABLE void intel_sub_group_media_block_write_us4(int2 src_byte_offset, int width, int height, ushort4 texels, image2d_t image);
+OVERLOADABLE void intel_sub_group_media_block_write_us8(int2 src_byte_offset, int width, int height, ushort8 texels, image2d_t image);
+OVERLOADABLE void intel_sub_group_media_block_write_us16(int2 src_byte_offset, int width, int height, ushort16 texels, image2d_t image);
+
 OVERLOADABLE uchar intel_sub_group_media_block_read_uc(int2 src_byte_offset, int width, int height, read_only image2d_t image);
 OVERLOADABLE uchar2 intel_sub_group_media_block_read_uc2(int2 src_byte_offset, int width, int height, read_only image2d_t image);
 OVERLOADABLE uchar4 intel_sub_group_media_block_read_uc4(int2 src_byte_offset, int width, int height, read_only image2d_t image);
diff --git a/backend/src/llvm/llvm_gen_backend.cpp b/backend/src/llvm/llvm_gen_backend.cpp
index faa9c37..d7dabe3 100644
--- a/backend/src/llvm/llvm_gen_backend.cpp
+++ b/backend/src/llvm/llvm_gen_backend.cpp
@@ -4121,6 +4121,12 @@ namespace gbe
       case GEN_OCL_SUB_GROUP_BLOCK_WRITE_US_IMAGE2:
       case GEN_OCL_SUB_GROUP_BLOCK_WRITE_US_IMAGE4:
       case GEN_OCL_SUB_GROUP_BLOCK_WRITE_US_IMAGE8:
+      case GEN_OCL_SUB_GROUP_BLOCK_WRITE_US_IMAGE16:
+      case GEN_OCL_SUB_GROUP_BLOCK_WRITE_UC_IMAGE:
+      case GEN_OCL_SUB_GROUP_BLOCK_WRITE_UC_IMAGE2:
+      case GEN_OCL_SUB_GROUP_BLOCK_WRITE_UC_IMAGE4:
+      case GEN_OCL_SUB_GROUP_BLOCK_WRITE_UC_IMAGE8:
+      case GEN_OCL_SUB_GROUP_BLOCK_WRITE_UC_IMAGE16:
         break;
       case GEN_OCL_NOT_FOUND:
       default:
@@ -4537,11 +4543,25 @@ namespace gbe
       vector<ir::Register> srcTupleData;
       srcTupleData.push_back(getRegister(*(AI++)));
       srcTupleData.push_back(getRegister(*(AI++)));
+      Constant *CWidth = dyn_cast<Constant>(*AI++);
+      GBE_ASSERT(CWidth != NULL);
+      const ir::Immediate &width = processConstantImm(CWidth);
+      Constant *CHeight = dyn_cast<Constant>(*AI++);
+      GBE_ASSERT(CHeight != NULL);
+      const ir::Immediate &height = processConstantImm(CHeight);
+      const uint8_t iwidth = width.getIntegerValue();
+      const uint8_t iheight = height.getIntegerValue();
+      // check width and height legality.
+      if (iwidth != 0 || iheight!= 0) {
+        checkMediaBlockWidthandHeight(I, iwidth, iheight, vec_size, type);
+        if(!ctx.getUnit().getValid())
+          return;
+      }
       for(int i = 0;i < vec_size; i++)
         srcTupleData.push_back(getRegister(*(AI), i));
       AI++;
       const ir::Tuple srctuple = ctx.arrayTuple(&srcTupleData[0], 2 + vec_size);
-      ctx.MBWRITE(imageID, srctuple, 2 + vec_size, vec_size, type);
+      ctx.MBWRITE(imageID, srctuple, 2 + vec_size, vec_size, type, iwidth, iheight);
     } else {
       ir::Register src[2];
       src[0] = getRegister(*(AI++));
@@ -5568,6 +5588,18 @@ namespace gbe
             this->emitBlockReadWriteImageInst(I, CS, true, 4, ir::TYPE_U16); break;
           case GEN_OCL_SUB_GROUP_BLOCK_WRITE_US_IMAGE8:
             this->emitBlockReadWriteImageInst(I, CS, true, 8, ir::TYPE_U16); break;
+          case GEN_OCL_SUB_GROUP_BLOCK_WRITE_US_IMAGE16:
+            this->emitBlockReadWriteImageInst(I, CS, true, 16, ir::TYPE_U16); break;
+          case GEN_OCL_SUB_GROUP_BLOCK_WRITE_UC_IMAGE:
+            this->emitBlockReadWriteImageInst(I, CS, true, 1, ir::TYPE_U8); break;
+          case GEN_OCL_SUB_GROUP_BLOCK_WRITE_UC_IMAGE2:
+            this->emitBlockReadWriteImageInst(I, CS, true, 2, ir::TYPE_U8); break;
+          case GEN_OCL_SUB_GROUP_BLOCK_WRITE_UC_IMAGE4:
+            this->emitBlockReadWriteImageInst(I, CS, true, 4, ir::TYPE_U8); break;
+          case GEN_OCL_SUB_GROUP_BLOCK_WRITE_UC_IMAGE8:
+            this->emitBlockReadWriteImageInst(I, CS, true, 8, ir::TYPE_U8); break;
+          case GEN_OCL_SUB_GROUP_BLOCK_WRITE_UC_IMAGE16:
+            this->emitBlockReadWriteImageInst(I, CS, true, 16, ir::TYPE_U8); break;
           case GEN_OCL_GET_PIPE:
           case GEN_OCL_MAKE_RID:
           case GEN_OCL_GET_RID:
diff --git a/backend/src/llvm/llvm_gen_ocl_function.hxx b/backend/src/llvm/llvm_gen_ocl_function.hxx
index 0243f05..5682c45 100644
--- a/backend/src/llvm/llvm_gen_ocl_function.hxx
+++ b/backend/src/llvm/llvm_gen_ocl_function.hxx
@@ -255,11 +255,18 @@ DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_US_IMAGE, __gen_ocl_sub_group_block
 DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_US_IMAGE2, __gen_ocl_sub_group_block_write_us_image2)
 DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_US_IMAGE4, __gen_ocl_sub_group_block_write_us_image4)
 DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_US_IMAGE8, __gen_ocl_sub_group_block_write_us_image8)
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_US_IMAGE16, __gen_ocl_sub_group_block_write_us_image16)
 DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_UC_IMAGE, __gen_ocl_sub_group_block_read_uc_image)
 DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_UC_IMAGE2, __gen_ocl_sub_group_block_read_uc_image2)
 DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_UC_IMAGE4, __gen_ocl_sub_group_block_read_uc_image4)
 DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_UC_IMAGE8, __gen_ocl_sub_group_block_read_uc_image8)
 DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_UC_IMAGE16, __gen_ocl_sub_group_block_read_uc_image16)
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_UC_IMAGE, __gen_ocl_sub_group_block_write_uc_image)
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_UC_IMAGE2, __gen_ocl_sub_group_block_write_uc_image2)
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_UC_IMAGE4, __gen_ocl_sub_group_block_write_uc_image4)
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_UC_IMAGE8, __gen_ocl_sub_group_block_write_uc_image8)
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_UC_IMAGE16, __gen_ocl_sub_group_block_write_uc_image16)
+// common function
 // common function
 DECL_LLVM_GEN_FUNCTION(LRP, __gen_ocl_lrp)
 
diff --git a/backend/src/llvm/llvm_scalarize.cpp b/backend/src/llvm/llvm_scalarize.cpp
index c413ab4..6f46c9d 100644
--- a/backend/src/llvm/llvm_scalarize.cpp
+++ b/backend/src/llvm/llvm_scalarize.cpp
@@ -690,6 +690,12 @@ namespace gbe {
           case GEN_OCL_SUB_GROUP_BLOCK_WRITE_US_IMAGE2:
           case GEN_OCL_SUB_GROUP_BLOCK_WRITE_US_IMAGE4:
           case GEN_OCL_SUB_GROUP_BLOCK_WRITE_US_IMAGE8:
+          case GEN_OCL_SUB_GROUP_BLOCK_WRITE_US_IMAGE16:
+          case GEN_OCL_SUB_GROUP_BLOCK_WRITE_UC_IMAGE:
+          case GEN_OCL_SUB_GROUP_BLOCK_WRITE_UC_IMAGE2:
+          case GEN_OCL_SUB_GROUP_BLOCK_WRITE_UC_IMAGE4:
+          case GEN_OCL_SUB_GROUP_BLOCK_WRITE_UC_IMAGE8:
+          case GEN_OCL_SUB_GROUP_BLOCK_WRITE_UC_IMAGE16:
           {
             ++CI;
             ++CI;
-- 
2.5.0



More information about the Beignet mailing list