[Beignet] [PATCH 1/4] GBE: optimize sample instruction.

Zhigang Gong zhigang.gong at intel.com
Tue Feb 18 16:38:12 PST 2014


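The U, V and W coordinate registers can be allocated directly to the
selection vector that forms the sample message payload. This removes the
MOV instructions that previously copied the coordinates into the payload
for the read_image functions.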

Signed-off-by: Zhigang Gong <zhigang.gong at intel.com>
---
 backend/src/backend/gen_context.cpp        |   20 ++++----------------
 backend/src/backend/gen_encoder.cpp        |    4 ++--
 backend/src/backend/gen_encoder.hpp        |    2 +-
 backend/src/backend/gen_insn_selection.cpp |   26 ++++++++++++--------------
 backend/src/backend/gen_insn_selection.hpp |    2 +-
 backend/src/ocl_stdlib.tmpl.h              |   12 ++++++------
 6 files changed, 26 insertions(+), 40 deletions(-)

diff --git a/backend/src/backend/gen_context.cpp b/backend/src/backend/gen_context.cpp
index 7a74856..addf96e 100644
--- a/backend/src/backend/gen_context.cpp
+++ b/backend/src/backend/gen_context.cpp
@@ -1784,23 +1784,11 @@ namespace gbe
     const GenRegister msgPayload = GenRegister::retype(ra->genReg(insn.src(0)), GEN_TYPE_F);
     const unsigned char bti = insn.extra.rdbti;
     const unsigned char sampler = insn.extra.sampler;
-    const GenRegister ucoord = ra->genReg(insn.src(4));
-    const GenRegister vcoord = ra->genReg(insn.src(5));
+    const unsigned int msgLen = insn.extra.rdmsglen;
     uint32_t simdWidth = p->curr.execWidth;
-    uint32_t coord_cnt = 2;
-    p->push();
-    const uint32_t nr = msgPayload.nr;
-    // prepare mesg desc and move to a0.0.
-    // desc = bti | (sampler << 8) | (0 << 12) | (2 << 16) | (0 << 18) | (0 << 19) | (4 << 20) | (1 << 25) | (0 < 29) | (0 << 31)
-    /* Prepare message payload. */
-    p->MOV(GenRegister::f8grf(nr , 0), ucoord);
-    p->MOV(GenRegister::f8grf(nr + (simdWidth/8), 0), vcoord);
-    if (insn.extra.is3DRead) {
-      p->MOV(GenRegister::f8grf(nr + (simdWidth/4), 0), ra->genReg(insn.src(6)));
-      coord_cnt++;
-    }
-    p->SAMPLE(dst, msgPayload, false, bti, sampler, coord_cnt, simdWidth, -1, 0);
-    p->pop();
+    //p->push();
+    p->SAMPLE(dst, msgPayload, msgLen, false, bti, sampler, simdWidth, -1, 0);
+    //p->pop();
   }
 
   void GenContext::scratchWrite(const GenRegister header, uint32_t offset, uint32_t reg_num, uint32_t reg_type, uint32_t channel_mode) {
diff --git a/backend/src/backend/gen_encoder.cpp b/backend/src/backend/gen_encoder.cpp
index aaf7dce..0664d77 100644
--- a/backend/src/backend/gen_encoder.cpp
+++ b/backend/src/backend/gen_encoder.cpp
@@ -1207,10 +1207,10 @@ namespace gbe
 
   void GenEncoder::SAMPLE(GenRegister dest,
                           GenRegister msg,
+                          unsigned int msg_len,
                           bool header_present,
                           unsigned char bti,
                           unsigned char sampler,
-                          unsigned int coord_cnt,
                           uint32_t simdWidth,
                           uint32_t writemask,
                           uint32_t return_format)
@@ -1219,7 +1219,7 @@ namespace gbe
      uint32_t msg_type =  (simdWidth == 16) ?
                             GEN_SAMPLER_MESSAGE_SIMD16_SAMPLE : GEN_SAMPLER_MESSAGE_SIMD8_SAMPLE;
      uint32_t response_length = (4 * (simdWidth / 8));
-     uint32_t msg_length = (coord_cnt * (simdWidth / 8));
+     uint32_t msg_length = (msg_len * (simdWidth / 8));
      if (header_present)
        msg_length++;
      uint32_t simd_mode = (simdWidth == 16) ?
diff --git a/backend/src/backend/gen_encoder.hpp b/backend/src/backend/gen_encoder.hpp
index 13db6ae..094a5c2 100644
--- a/backend/src/backend/gen_encoder.hpp
+++ b/backend/src/backend/gen_encoder.hpp
@@ -167,10 +167,10 @@ namespace gbe
     /*! Send instruction for the sampler */
     void SAMPLE(GenRegister dest,
                 GenRegister msg,
+                unsigned int msg_len,
                 bool header_present,
                 unsigned char bti,
                 unsigned char sampler,
-                unsigned int coord_cnt,
                 unsigned int simdWidth,
                 uint32_t writemask,
                 uint32_t return_format);
diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp
index 75ee906..d76f580 100644
--- a/backend/src/backend/gen_insn_selection.cpp
+++ b/backend/src/backend/gen_insn_selection.cpp
@@ -543,7 +543,7 @@ namespace gbe
     /*! Encode ternary instructions */
     void ALU3(SelectionOpcode opcode, Reg dst, Reg src0, Reg src1, Reg src2);
     /*! Encode sample instructions */
-    void SAMPLE(GenRegister *dst, uint32_t dstNum, GenRegister *src, uint32_t srcNum, GenRegister *msgPayloads, uint32_t msgNum, uint32_t bti, uint32_t sampler, bool is3D);
+    void SAMPLE(GenRegister *dst, uint32_t dstNum, GenRegister *msgPayloads, uint32_t msgNum, uint32_t bti, uint32_t sampler, bool is3D);
     /*! Encode typed write instructions */
     void TYPED_WRITE(GenRegister *src, uint32_t srcNum, GenRegister *msgs, uint32_t msgNum, uint32_t bti, bool is3D);
     /*! Get image information */
@@ -1415,10 +1415,9 @@ namespace gbe
    }
 
   void Selection::Opaque::SAMPLE(GenRegister *dst, uint32_t dstNum,
-                                 GenRegister *src, uint32_t srcNum,
                                  GenRegister *msgPayloads, uint32_t msgNum,
                                  uint32_t bti, uint32_t sampler, bool is3D) {
-    SelectionInstruction *insn = this->appendInsn(SEL_OP_SAMPLE, dstNum, msgNum + srcNum);
+    SelectionInstruction *insn = this->appendInsn(SEL_OP_SAMPLE, dstNum, msgNum);
     SelectionVector *dstVector = this->appendVector();
     SelectionVector *msgVector = this->appendVector();
 
@@ -1427,8 +1426,6 @@ namespace gbe
       insn->dst(elemID) = dst[elemID];
     for (uint32_t elemID = 0; elemID < msgNum; ++elemID)
       insn->src(elemID) = msgPayloads[elemID];
-    for (uint32_t elemID = 0; elemID < srcNum; ++elemID)
-      insn->src(msgNum + elemID) = src[elemID];
 
     // Sends require contiguous allocation
     dstVector->regNum = dstNum;
@@ -1442,7 +1439,7 @@ namespace gbe
 
     insn->extra.rdbti = bti;
     insn->extra.sampler = sampler;
-    insn->extra.is3DRead = is3D;
+    insn->extra.rdmsglen = msgNum;
   }
 
   ///////////////////////////////////////////////////////////////////////////
@@ -3009,23 +3006,24 @@ namespace gbe
     {
       using namespace ir;
       GenRegister msgPayloads[4];
-      GenRegister dst[insn.getDstNum()], src[insn.getSrcNum()];
+      GenRegister dst[insn.getDstNum()];
       uint32_t srcNum = insn.getSrcNum();
+      uint32_t valueID = 0;
 
-      for( int i = 0; i < 4; ++i)
-        msgPayloads[i] = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32);
-
-      for (uint32_t valueID = 0; valueID < insn.getDstNum(); ++valueID)
+      for (valueID = 0; valueID < insn.getDstNum(); ++valueID)
         dst[valueID] = sel.selReg(insn.getDst(valueID), insn.getDstType());
 
-      for (uint32_t valueID = 0; valueID < srcNum; ++valueID)
-        src[valueID] = sel.selReg(insn.getSrc(valueID), insn.getSrcType());
+      if (!insn.is3D())
+        srcNum--;
+      /* U, V, [W] */
+      for (valueID = 0; valueID < srcNum; ++valueID)
+        msgPayloads[valueID] = sel.selReg(insn.getSrc(valueID), insn.getSrcType());
 
       uint32_t bti = insn.getImageIndex();
       /* We have the clamp border workaround. */
       uint32_t sampler = insn.getSamplerIndex() + insn.getSamplerOffset() * 8;
 
-      sel.SAMPLE(dst, insn.getDstNum(), src, srcNum, msgPayloads, 4, bti, sampler, insn.is3D());
+      sel.SAMPLE(dst, insn.getDstNum(), msgPayloads, srcNum, bti, sampler, insn.is3D());
       return true;
     }
     DECL_CTOR(SampleInstruction, 1, 1);
diff --git a/backend/src/backend/gen_insn_selection.hpp b/backend/src/backend/gen_insn_selection.hpp
index 09e6762..cb80d7c 100644
--- a/backend/src/backend/gen_insn_selection.hpp
+++ b/backend/src/backend/gen_insn_selection.hpp
@@ -120,7 +120,7 @@ namespace gbe
       struct {
         uint16_t rdbti:8;
         uint16_t sampler:5;
-        uint16_t is3DRead:1;
+        uint16_t rdmsglen:3;
       };
       uint32_t barrierType;
     } extra;
diff --git a/backend/src/ocl_stdlib.tmpl.h b/backend/src/ocl_stdlib.tmpl.h
index d2cc144..cac03d6 100755
--- a/backend/src/ocl_stdlib.tmpl.h
+++ b/backend/src/ocl_stdlib.tmpl.h
@@ -4232,18 +4232,18 @@ int __gen_ocl_force_simd16(void);
 // Image access functions
 /////////////////////////////////////////////////////////////////////////////
 
-OVERLOADABLE int4 __gen_ocl_read_imagei(uint surface_id, sampler_t sampler, int u, int v, uint sampler_offset);
+//OVERLOADABLE int4 __gen_ocl_read_imagei(uint surface_id, sampler_t sampler, int u, int v, uint sampler_offset);
 OVERLOADABLE int4 __gen_ocl_read_imagei(uint surface_id, sampler_t sampler, float u, float v, uint sampler_offset);
-OVERLOADABLE uint4 __gen_ocl_read_imageui(uint surface_id, sampler_t sampler, int u, int v, uint sampler_offset);
+//OVERLOADABLE uint4 __gen_ocl_read_imageui(uint surface_id, sampler_t sampler, int u, int v, uint sampler_offset);
 OVERLOADABLE uint4 __gen_ocl_read_imageui(uint surface_id, sampler_t sampler, float u, float v, uint sampler_offset);
-OVERLOADABLE float4 __gen_ocl_read_imagef(uint surface_id, sampler_t sampler, int u, int v, uint sampler_offset);
+//OVERLOADABLE float4 __gen_ocl_read_imagef(uint surface_id, sampler_t sampler, int u, int v, uint sampler_offset);
 OVERLOADABLE float4 __gen_ocl_read_imagef(uint surface_id, sampler_t sampler, float u, float v, uint sampler_offset);
 
-OVERLOADABLE int4 __gen_ocl_read_imagei(uint surface_id, sampler_t sampler, int u, int v, int w, uint sampler_offset);
+//OVERLOADABLE int4 __gen_ocl_read_imagei(uint surface_id, sampler_t sampler, int u, int v, int w, uint sampler_offset);
 OVERLOADABLE int4 __gen_ocl_read_imagei(uint surface_id, sampler_t sampler, float u, float v, float w, uint sampler_offset);
-OVERLOADABLE uint4 __gen_ocl_read_imageui(uint surface_id, sampler_t sampler, int u, int v, int w, uint sampler_offset);
+//OVERLOADABLE uint4 __gen_ocl_read_imageui(uint surface_id, sampler_t sampler, int u, int v, int w, uint sampler_offset);
 OVERLOADABLE uint4 __gen_ocl_read_imageui(uint surface_id, sampler_t sampler, float u, float v, float w, uint sampler_offset);
-OVERLOADABLE float4 __gen_ocl_read_imagef(uint surface_id, sampler_t sampler, int u, int v, int w, uint sampler_offset);
+//OVERLOADABLE float4 __gen_ocl_read_imagef(uint surface_id, sampler_t sampler, int u, int v, int w, uint sampler_offset);
 OVERLOADABLE float4 __gen_ocl_read_imagef(uint surface_id, sampler_t sampler, float u, float v, float w, uint sampler_offset);
 
 OVERLOADABLE void __gen_ocl_write_imagei(uint surface_id, int u, int v, int4 color);
-- 
1.7.9.5



More information about the Beignet mailing list