[Beignet] [PATCH 2/2] Use 4bits vector immediate to optimize the a0 setting.

junyan.he at inbox.com junyan.he at inbox.com
Tue Apr 14 08:02:27 PDT 2015


From: Junyan He <junyan.he at linux.intel.com>

We can use a 4-bit vector immediate (:V) to load a0 with the per-element
offsets first, and then add the base address using add(8), like:

  mov(8)   a0<1>:UW   0x563412f0:V            { align1 WE_all 1Q };
  add(8)   a0<1>:W    a0<8,8,1>:W   3489:W    { align1 WE_all 1Q };

Signed-off-by: Junyan He <junyan.he at linux.intel.com>
---
 backend/src/backend/gen8_context.cpp |    8 ++---
 backend/src/backend/gen8_context.hpp |    2 +-
 backend/src/backend/gen_context.cpp  |   58 ++++++++++++++++++++++++++--------
 backend/src/backend/gen_context.hpp  |    2 +-
 4 files changed, 50 insertions(+), 20 deletions(-)

diff --git a/backend/src/backend/gen8_context.cpp b/backend/src/backend/gen8_context.cpp
index 2cdb248..b525236 100644
--- a/backend/src/backend/gen8_context.cpp
+++ b/backend/src/backend/gen8_context.cpp
@@ -92,7 +92,7 @@ namespace gbe
               new_a0[1] = start_addr + 2;
               new_a0[2] = start_addr + 1;
               new_a0[3] = start_addr;
-              this->setA0Content(new_a0, 0, 4);
+              this->setA0Content(new_a0, 4);
 
               p->push();
               p->curr.execWidth = 4;
@@ -152,7 +152,7 @@ namespace gbe
                 new_a0[14] = start_addr + 1;
                 new_a0[15] = start_addr;
               }
-              this->setA0Content(new_a0, 48);
+              this->setA0Content(new_a0);
 
               p->push();
               p->curr.execWidth = 16;
@@ -210,7 +210,7 @@ namespace gbe
                 new_a0[14] = start_addr + 1;
                 new_a0[15] = start_addr;
               }
-              this->setA0Content(new_a0, 48);
+              this->setA0Content(new_a0);
 
               p->push();
               p->curr.execWidth = 16;
@@ -858,7 +858,7 @@ namespace gbe
     this->unpackLongVec(src, dst, p->curr.execWidth);
   }
 
-  void Gen8Context::setA0Content(uint16_t new_a0[16], uint16_t max_offset, int sz) {
+  void Gen8Context::setA0Content(uint16_t new_a0[16], int sz) {
     if (sz == 0)
       sz = 16;
     GBE_ASSERT(sz%4 == 0);
diff --git a/backend/src/backend/gen8_context.hpp b/backend/src/backend/gen8_context.hpp
index b296a3d..96b3d69 100644
--- a/backend/src/backend/gen8_context.hpp
+++ b/backend/src/backend/gen8_context.hpp
@@ -72,7 +72,7 @@ namespace gbe
     virtual void emitUnpackLongInstruction(const SelectionInstruction &insn);
 
   protected:
-    virtual void setA0Content(uint16_t new_a0[16], uint16_t max_offset = 0, int sz = 0);
+    virtual void setA0Content(uint16_t new_a0[16], int sz = 0);
     virtual GenEncoder* generateEncoder(void) {
       return GBE_NEW(Gen8Encoder, this->simdWidth, 8, deviceID);
     }
diff --git a/backend/src/backend/gen_context.cpp b/backend/src/backend/gen_context.cpp
index 684ecaf..714da36 100644
--- a/backend/src/backend/gen_context.cpp
+++ b/backend/src/backend/gen_context.cpp
@@ -333,7 +333,7 @@ namespace gbe
             new_a0[1] = start_addr + 2;
             new_a0[2] = start_addr + 1;
             new_a0[3] = start_addr;
-            this->setA0Content(new_a0, 0, 4);
+            this->setA0Content(new_a0, 4);
 
             p->push();
             p->curr.execWidth = 4;
@@ -377,7 +377,7 @@ namespace gbe
               new_a0[6] = start_addr + 1;
               new_a0[7] = start_addr;
             }
-            this->setA0Content(new_a0, 56);
+            this->setA0Content(new_a0);
 
             p->push();
             p->curr.execWidth = 8;
@@ -421,7 +421,7 @@ namespace gbe
               new_a0[6] = start_addr + 1;
               new_a0[7] = start_addr;
             }
-            this->setA0Content(new_a0, 56);
+            this->setA0Content(new_a0);
 
             p->push();
             p->curr.execWidth = 8;
@@ -1946,21 +1946,51 @@ namespace gbe
     p->TYPED_WRITE(header, true, bti);
   }
 
-  void GenContext::setA0Content(uint16_t new_a0[16], uint16_t max_offset, int sz) {
+  void GenContext::setA0Content(uint16_t new_a0[16], int sz) {
     if (sz == 0)
       sz = 8;
-    GBE_ASSERT(sz%4 == 0);
-    GBE_ASSERT(new_a0[0] >= 0 && new_a0[0] < 4096);
 
-    p->push();
-    p->curr.execWidth = 1;
-    p->curr.predicate = GEN_PREDICATE_NONE;
-    p->curr.noMask = 1;
-    for (int i = 0; i < sz/2; i++) {
-      p->MOV(GenRegister::retype(GenRegister::addr1(i*2), GEN_TYPE_UD),
-             GenRegister::immud(new_a0[i*2 + 1] << 16 | new_a0[i*2]));
+    /* We use a0.0 as base, if all the a0.x sub registers' diff is within 2^4 = [-8, 7],
+       we can use load 8x4bits unsigned int + add offset to optimize it. */
+    bool inRange = true;
+    struct v4bits_help {
+      int8_t dummy:4;
+      int8_t val:4;
+    };
+    uint32_t diff = 0;
+    for (int i = 1; i < sz; i++) {
+      GBE_ASSERT(new_a0[i] >= 0 && new_a0[i] < 4096);
+      int16_t d = new_a0[i] - new_a0[0];
+      if (d < -8 || d > 7) {
+        inRange = false;
+        break;
+      }
+      v4bits_help dv;
+      dv.val = d;
+      diff = diff | ((dv.val & 0xf) << i*4);
+    }
+
+    if (inRange && sz >=4) {
+      p->push();
+      p->curr.execWidth = 8;
+      p->curr.predicate = GEN_PREDICATE_NONE;
+      p->curr.noMask = 1;
+      p->MOV(GenRegister::addr8(0), GenRegister::immv(diff));
+      p->ADD(GenRegister::retype(GenRegister::addr8(0), GEN_TYPE_W),
+             GenRegister::retype(GenRegister::addr8(0), GEN_TYPE_W),
+             GenRegister::immw(new_a0[0]));
+      p->pop();
+    } else {
+      p->push();
+      p->curr.execWidth = 1;
+      p->curr.predicate = GEN_PREDICATE_NONE;
+      p->curr.noMask = 1;
+      for (int i = 0; i < (sz+1)/2; i++) {
+        p->MOV(GenRegister::retype(GenRegister::addr1(i*2), GEN_TYPE_UD),
+               GenRegister::immud(new_a0[i*2 + 1] << 16 | new_a0[i*2]));
+      }
+      p->pop();
     }
-    p->pop();
   }
 
   BVAR(OCL_OUTPUT_REG_ALLOC, false);
diff --git a/backend/src/backend/gen_context.hpp b/backend/src/backend/gen_context.hpp
index 560248a..96e0574 100644
--- a/backend/src/backend/gen_context.hpp
+++ b/backend/src/backend/gen_context.hpp
@@ -208,7 +208,7 @@ namespace gbe
     /*! allocate a new curbe register and insert to curbe pool. */
     void allocCurbeReg(ir::Register reg, gbe_curbe_type value, uint32_t subValue = 0);
 
-    virtual void setA0Content(uint16_t new_a0[16], uint16_t max_offset = 0, int sz = 0);
+    virtual void setA0Content(uint16_t new_a0[16], int sz = 0);
 
   private:
     CompileErrorCode errCode;
-- 
1.7.9.5





More information about the Beignet mailing list