[Beignet] [PATCH 7/9 V3] Add a0 setting and bswap logic for GEN8

junyan.he at inbox.com junyan.he at inbox.com
Mon Mar 9 01:11:22 PDT 2015


From: Junyan He <junyan.he at linux.intel.com>

Because Gen8 has 16 sub-registers for A0, we can use
them to reduce the number of instructions.

Signed-off-by: Junyan He <junyan.he at linux.intel.com>
---
 backend/src/backend/gen8_context.cpp |  174 ++++++++++++++++++++++++++++++++++
 backend/src/backend/gen8_context.hpp |    1 +
 2 files changed, 175 insertions(+)

diff --git a/backend/src/backend/gen8_context.cpp b/backend/src/backend/gen8_context.cpp
index 4edfd81..0d4a40e 100644
--- a/backend/src/backend/gen8_context.cpp
+++ b/backend/src/backend/gen8_context.cpp
@@ -65,11 +65,141 @@ namespace gbe
 
   void Gen8Context::emitUnaryWithTempInstruction(const SelectionInstruction &insn)
   {
+    GenRegister dst = ra->genReg(insn.dst(0));
+    GenRegister src = ra->genReg(insn.src(0));
+    GenRegister tmp = ra->genReg(insn.dst(1));
     switch (insn.opcode) {
       case SEL_OP_CONVI_TO_I64:
         /* Should never come to here, just use the common OPCODE. */
         GBE_ASSERT(0);
         break;
+      case SEL_OP_BSWAP:
+        {
+          uint32_t simd = p->curr.execWidth;
+          GBE_ASSERT(simd == 8 || simd == 16 || simd == 1);
+          uint16_t new_a0[16];
+          memset(new_a0, 0, sizeof(new_a0));
+
+          GBE_ASSERT(src.type == dst.type);
+          uint32_t start_addr = src.nr*32 + src.subnr;
+
+          if (simd == 1) {
+            GBE_ASSERT(src.hstride == GEN_HORIZONTAL_STRIDE_0
+                && dst.hstride == GEN_HORIZONTAL_STRIDE_0);
+            if (src.type == GEN_TYPE_UD || src.type == GEN_TYPE_D) {
+              GBE_ASSERT(start_addr >= 0);
+              new_a0[0] = start_addr + 3;
+              new_a0[1] = start_addr + 2;
+              new_a0[2] = start_addr + 1;
+              new_a0[3] = start_addr;
+              this->setA0Content(new_a0, 0, 4);
+
+              p->push();
+              p->curr.execWidth = 4;
+              p->curr.predicate = GEN_PREDICATE_NONE;
+              p->curr.noMask = 1;
+              GenRegister ind_src = GenRegister::to_indirect1xN(GenRegister::retype(src, GEN_TYPE_UB),
+                  a0[0], new_a0[0] - a0[0]);
+              GenRegister dst_ = dst;
+              dst_.type = GEN_TYPE_UB;
+              dst_.hstride = GEN_HORIZONTAL_STRIDE_1;
+              dst_.width = GEN_WIDTH_4;
+              dst_.vstride = GEN_VERTICAL_STRIDE_4;
+              p->MOV(dst_, ind_src);
+              p->pop();
+            } else if (src.type == GEN_TYPE_UW || src.type == GEN_TYPE_W) {
+              p->MOV(GenRegister::retype(dst, GEN_TYPE_UB),
+                  GenRegister::retype(GenRegister::offset(src, 0, 1), GEN_TYPE_UB));
+              p->MOV(GenRegister::retype(GenRegister::offset(dst, 0, 1), GEN_TYPE_UB),
+                  GenRegister::retype(src, GEN_TYPE_UB));
+            } else {
+              GBE_ASSERT(0);
+            }
+          } else {
+            if (src.type == GEN_TYPE_UD || src.type == GEN_TYPE_D) {
+              GBE_ASSERT(src.subnr == 0);
+              GBE_ASSERT(dst.subnr == 0);
+              GBE_ASSERT(tmp.subnr == 0);
+              GBE_ASSERT(start_addr >= 0);
+              new_a0[0] = start_addr + 3;
+              new_a0[1] = start_addr + 2;
+              new_a0[2] = start_addr + 1;
+              new_a0[3] = start_addr;
+              new_a0[4] = start_addr + 7;
+              new_a0[5] = start_addr + 6;
+              new_a0[6] = start_addr + 5;
+              new_a0[7] = start_addr + 4;
+              new_a0[8] = start_addr + 11;
+              new_a0[9] = start_addr + 10;
+              new_a0[10] = start_addr + 9;
+              new_a0[11] = start_addr + 8;
+              new_a0[12] = start_addr + 15;
+              new_a0[13] = start_addr + 14;
+              new_a0[14] = start_addr + 13;
+              new_a0[15] = start_addr + 12;
+              this->setA0Content(new_a0, 48);
+
+              p->push();
+              p->curr.execWidth = 16;
+              p->curr.predicate = GEN_PREDICATE_NONE;
+              p->curr.noMask = 1;
+              GenRegister ind_src = GenRegister::to_indirect1xN(GenRegister::retype(src, GEN_TYPE_UB),
+                  a0[0], new_a0[0] - a0[0]);
+              p->MOV(GenRegister::retype(tmp, GEN_TYPE_UB), ind_src);
+              ind_src.addr_imm += 16;
+              p->MOV(GenRegister::offset(GenRegister::retype(tmp, GEN_TYPE_UB), 0, 16), ind_src);
+              if (simd == 16) {
+                for (int i = 0; i < 2; i++) {
+                  ind_src.addr_imm += 16;
+                  p->MOV(GenRegister::offset(GenRegister::retype(tmp, GEN_TYPE_UB), 1, 16*i), ind_src);
+                }
+              }
+              p->pop();
+
+              p->MOV(dst, tmp);
+            } else if (src.type == GEN_TYPE_UW || src.type == GEN_TYPE_W) {
+              GBE_ASSERT(src.subnr == 0 || src.subnr == 16);
+              GBE_ASSERT(dst.subnr == 0 || dst.subnr == 16);
+              GBE_ASSERT(tmp.subnr == 0 || tmp.subnr == 16);
+              GBE_ASSERT(start_addr >= 0);
+              new_a0[0] = start_addr + 1;
+              new_a0[1] = start_addr;
+              new_a0[2] = start_addr + 3;
+              new_a0[3] = start_addr + 2;
+              new_a0[4] = start_addr + 5;
+              new_a0[5] = start_addr + 4;
+              new_a0[6] = start_addr + 7;
+              new_a0[7] = start_addr + 6;
+              new_a0[8] = start_addr + 9;
+              new_a0[9] = start_addr + 8;
+              new_a0[10] = start_addr + 11;
+              new_a0[11] = start_addr + 10;
+              new_a0[12] = start_addr + 13;
+              new_a0[13] = start_addr + 12;
+              new_a0[14] = start_addr + 15;
+              new_a0[15] = start_addr + 14;
+              this->setA0Content(new_a0, 48);
+
+              p->push();
+              p->curr.execWidth = 16;
+              p->curr.predicate = GEN_PREDICATE_NONE;
+              p->curr.noMask = 1;
+              GenRegister ind_src = GenRegister::to_indirect1xN(GenRegister::retype(src, GEN_TYPE_UB),
+                  a0[0], new_a0[0] - a0[0]);
+              p->MOV(GenRegister::retype(tmp, GEN_TYPE_UB), ind_src);
+              if (simd == 16) {
+                ind_src.addr_imm += 16;
+                p->MOV(GenRegister::offset(GenRegister::retype(tmp, GEN_TYPE_UB), 0, 16), ind_src);
+              }
+              p->pop();
+
+              p->MOV(dst, tmp);
+            } else {
+              GBE_ASSERT(0);
+            }
+          }
+        }
+        break;
       default:
         GenContext::emitUnaryWithTempInstruction(insn);
     }
@@ -782,4 +912,48 @@ namespace gbe
     GBE_ASSERT(dst.hstride != GEN_HORIZONTAL_STRIDE_0 && src.hstride != GEN_HORIZONTAL_STRIDE_0);
     this->unpackLongVec(src, dst, p->curr.execWidth);
   }
+
+  /* Make address register a0 usable for the byte addresses in new_a0[0..sz) (sz == 0 means 16).
+   * If all entries differ from the cached this->a0 by one common delta and that delta plus
+   * max_offset stays within [-512, 511] (presumably the encodable addr_imm range - confirm),
+   * nothing is emitted and the caller folds the delta into its indirect immediate. */
+  void Gen8Context::setA0Content(uint16_t new_a0[16], uint16_t max_offset, int sz) {
+    if (sz == 0) sz = 16;
+    GBE_ASSERT(sz%4 == 0);  // the reload path below writes four 16-bit entries per 64-bit MOV
+    GBE_ASSERT(new_a0[0] >= 0 && new_a0[0] < 4096);
+    int16_t diff = new_a0[0] - this->a0[0];
+    bool need_reset = false;
+    for (int i = 1; i < sz; i++) {
+      GBE_ASSERT(new_a0[i] >= 0 && new_a0[i] < 4096);  // check entry i, not entry 0 again
+      int16_t d = new_a0[i] - this->a0[i];
+      if (diff != d) { need_reset = true; break; }  // non-uniform delta: full reload needed
+    }
+
+    GBE_ASSERT(this->a0[0] + diff < 4096 && this->a0[0] + diff >= 0);
+    if (!need_reset && diff >= -512 && diff + max_offset <= 511) {
+      return;  // a0 and the cached copy stay as they are; caller uses new_a0[0] - a0[0]
+    } else if (!need_reset && sz == 16) {
+      p->push();  // uniform delta over all 16 entries: one SIMD16 ADD on a0
+      p->curr.execWidth = 16;
+      p->curr.predicate = GEN_PREDICATE_NONE;
+      p->curr.noMask = 1;
+      p->ADD(GenRegister::retype(GenRegister::addr8(0), GEN_TYPE_W),
+          GenRegister::retype(GenRegister::addr8(0), GEN_TYPE_W), GenRegister::immw(diff));
+      p->pop();
+    } else {
+      p->push();  // otherwise rewrite a0 with immediates, four entries at a time
+      p->curr.execWidth = 1;
+      p->curr.predicate = GEN_PREDICATE_NONE;
+      p->curr.noMask = 1;
+      for (int i = 0; i < sz/4; i++) {
+        uint64_t addr = (static_cast<uint64_t>(new_a0[i*4 + 3]) << 16) | static_cast<uint64_t>(new_a0[i*4 + 2]);
+        addr = addr << 32;  // widen before shifting so no sign extension can clobber the upper dword
+        addr = addr | (static_cast<uint64_t>(new_a0[i*4 + 1]) << 16) | static_cast<uint64_t>(new_a0[i*4]);
+        p->MOV(GenRegister::retype(GenRegister::addr1(i*4), GEN_TYPE_UL), GenRegister::immuint64(addr));
+      }
+      p->pop();
+    }
+    memcpy(this->a0, new_a0, sizeof(uint16_t)*sz);  // keep the cached view in sync with what was emitted
+  }
+
 }
diff --git a/backend/src/backend/gen8_context.hpp b/backend/src/backend/gen8_context.hpp
index a047990..b296a3d 100644
--- a/backend/src/backend/gen8_context.hpp
+++ b/backend/src/backend/gen8_context.hpp
@@ -72,6 +72,7 @@ namespace gbe
     virtual void emitUnpackLongInstruction(const SelectionInstruction &insn);
 
   protected:
+    virtual void setA0Content(uint16_t new_a0[16], uint16_t max_offset = 0, int sz = 0);
     virtual GenEncoder* generateEncoder(void) {
       return GBE_NEW(Gen8Encoder, this->simdWidth, 8, deviceID);
     }
-- 
1.7.9.5



More information about the Beignet mailing list