[Beignet] [PATCH] GBE: if PointerFamily is FAMILY_QWORD, chv and bxt need special handling.

Yang Rong rong.r.yang at intel.com
Thu Jan 5 09:11:34 UTC 2017


To move a ud to a uq, we need to move the ud to an unpacked ud first,
and then move the unpacked ud to the uq.

Signed-off-by: Yang Rong <rong.r.yang at intel.com>
---
 backend/src/backend/gen8_context.cpp | 61 +++++++++++++++++++++++++++++++++++
 backend/src/backend/gen8_context.hpp |  1 +
 backend/src/backend/gen9_context.cpp | 62 ++++++++++++++++++++++++++++++++++++
 backend/src/backend/gen9_context.hpp |  1 +
 4 files changed, 125 insertions(+)

diff --git a/backend/src/backend/gen8_context.cpp b/backend/src/backend/gen8_context.cpp
index eede52c..2bb8ad1 100644
--- a/backend/src/backend/gen8_context.cpp
+++ b/backend/src/backend/gen8_context.cpp
@@ -1393,6 +1393,67 @@ namespace gbe
     p->pop();
   }
 
+  void ChvContext::emitStackPointer(void) {
+    using namespace ir;
+    // Emits the instructions that compute the per-lane stack pointer.
+    // Only emit stack pointer computation if we use a stack
+    if (kernel->getStackSize() == 0)
+      return;
+
+    // Check that everything is consistent in the kernel code
+    const uint32_t perLaneSize = kernel->getStackSize();
+    GBE_ASSERT(perLaneSize > 0);
+
+    const GenRegister selStatckPtr = this->simdWidth == 8 ?
+      GenRegister::ud8grf(ir::ocl::stackptr) :
+      GenRegister::ud16grf(ir::ocl::stackptr);
+    const GenRegister stackptr = ra->genReg(selStatckPtr);
+    // Borrow the block IP register as a temporary, since block IP is
+    // only initialized later.
+    const GenRegister tmpReg = GenRegister::retype(GenRegister::vec1(getBlockIP()), GEN_TYPE_UW);
+    const GenRegister tmpReg_ud = GenRegister::retype(tmpReg, GEN_TYPE_UD);
+
+    loadLaneID(stackptr);
+
+    // Compute the per-lane stack pointer here, as
+    // threadId * perThreadSize + laneId * perLaneSize, i.e.
+    // (threadId * simdWidth + laneId) * perLaneSize,
+    // so that private addresses start from zero.
+    //p->MOV(stackptr, GenRegister::immud(0));
+    p->push();
+      p->curr.execWidth = 1;
+      p->curr.predicate = GEN_PREDICATE_NONE;
+      p->AND(tmpReg, GenRegister::ud1grf(0,5), GenRegister::immuw(0x1ff)); //threadId
+      p->MUL(tmpReg, tmpReg, GenRegister::immuw(this->simdWidth));  //threadId * simdWidth
+      p->curr.execWidth = this->simdWidth;
+      p->ADD(stackptr, GenRegister::unpacked_uw(stackptr), tmpReg);  //threadId * simdWidth + laneId, must < 64K
+      p->curr.execWidth = 1;
+      p->MOV(tmpReg_ud, GenRegister::immud(perLaneSize));
+      p->curr.execWidth = this->simdWidth;
+      p->MUL(stackptr, tmpReg_ud, GenRegister::unpacked_uw(stackptr)); // (threadId * simdWidth + laneId)*perLaneSize
+      if (fn.getPointerFamily() == ir::FAMILY_QWORD) { // QWORD pointers: widen the ud stack pointer to uq
+        const GenRegister selStatckPtr2 = this->simdWidth == 8 ?
+          GenRegister::ul8grf(ir::ocl::stackptr) :
+          GenRegister::ul16grf(ir::ocl::stackptr);
+        GenRegister stackptr2 = ra->genReg(selStatckPtr2);
+        GenRegister sp = GenRegister::unpacked_ud(stackptr2.nr, stackptr2.subnr);
+        int simdWidth = p->curr.execWidth;
+        if (simdWidth == 16) {
+          // The second quarter must be handled first, because the dst type
+          // is QW while the src is DW. If the first quarter were done first,
+          // its dst would overlap the second quarter's src.
+          p->curr.execWidth = 8;
+          p->curr.quarterControl = GEN_COMPRESSION_Q2;
+          p->MOV(GenRegister::Qn(sp, 1), GenRegister::Qn(stackptr,1));
+          p->MOV(GenRegister::Qn(stackptr2, 1), GenRegister::Qn(sp,1));
+        }
+        p->curr.quarterControl = GEN_COMPRESSION_Q1;
+        p->MOV(sp, stackptr); // ud -> unpacked ud
+        p->MOV(stackptr2, sp); // unpacked ud -> uq
+      }
+    p->pop();
+  }
+
   /* Init value according to WORKGROUP OP
    * Emit assert is invalid combination operation - datatype */
   static void wgOpInitValue(GenEncoder *p, GenRegister dataReg, uint32_t wg_op)
diff --git a/backend/src/backend/gen8_context.hpp b/backend/src/backend/gen8_context.hpp
index d715cbc..6b75540 100644
--- a/backend/src/backend/gen8_context.hpp
+++ b/backend/src/backend/gen8_context.hpp
@@ -125,6 +125,7 @@ namespace gbe
     virtual void newSelection(void);
     virtual void calculateFullU64MUL(GenRegister src0, GenRegister src1, GenRegister dst_h,
                                            GenRegister dst_l, GenRegister s0l_s1h, GenRegister s0h_s1l);
+    virtual void emitStackPointer(void);
   };
 }
 #endif /* __GBE_GEN8_CONTEXT_HPP__ */
diff --git a/backend/src/backend/gen9_context.cpp b/backend/src/backend/gen9_context.cpp
index 8976ede..483b2c3 100644
--- a/backend/src/backend/gen9_context.cpp
+++ b/backend/src/backend/gen9_context.cpp
@@ -22,6 +22,7 @@
 
 #include "backend/gen9_context.hpp"
 #include "backend/gen_insn_selection.hpp"
+#include "backend/gen_program.hpp"
 
 namespace gbe
 {
@@ -170,6 +171,67 @@ namespace gbe
     p->pop();
   }
 
+  void BxtContext::emitStackPointer(void) {
+    using namespace ir;
+    // Emits the instructions that compute the per-lane stack pointer.
+    // Only emit stack pointer computation if we use a stack
+    if (kernel->getStackSize() == 0)
+      return;
+
+    // Check that everything is consistent in the kernel code
+    const uint32_t perLaneSize = kernel->getStackSize();
+    GBE_ASSERT(perLaneSize > 0);
+
+    const GenRegister selStatckPtr = this->simdWidth == 8 ?
+      GenRegister::ud8grf(ir::ocl::stackptr) :
+      GenRegister::ud16grf(ir::ocl::stackptr);
+    const GenRegister stackptr = ra->genReg(selStatckPtr);
+    // Borrow the block IP register as a temporary, since block IP is
+    // only initialized later.
+    const GenRegister tmpReg = GenRegister::retype(GenRegister::vec1(getBlockIP()), GEN_TYPE_UW);
+    const GenRegister tmpReg_ud = GenRegister::retype(tmpReg, GEN_TYPE_UD);
+
+    loadLaneID(stackptr);
+
+    // Compute the per-lane stack pointer here, as
+    // threadId * perThreadSize + laneId * perLaneSize, i.e.
+    // (threadId * simdWidth + laneId) * perLaneSize,
+    // so that private addresses start from zero.
+    //p->MOV(stackptr, GenRegister::immud(0));
+    p->push();
+      p->curr.execWidth = 1;
+      p->curr.predicate = GEN_PREDICATE_NONE;
+      p->AND(tmpReg, GenRegister::ud1grf(0,5), GenRegister::immuw(0x1ff)); //threadId
+      p->MUL(tmpReg, tmpReg, GenRegister::immuw(this->simdWidth));  //threadId * simdWidth
+      p->curr.execWidth = this->simdWidth;
+      p->ADD(stackptr, GenRegister::unpacked_uw(stackptr), tmpReg);  //threadId * simdWidth + laneId, must < 64K
+      p->curr.execWidth = 1;
+      p->MOV(tmpReg_ud, GenRegister::immud(perLaneSize));
+      p->curr.execWidth = this->simdWidth;
+      p->MUL(stackptr, tmpReg_ud, GenRegister::unpacked_uw(stackptr)); // (threadId * simdWidth + laneId)*perLaneSize
+      if (fn.getPointerFamily() == ir::FAMILY_QWORD) { // QWORD pointers: widen the ud stack pointer to uq
+        const GenRegister selStatckPtr2 = this->simdWidth == 8 ?
+          GenRegister::ul8grf(ir::ocl::stackptr) :
+          GenRegister::ul16grf(ir::ocl::stackptr);
+        GenRegister stackptr2 = ra->genReg(selStatckPtr2);
+        GenRegister sp = GenRegister::unpacked_ud(stackptr2.nr, stackptr2.subnr);
+        int simdWidth = p->curr.execWidth;
+        if (simdWidth == 16) {
+          // The second quarter must be handled first, because the dst type
+          // is QW while the src is DW. If the first quarter were done first,
+          // its dst would overlap the second quarter's src.
+          p->curr.execWidth = 8;
+          p->curr.quarterControl = GEN_COMPRESSION_Q2;
+          p->MOV(GenRegister::Qn(sp, 1), GenRegister::Qn(stackptr,1));
+          p->MOV(GenRegister::Qn(stackptr2, 1), GenRegister::Qn(sp,1));
+        }
+        p->curr.quarterControl = GEN_COMPRESSION_Q1;
+        p->MOV(sp, stackptr); // ud -> unpacked ud
+        p->MOV(stackptr2, sp); // unpacked ud -> uq
+      }
+    p->pop();
+  }
+
   void KblContext::newSelection(void) {
     this->sel = GBE_NEW(SelectionKbl, *this);
   }
diff --git a/backend/src/backend/gen9_context.hpp b/backend/src/backend/gen9_context.hpp
index 2f24b56..9977e9a 100644
--- a/backend/src/backend/gen9_context.hpp
+++ b/backend/src/backend/gen9_context.hpp
@@ -67,6 +67,7 @@ namespace gbe
     virtual void newSelection(void);
     virtual void calculateFullU64MUL(GenRegister src0, GenRegister src1, GenRegister dst_h,
                                            GenRegister dst_l, GenRegister s0l_s1h, GenRegister s0h_s1l);
+    virtual void emitStackPointer(void);
   };
   /* This class is used to implement the kabylake
      specific logic for context. */
-- 
2.7.4



More information about the Beignet mailing list