[Beignet] [PATCH 02/10 OpenCL-2.0] Add threadid and threadn register into curbe.

Wed Apr 22 20:25:40 PDT 2015

From: Junyan He <junyan.he at linux.intel.com>

threadn represents the total number of threads within
one work group, while threadid represents which thread
it is. threadid has a strong relationship with the local
ids. We calculate them before NDRange to save GPU instructions.

Signed-off-by: Junyan He <junyan.he at linux.intel.com>
---
 backend/src/backend/gen_context.cpp   |   28 +++++++++++++++-------------
 backend/src/backend/program.h         |    1 +
 backend/src/ir/profile.cpp            |    2 ++
 backend/src/ir/profile.hpp            |    5 +++--
 backend/src/ir/register.cpp           |    2 ++
 backend/src/llvm/llvm_gen_backend.cpp |    9 ++++++---
 src/cl_command_queue_gen7.c           |   12 +++++++++++-
 7 files changed, 40 insertions(+), 19 deletions(-)

diff --git a/backend/src/backend/gen_context.cpp b/backend/src/backend/gen_context.cpp
index cdf581c..a9663d7 100644
--- a/backend/src/backend/gen_context.cpp
+++ b/backend/src/backend/gen_context.cpp
@@ -2023,20 +2023,20 @@ namespace gbe
     } else
   
     fn.foreachInstruction([&](ir::Instruction &insn) {
-      const uint32_t srcNum = insn.getSrcNum();
-      for (uint32_t srcID = 0; srcID < srcNum; ++srcID) {
+        const uint32_t srcNum = insn.getSrcNum();
+        for (uint32_t srcID = 0; srcID < srcNum; ++srcID) {
         const ir::Register reg = insn.getSrc(srcID);
         if (insn.getOpcode() == ir::OP_GET_IMAGE_INFO) {
-          if (srcID != 0) continue;
-          const unsigned char bti = ir::cast<ir::GetImageInfoInstruction>(insn).getImageIndex();
-          const unsigned char type =  ir::cast<ir::GetImageInfoInstruction>(insn).getInfoType();;
-          ir::ImageInfoKey key(bti, type);
-          const ir::Register imageInfo = insn.getSrc(0);
-          if (curbeRegs.find(imageInfo) == curbeRegs.end()) {
-            uint32_t offset = this->getImageInfoCurbeOffset(key, 4);
-            insertCurbeReg(imageInfo, offset);
-          }
-          continue;
+        if (srcID != 0) continue;
+        const unsigned char bti = ir::cast<ir::GetImageInfoInstruction>(insn).getImageIndex();
+        const unsigned char type =  ir::cast<ir::GetImageInfoInstruction>(insn).getInfoType();;
+        ir::ImageInfoKey key(bti, type);
+        const ir::Register imageInfo = insn.getSrc(0);
+        if (curbeRegs.find(imageInfo) == curbeRegs.end()) {
+        uint32_t offset = this->getImageInfoCurbeOffset(key, 4);
+        insertCurbeReg(imageInfo, offset);
+        }
+        continue;
         }
         if (fn.isSpecialReg(reg) == false) continue;
         if (curbeRegs.find(reg) != curbeRegs.end()) continue;
@@ -2057,8 +2057,10 @@ namespace gbe
         INSERT_REG(stackptr, STACK_POINTER)
         INSERT_REG(printfbptr, PRINTF_BUF_POINTER)
         INSERT_REG(printfiptr, PRINTF_INDEX_POINTER)
+        INSERT_REG(threadn, THREAD_NUM)
+        INSERT_REG(threadid, THREAD_ID)
         do {} while(0);
-      }
+        }
     });
 #undef INSERT_REG
 
diff --git a/backend/src/backend/program.h b/backend/src/backend/program.h
index dc5662f..9b08ae6 100644
--- a/backend/src/backend/program.h
+++ b/backend/src/backend/program.h
@@ -100,6 +100,7 @@ enum gbe_curbe_type {
   GBE_CURBE_ZERO,
   GBE_CURBE_ONE,
   GBE_CURBE_SLM_OFFSET,
+  GBE_CURBE_THREAD_ID
 };
 
 /*! Extra arguments use the negative range of sub-values */
diff --git a/backend/src/ir/profile.cpp b/backend/src/ir/profile.cpp
index 4c272bd..b8fbf4a 100644
--- a/backend/src/ir/profile.cpp
+++ b/backend/src/ir/profile.cpp
@@ -43,6 +43,7 @@ namespace ir {
         "zero", "one",
         "retVal", "slm_offset",
         "printf_buffer_pointer", "printf_index_buffer_pointer",
+        "threadid",
         "invalid"
     };
 
@@ -86,6 +87,7 @@ namespace ir {
       DECL_NEW_REG(FAMILY_DWORD, slmoffset, 1);
       DECL_NEW_REG(FAMILY_DWORD, printfbptr, 1);
       DECL_NEW_REG(FAMILY_DWORD, printfiptr, 1);
+      DECL_NEW_REG(FAMILY_DWORD, threadid, 1);
       DECL_NEW_REG(FAMILY_DWORD, invalid, 1);
     }
 #undef DECL_NEW_REG
diff --git a/backend/src/ir/profile.hpp b/backend/src/ir/profile.hpp
index 7259d9f..cc8336a 100644
--- a/backend/src/ir/profile.hpp
+++ b/backend/src/ir/profile.hpp
@@ -71,8 +71,9 @@ namespace ir {
     static const Register slmoffset = Register(27);  // Group's SLM offset in total 64K SLM
     static const Register printfbptr = Register(28); // printf buffer address .
     static const Register printfiptr = Register(29); // printf index buffer address.
-    static const Register invalid = Register(30);  // used for valid comparation.
-    static const uint32_t regNum = 31;             // number of special registers
+    static const Register threadid = Register(30); // the thread id of this thread.
+    static const Register invalid = Register(31);  // used for valid comparation.
+    static const uint32_t regNum = 32;             // number of special registers
     extern const char *specialRegMean[];           // special register name.
   } /* namespace ocl */
 
diff --git a/backend/src/ir/register.cpp b/backend/src/ir/register.cpp
index 48d6875..0041f9d 100644
--- a/backend/src/ir/register.cpp
+++ b/backend/src/ir/register.cpp
@@ -46,6 +46,8 @@ namespace ir {
     for (uint32_t i = 0; i < file.regNum(); ++i) {
       const RegisterData reg = file.get(Register(i));
       out << ".decl." << reg << " %" << i;
+      if (reg.isUniform())
+        out << "(uniform)";
       if (i < ocl::regNum)
         out << " " << ocl::specialRegMean[i];
       out << std::endl;
diff --git a/backend/src/llvm/llvm_gen_backend.cpp b/backend/src/llvm/llvm_gen_backend.cpp
index ff60d86..47a9a63 100644
--- a/backend/src/llvm/llvm_gen_backend.cpp
+++ b/backend/src/llvm/llvm_gen_backend.cpp
@@ -2897,9 +2897,12 @@ namespace gbe
       ctx.WORKGROUP(ir::WORKGROUP_OP_BROADCAST, (uint32_t)wgBroadcastSLM, getRegister(&I), srcTuple, argNum,
           getType(ctx, (*AI)->getType()));
     } else {
-      const ir::Register src = this->getRegister(*(AI++));
-      const ir::Tuple srcTuple = ctx.arrayTuple(&src, 1);
-      ctx.WORKGROUP(opcode, (uint32_t)tidMapSLM, getRegister(&I), srcTuple, 1, getType(ctx, (*AI)->getType()));
+      ir::Register src[3];
+      src[0] = ir::ocl::threadn;
+      src[1] = ir::ocl::threadid;
+      src[2] = this->getRegister(*(AI++));
+      const ir::Tuple srcTuple = ctx.arrayTuple(&src[0], 3);
+      ctx.WORKGROUP(opcode, (uint32_t)tidMapSLM, getRegister(&I), srcTuple, 3, getType(ctx, (*AI)->getType()));
     }
 
     GBE_ASSERT(AI == AE);
diff --git a/src/cl_command_queue_gen7.c b/src/cl_command_queue_gen7.c
index 253c4f2..e115463 100644
--- a/src/cl_command_queue_gen7.c
+++ b/src/cl_command_queue_gen7.c
@@ -46,14 +46,16 @@ cl_set_varying_payload(const cl_kernel ker,
 {
   uint32_t *ids[3] = {NULL,NULL,NULL};
   uint16_t *block_ips = NULL;
+  uint32_t *thread_ids = NULL;
   size_t i, j, k, curr = 0;
-  int32_t id_offset[3], ip_offset;
+  int32_t id_offset[3], ip_offset, tid_offset;
   cl_int err = CL_SUCCESS;
 
   id_offset[0] = interp_kernel_get_curbe_offset(ker->opaque, GBE_CURBE_LOCAL_ID_X, 0);
   id_offset[1] = interp_kernel_get_curbe_offset(ker->opaque, GBE_CURBE_LOCAL_ID_Y, 0);
   id_offset[2] = interp_kernel_get_curbe_offset(ker->opaque, GBE_CURBE_LOCAL_ID_Z, 0);
   ip_offset = interp_kernel_get_curbe_offset(ker->opaque, GBE_CURBE_BLOCK_IP, 0);
+  tid_offset = interp_kernel_get_curbe_offset(ker->opaque, GBE_CURBE_THREAD_ID, 0);
   assert(id_offset[0] >= 0 &&
          id_offset[1] >= 0 &&
          id_offset[2] >= 0 &&
@@ -63,6 +65,8 @@ cl_set_varying_payload(const cl_kernel ker,
   TRY_ALLOC(ids[1], (uint32_t*) alloca(sizeof(uint32_t)*thread_n*simd_sz));
   TRY_ALLOC(ids[2], (uint32_t*) alloca(sizeof(uint32_t)*thread_n*simd_sz));
   TRY_ALLOC(block_ips, (uint16_t*) alloca(sizeof(uint16_t)*thread_n*simd_sz));
+  if (tid_offset >= 0)
+    TRY_ALLOC(thread_ids, (uint32_t*) alloca(sizeof(uint32_t)*thread_n));
 
   /* 0xffff means that the lane is inactivated */
   memset(block_ips, 0xff, sizeof(uint16_t)*thread_n*simd_sz);
@@ -75,6 +79,8 @@ cl_set_varying_payload(const cl_kernel ker,
     ids[1][curr] = j;
     ids[2][curr] = k;
     block_ips[curr] = 0;
+    if (thread_ids)
+      thread_ids[curr/simd_sz] = (k*local_wk_sz[2] + j*local_wk_sz[1] + i)/simd_sz;
   }
 
   /* Copy them to the curbe buffer */
@@ -84,6 +90,10 @@ cl_set_varying_payload(const cl_kernel ker,
     uint32_t *ids1 = (uint32_t *) (data + id_offset[1]);
     uint32_t *ids2 = (uint32_t *) (data + id_offset[2]);
     uint16_t *ips  = (uint16_t *) (data + ip_offset);
+
+    if (thread_ids)
+      *(uint32_t *)(data + tid_offset) = thread_ids[i];
+
     for (j = 0; j < simd_sz; ++j, ++curr) {
       ids0[j] = ids[0][curr];
       ids1[j] = ids[1][curr];
-- 
1.7.9.5