[Beignet] [PATCH 10/10] Backend: Initial support for long/ulong types in workgroup ops

Thu Mar 31 15:28:40 UTC 2016

From: Grigore Lupescu <grigore.lupescu at intel.com>

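Replace the early-out for 64-bit destinations with a real SLM exchange:
long/ulong partial results are written as two dwords per thread (8-byte
stride) and read back as two dwords inside the reduction loop. All other
types keep the existing one-dword path; GEN_TYPE_DF_IMM no longer takes
the early return and now goes through that path too. The temporaries
allocated in gen_insn_selection.cpp are widened to FAMILY_QWORD.

Signed-off-by: Grigore Lupescu <grigore.lupescu at intel.com>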
---
 backend/src/backend/gen_context.cpp        | 71 ++++++++++++++++++++----------
 backend/src/backend/gen_insn_selection.cpp |  6 +--
 2 files changed, 51 insertions(+), 26 deletions(-)

diff --git a/backend/src/backend/gen_context.cpp b/backend/src/backend/gen_context.cpp
index 31232dd..c5c27c6 100644
--- a/backend/src/backend/gen_context.cpp
+++ b/backend/src/backend/gen_context.cpp
@@ -2663,22 +2663,27 @@ namespace gbe
         p->MOV(threadLoop, ra->genReg(GenRegister::ud1grf(ir::ocl::threadid)));
     }
 
-    /* TODO implement communication for DW types */
-    if(dst.type == GEN_TYPE_UL ||
-        dst.type == GEN_TYPE_L ||
-        dst.type == GEN_TYPE_DF_IMM)
+    /* All threads write the partial results to SLM memory */
+    if(dst.type == GEN_TYPE_UL || dst.type == GEN_TYPE_L)
     {
-      p->curr.execWidth = 16;
-      p->MOV(dst, threadData);
-      return;
-    }
+      GenRegister threadDataL = GenRegister::retype(threadData, GEN_TYPE_D);
+      GenRegister threadDataH = threadDataL.offset(threadDataL, 0, 4);
+      p->MOV(msgData.offset(msgData, 0), threadDataL);
+      p->MOV(msgData.offset(msgData, 1), threadDataH);
 
-    /* All threads write the partial results to SLM memory */
-    p->curr.execWidth = 8;
-    p->MOV(msgData, threadData);
-    p->MUL(msgAddr, threadId, GenRegister::immd(0x4));
-    p->ADD(msgAddr, msgAddr, msgSlmOff);
-    p->UNTYPED_WRITE(msg, GenRegister::immw(0xFE), 1);
+      p->curr.execWidth = 8;
+      p->MUL(msgAddr, threadId, GenRegister::immd(0x8));
+      p->ADD(msgAddr, msgAddr, msgSlmOff);
+      p->UNTYPED_WRITE(msg, GenRegister::immw(0xFE), 2);
+    }
+    else
+    {
+      p->curr.execWidth = 8;
+      p->MOV(msgData, threadData);
+      p->MUL(msgAddr, threadId, GenRegister::immd(0x4));
+      p->ADD(msgAddr, msgAddr, msgSlmOff);
+      p->UNTYPED_WRITE(msg, GenRegister::immw(0xFE), 1);
+    }
 
     /* Init partialData register, it will hold the final result */
     initValue(p, partialData, wg_op);
@@ -2692,17 +2697,37 @@ namespace gbe
     p->push();{
       jip0 = p->n_instruction();
 
-      p->curr.execWidth = 8;
-      p->curr.predicate = GEN_PREDICATE_NONE;
-
       /* Read in chunks of 4 to optimize SLM reads and reduce SEND messages */
-      p->ADD(threadLoop, threadLoop, GenRegister::immd(-1));
-      p->MUL(msgAddr, threadLoop, GenRegister::immd(0x4));
-      p->ADD(msgAddr, msgAddr, msgSlmOff);
-      p->UNTYPED_READ(msgData, msgAddr, GenRegister::immw(0xFE), 1);
+      if(dst.type == GEN_TYPE_UL || dst.type == GEN_TYPE_L)
+      {
+        p->curr.execWidth = 8;
+        p->curr.predicate = GEN_PREDICATE_NONE;
+        p->ADD(threadLoop, threadLoop, GenRegister::immd(-1));
+        p->MUL(msgAddr, threadLoop, GenRegister::immd(0x8));
+        p->ADD(msgAddr, msgAddr, msgSlmOff);
+        p->UNTYPED_READ(msgData, msgAddr, GenRegister::immw(0xFE), 2);
+
+        GenRegister msgDataL = msgData.retype(msgData.offset(msgData, 0, 4), GEN_TYPE_D);
+        GenRegister msgDataH = msgData.retype(msgData.offset(msgData, 1, 4), GEN_TYPE_D);
+        msgDataL.hstride = 2;
+        msgDataH.hstride = 2;
+        p->MOV(msgDataL, msgDataH);
+
+        /* Perform operation, partialData will hold result */
+        workgroupOp(partialData, partialData, msgData.offset(msgData, 0), wg_op, p);
+      }
+      else
+      {
+        p->curr.execWidth = 8;
+        p->curr.predicate = GEN_PREDICATE_NONE;
+        p->ADD(threadLoop, threadLoop, GenRegister::immd(-1));
+        p->MUL(msgAddr, threadLoop, GenRegister::immd(0x4));
+        p->ADD(msgAddr, msgAddr, msgSlmOff);
+        p->UNTYPED_READ(msgData, msgAddr, GenRegister::immw(0xFE), 1);
 
-      /* Perform operation, process 4 elements, partialData will hold result */
-      workgroupOp(partialData, partialData, msgData.offset(msgData, 0), wg_op, p);
+        /* Perform operation, partialData will hold result */
+        workgroupOp(partialData, partialData, msgData.offset(msgData, 0), wg_op, p);
+      }
 
       /* While threadN is not 0, cycle read SLM / update value */
       p->curr.noMask = 1;
diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp
index 12a0cf4..3fe0465 100644
--- a/backend/src/backend/gen_insn_selection.cpp
+++ b/backend/src/backend/gen_insn_selection.cpp
@@ -6462,9 +6462,9 @@ namespace gbe
       GBE_ASSERT(srcNum == 3);
       GBE_ASSERT(insn.getSrc(0) == ir::ocl::threadn);
       GBE_ASSERT(insn.getSrc(1) == ir::ocl::threadid);
-      GenRegister tmp = GenRegister::retype(sel.selReg(sel.reg(FAMILY_DWORD)), type);
-      GenRegister data = sel.selReg(sel.reg(FAMILY_DWORD), type);
-      GenRegister slmOff = sel.selReg(sel.reg(FAMILY_DWORD), ir::TYPE_U32);
+      GenRegister tmp = GenRegister::retype(sel.selReg(sel.reg(FAMILY_QWORD)), type);
+      GenRegister data = sel.selReg(sel.reg(FAMILY_QWORD), type);
+      GenRegister slmOff = sel.selReg(sel.reg(FAMILY_QWORD), ir::TYPE_U32);
 
       vector<GenRegister> msg;
       for(uint32_t i = 0; i < 6; i++)
-- 
2.5.0