[Beignet] [PATCH V3] GBE: Support 64Bit register spill.

Ruiling Song ruiling.song at intel.com
Fri Feb 14 06:37:34 CET 2014


Now we support DWORD & QWORD register spill/fill.

v2:
  only increment poolOffset by 1 when we meet a QWORD register and poolOffset is 1.

v3:
  allocate from one unified reserved register pool for both src and dst registers.
  when spilling a qword register, the payload register should be retyped as dword, as required by the bottom/top half logic.
  put a limit on the scratch space memory size.

Signed-off-by: Ruiling Song <ruiling.song at intel.com>
---
 backend/src/backend/gen_context.cpp        |   45 ++++++++++++++++++----
 backend/src/backend/gen_insn_selection.cpp |   56 +++++++++++++++-------------
 backend/src/backend/gen_reg_allocation.cpp |   17 ++++++---
 src/cl_command_queue_gen7.c                |    3 +-
 4 files changed, 81 insertions(+), 40 deletions(-)

diff --git a/backend/src/backend/gen_context.cpp b/backend/src/backend/gen_context.cpp
index d72b19b..7a74856 100644
--- a/backend/src/backend/gen_context.cpp
+++ b/backend/src/backend/gen_context.cpp
@@ -1662,14 +1662,28 @@ namespace gbe
     GenRegister payload = src;
     payload.nr = header + 1;
     payload.subnr = 0;
+
     GBE_ASSERT(src.subnr == 0);
-    if (payload.nr != src.nr)
-      p->MOV(payload, src);
     uint32_t regType = insn.src(0).type;
     uint32_t size = typeSize(regType);
-    assert(size <= 4);
-    uint32_t regNum = (stride(src.hstride)*size*simdWidth) > 32 ? 2 : 1;
-    this->scratchWrite(msg, scratchOffset, regNum, regType, GEN_SCRATCH_CHANNEL_MODE_DWORD);
+    uint32_t regSize = stride(src.hstride)*size;
+
+    GBE_ASSERT(regSize == 4 || regSize == 8);
+    if(regSize == 4) {
+      if (payload.nr != src.nr)
+        p->MOV(payload, src);
+      uint32_t regNum = (regSize*simdWidth) > 32 ? 2 : 1;
+      this->scratchWrite(msg, scratchOffset, regNum, GEN_TYPE_UD, GEN_SCRATCH_CHANNEL_MODE_DWORD);
+    }
+    else { // regSize == 8
+      payload.type = GEN_TYPE_UD;
+      GBE_ASSERT(payload.hstride == GEN_HORIZONTAL_STRIDE_1);
+      loadBottomHalf(payload, src);
+      uint32_t regNum = (regSize/2*simdWidth) > 32 ? 2 : 1;
+      this->scratchWrite(msg, scratchOffset, regNum, GEN_TYPE_UD, GEN_SCRATCH_CHANNEL_MODE_DWORD);
+      loadTopHalf(payload, src);
+      this->scratchWrite(msg, scratchOffset + 4*simdWidth, regNum, GEN_TYPE_UD, GEN_SCRATCH_CHANNEL_MODE_DWORD);
+    }
     p->pop();
   }
 
@@ -1680,10 +1694,25 @@ namespace gbe
     uint32_t simdWidth = p->curr.execWidth;
     const uint32_t header = insn.extra.scratchMsgHeader;
     uint32_t size = typeSize(regType);
-    assert(size <= 4);
-    uint32_t regNum = (stride(dst.hstride)*size*simdWidth) > 32 ? 2 : 1;
+    uint32_t regSize = stride(dst.hstride)*size;
+
     const GenRegister msg = GenRegister::ud8grf(header, 0);
-    this->scratchRead(GenRegister::retype(dst, GEN_TYPE_UD), msg, scratchOffset, regNum, regType, GEN_SCRATCH_CHANNEL_MODE_DWORD);
+    GenRegister payload = msg;
+    payload.nr = header + 1;
+
+    p->push();
+    assert(regSize == 4 || regSize == 8);
+    if(regSize == 4) {
+      uint32_t regNum = (regSize*simdWidth) > 32 ? 2 : 1;
+      this->scratchRead(GenRegister::ud8grf(dst.nr, dst.subnr), msg, scratchOffset, regNum, GEN_TYPE_UD, GEN_SCRATCH_CHANNEL_MODE_DWORD);
+    } else {
+      uint32_t regNum = (regSize/2*simdWidth) > 32 ? 2 : 1;
+      this->scratchRead(payload, msg, scratchOffset, regNum, GEN_TYPE_UD, GEN_SCRATCH_CHANNEL_MODE_DWORD);
+      storeBottomHalf(dst, payload);
+      this->scratchRead(payload, msg, scratchOffset + 4*simdWidth, regNum, GEN_TYPE_UD, GEN_SCRATCH_CHANNEL_MODE_DWORD);
+      storeTopHalf(dst, payload);
+    }
+    p->pop();
   }
 
   //  For SIMD8, we allocate 2*elemNum temporary registers from dst(0), and
diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp
index 54e5ebe..256224b 100644
--- a/backend/src/backend/gen_insn_selection.cpp
+++ b/backend/src/backend/gen_insn_selection.cpp
@@ -670,18 +670,9 @@ namespace gbe
     return vector;
   }
 
-  // FIXME, there is a risk need to be fixed here.
-  // as the instruction we spill here is the gen ir level not the final
-  // single instruction. If it will be translated to multiple instructions
-  // at gen_context stage, and as the destination registers and source registers
-  // may be spilled to the same register based on current implementation,
-  // then the source register may be modified within the final instruction and
-  // may lead to incorrect result.
   bool Selection::Opaque::spillRegs(const SpilledRegs &spilledRegs,
                                     uint32_t registerPool) {
     GBE_ASSERT(registerPool != 0);
-    const uint32_t dstStart = registerPool + 1;
-    const uint32_t srcStart = registerPool + 1;
 
     for (auto &block : blockList)
       for (auto &insn : block.insnList) {
@@ -693,17 +684,19 @@ namespace gbe
         const uint32_t srcNum = insn.srcNum, dstNum = insn.dstNum;
         struct RegSlot {
           RegSlot(ir::Register _reg, uint8_t _srcID,
-                  bool _isTmp, uint32_t _addr)
-                 : reg(_reg), srcID(_srcID), isTmpReg(_isTmp), addr(_addr)
+                   uint8_t _poolOffset, bool _isTmp, uint32_t _addr)
+                 : reg(_reg), srcID(_srcID), poolOffset(_poolOffset), isTmpReg(_isTmp), addr(_addr)
           {};
           ir::Register reg;
           union {
             uint8_t srcID;
             uint8_t dstID;
           };
+          uint8_t poolOffset;
           bool isTmpReg;
           int32_t addr;
         };
+        uint8_t poolOffset = 1; // keep one for scratch message header
         vector <struct RegSlot> regSet;
         for (uint32_t srcID = 0; srcID < srcNum; ++srcID) {
           const GenRegister selReg = insn.src(srcID);
@@ -712,18 +705,27 @@ namespace gbe
           if(it != spilledRegs.end()
              && selReg.file == GEN_GENERAL_REGISTER_FILE
              && selReg.physical == 0) {
-            struct RegSlot regSlot(reg, srcID,
+            ir::RegisterFamily family = getRegisterFamily(reg);
+            if(family == ir::FAMILY_QWORD && poolOffset == 1) {
+              poolOffset += 1; // qword register fill could not share the scratch read message payload register
+            }
+            struct RegSlot regSlot(reg, srcID, poolOffset,
                                    it->second.isTmpReg,
                                    it->second.addr);
+            if(family == ir::FAMILY_QWORD) {
+              poolOffset += 2;
+            } else {
+              poolOffset += 1;
+            }
             regSet.push_back(regSlot);
           }
         }
 
-        if (regSet.size() > 5)
+        if (poolOffset > 6) {
+          std::cerr << "Instruction (#" << (uint32_t)insn.opcode << ") src too large pooloffset " << (uint32_t)poolOffset << std::endl;
           return false;
-
+        }
         while(!regSet.empty()) {
-          uint32_t scratchID = regSet.size() - 1;
           struct RegSlot regSlot = regSet.back();
           regSet.pop_back();
           const GenRegister selReg = insn.src(regSlot.srcID);
@@ -732,7 +734,7 @@ namespace gbe
             SelectionInstruction *unspill = this->create(SEL_OP_UNSPILL_REG, 1, 0);
             unspill->state  = GenInstructionState(ctx.getSimdWidth());
             unspill->dst(0) = GenRegister(GEN_GENERAL_REGISTER_FILE,
-                                          srcStart + scratchID, 0,
+                                          registerPool + regSlot.poolOffset, 0,
                                           selReg.type, selReg.vstride,
                                           selReg.width, selReg.hstride);
             unspill->extra.scratchOffset = regSlot.addr;
@@ -742,7 +744,7 @@ namespace gbe
 
           GenRegister src = insn.src(regSlot.srcID);
           // change nr/subnr, keep other register settings
-          src.nr = srcStart + scratchID; src.subnr = 0; src.physical = 1;
+          src.nr = registerPool + regSlot.poolOffset; src.subnr = 0; src.physical = 1;
           insn.src(regSlot.srcID) = src;
         };
 
@@ -756,7 +758,6 @@ namespace gbe
           instruction. Thus the registerPool + 1 still contain valid
           data.
          */
-
         for (uint32_t dstID = 0; dstID < dstNum; ++dstID) {
           const GenRegister selReg = insn.dst(dstID);
           const ir::Register reg = selReg.reg();
@@ -764,18 +765,24 @@ namespace gbe
           if(it != spilledRegs.end()
              && selReg.file == GEN_GENERAL_REGISTER_FILE
              && selReg.physical == 0) {
-            struct RegSlot regSlot(reg, dstID,
+            ir::RegisterFamily family = getRegisterFamily(reg);
+            if(family == ir::FAMILY_QWORD && poolOffset == 1) {
+              poolOffset += 1; // qword register spill could not share the scratch write message payload register
+            }
+            struct RegSlot regSlot(reg, dstID, poolOffset,
                                    it->second.isTmpReg,
                                    it->second.addr);
+            if(family == ir::FAMILY_QWORD) poolOffset +=2;
+            else poolOffset += 1;
             regSet.push_back(regSlot);
           }
         }
 
-        if (regSet.size() > 5)
+        if (poolOffset > 6){
+          std::cerr << "Instruction (#" << (uint32_t)insn.opcode << ") dst too large pooloffset " << (uint32_t)poolOffset << std::endl;
           return false;
-
+        }
         while(!regSet.empty()) {
-          uint32_t scratchID = regSet.size() - 1;
           struct RegSlot regSlot = regSet.back();
           regSet.pop_back();
           const GenRegister selReg = insn.dst(regSlot.dstID);
@@ -784,7 +791,7 @@ namespace gbe
             SelectionInstruction *spill = this->create(SEL_OP_SPILL_REG, 0, 1);
             spill->state  = GenInstructionState(ctx.getSimdWidth());
             spill->src(0) = GenRegister(GEN_GENERAL_REGISTER_FILE,
-                                        dstStart + scratchID, 0,
+                                        registerPool + regSlot.poolOffset, 0,
                                         selReg.type, selReg.vstride,
                                         selReg.width, selReg.hstride);
             spill->extra.scratchOffset = regSlot.addr;
@@ -794,9 +801,8 @@ namespace gbe
 
           GenRegister dst = insn.dst(regSlot.dstID);
           // change nr/subnr, keep other register settings
-          dst.physical =1; dst.nr = dstStart + scratchID; dst.subnr = 0;
+          dst.physical =1; dst.nr = registerPool + regSlot.poolOffset; dst.subnr = 0;
           insn.dst(regSlot.dstID)= dst;
-          scratchID++;
         }
       }
     return true;
diff --git a/backend/src/backend/gen_reg_allocation.cpp b/backend/src/backend/gen_reg_allocation.cpp
index 2aafdb1..b94b809 100644
--- a/backend/src/backend/gen_reg_allocation.cpp
+++ b/backend/src/backend/gen_reg_allocation.cpp
@@ -234,7 +234,7 @@ namespace gbe
     uint32_t grfOffset = allocateReg(interval, regSize, regSize);
     if (grfOffset == 0) {
       /* this register is going to be spilled. */
-      GBE_ASSERT(!(reservedReg && family != ir::FAMILY_DWORD));
+      GBE_ASSERT(!(reservedReg && family != ir::FAMILY_DWORD && family != ir::FAMILY_QWORD));
       return false;
     }
     insertNewReg(reg, grfOffset);
@@ -617,7 +617,8 @@ namespace gbe
        ir::RegisterFamily family;
        getRegAttrib(reg, regSize, &family);
 
-       if (regSize == GEN_REG_SIZE && family == ir::FAMILY_DWORD /*&& !isVector*/) {
+       if ((regSize == GEN_REG_SIZE && family == ir::FAMILY_DWORD)
+          || (regSize == 2*GEN_REG_SIZE && family == ir::FAMILY_QWORD)) {
          GBE_ASSERT(offsetReg.find(grfOffset) == offsetReg.end());
          offsetReg.insert(std::make_pair(grfOffset, reg));
          spillCandidate.insert(intervals[reg]);
@@ -639,7 +640,8 @@ namespace gbe
     if (!spillTag.isTmpReg) {
       // FIXME, we can optimize scratch allocation according to
       // the interval information.
-      spillTag.addr = ctx.allocateScratchMem(typeSize(GEN_TYPE_D)
+      ir::RegisterFamily family = ctx.sel->getRegisterFamily(interval.reg);
+      spillTag.addr = ctx.allocateScratchMem(getFamilySize(family)
                                              * ctx.getSimdWidth());
     } else
       spillTag.addr = -1;
@@ -682,6 +684,7 @@ namespace gbe
       auto vectorIt = vectorMap.find(reg);
       bool isVector = vectorIt != vectorMap.end();
       bool needRestart = false;
+      ir::RegisterFamily family = ctx.sel->getRegisterFamily(reg);
       if (isVector
           && (vectorCanSpill(vectorIt->second.first))) {
         const SelectionVector *vector = vectorIt->second.first;
@@ -690,11 +693,12 @@ namespace gbe
                      == spilledRegs.end());
           spillSet.insert(vector->reg[id].reg());
           reg = vector->reg[id].reg();
-          size -= GEN_REG_SIZE;
+          family = ctx.sel->getRegisterFamily(reg);
+          size -= family == ir::FAMILY_QWORD ? 2*GEN_REG_SIZE : GEN_REG_SIZE;
         }
       } else if (!isVector) {
         spillSet.insert(reg);
-        size -= GEN_REG_SIZE;
+        size -= family == ir::FAMILY_QWORD ? 2*GEN_REG_SIZE : GEN_REG_SIZE;
       } else
         needRestart = true; // is a vector which could not be spilled.
 
@@ -702,7 +706,8 @@ namespace gbe
         break;
       if (!needRestart) {
         uint32_t offset = RA.find(reg)->second;
-        auto nextRegIt = offsetReg.find(offset + GEN_REG_SIZE);
+        uint32_t nextOffset = (family == ir::FAMILY_QWORD) ? (offset + 2*GEN_REG_SIZE) : (offset + GEN_REG_SIZE);
+        auto nextRegIt = offsetReg.find(nextOffset);
         if (nextRegIt != offsetReg.end())
           reg = nextRegIt->second;
         else
diff --git a/src/cl_command_queue_gen7.c b/src/cl_command_queue_gen7.c
index ba69589..975edc6 100644
--- a/src/cl_command_queue_gen7.c
+++ b/src/cl_command_queue_gen7.c
@@ -259,7 +259,8 @@ static void
 cl_setup_scratch(cl_gpgpu gpgpu, cl_kernel ker)
 {
   int32_t scratch_sz = gbe_kernel_get_scratch_size(ker->opaque);
-
+  /* Per HW Spec, it only allows 12KB scratch memory per HW thread now */
+  assert(scratch_sz < 12*1024);
   cl_gpgpu_set_scratch(gpgpu, scratch_sz);
 }
 
-- 
1.7.9.5



More information about the Beignet mailing list