[Beignet] [PATCH 1/2] GBE: Refine uniform load logic.

Ruiling Song ruiling.song at intel.com
Fri Jul 11 01:23:07 PDT 2014


Currently, many dataport messages do not support uniform
load/store, so we need to add a move instruction before
or after a dataport message to move data between uniform
and varying registers. The idea behind this patch is that
if a dataport message cannot handle a uniform register,
we explicitly handle the uniform register before the message.

Signed-off-by: Ruiling Song <ruiling.song at intel.com>
---
 backend/src/backend/gen_insn_selection.cpp |  174 ++++++++++++++++++++--------
 backend/src/backend/gen_reg_allocation.cpp |    2 +
 2 files changed, 127 insertions(+), 49 deletions(-)

diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp
index fb041de..565f203 100644
--- a/backend/src/backend/gen_insn_selection.cpp
+++ b/backend/src/backend/gen_insn_selection.cpp
@@ -904,7 +904,7 @@ namespace gbe
 
   ir::Register Selection::Opaque::replaceSrc(SelectionInstruction *insn, uint32_t regID, ir::Type type, bool needMov) {
     SelectionBlock *block = insn->parent;
-    const uint32_t simdWidth = insn->state.execWidth;
+    const uint32_t simdWidth =this->isScalarReg(insn->src(regID).reg()) ? 1 : insn->state.execWidth;
     ir::Register tmp;
     GenRegister gr;
 
@@ -939,7 +939,7 @@ namespace gbe
     ir::Register tmp;
     GenRegister gr;
     this->block = block;
-    tmp = this->reg(ir::getFamily(type));
+    tmp = this->reg(ir::getFamily(type),simdWidth == 1);
     gr = this->selReg(tmp, type);
     if (needMov) {
     // Generate the MOV instruction and replace the register in the instruction
@@ -2693,6 +2693,29 @@ namespace gbe
     }
   }
 
+  GenRegister replaceUniformSource(Selection::Opaque &sel, ir::Register reg, ir::Type type) {
+    const bool isUniform = sel.isScalarReg(reg);
+    ir::RegisterFamily family = ir::getFamily(type);
+    const GenRegister uniform = sel.selReg(reg, type);
+    if(isUniform) {
+      const GenRegister varying = sel.selReg(sel.reg(family), type);
+      sel.push();
+        sel.curr.noMask = 1;
+        sel.MOV(varying, uniform);
+      sel.pop();
+      return varying;
+    }
+    return uniform;
+  }
+
+  void moveVarying2Uniform(Selection::Opaque &sel, GenRegister uniform, GenRegister varying) {
+    sel.push();
+      sel.curr.noMask = 1;
+      sel.curr.execWidth = 1;
+      sel.MOV(uniform, varying);
+    sel.pop();
+  }
+
   /*! Load instruction pattern */
   DECL_PATTERN(LoadInstruction)
   {
@@ -2704,9 +2727,31 @@ namespace gbe
       using namespace ir;
       const uint32_t valueNum = insn.getValueNum();
       vector<GenRegister> dst(valueNum);
-      for (uint32_t dstID = 0; dstID < valueNum; ++dstID)
-        dst[dstID] = GenRegister::retype(sel.selReg(insn.getValue(dstID)), GEN_TYPE_F);
-      sel.UNTYPED_READ(addr, dst.data(), valueNum, bti);
+      bool isUniformDst = sel.isScalarReg(insn.getValue(0));
+      GenRegister tmpAddr = replaceUniformSource(sel, addr.reg(), ir::TYPE_FLOAT);
+
+      if(isUniformDst) {
+        for (uint32_t dstID = 0; dstID < valueNum; ++dstID)
+          dst[dstID] = sel.selReg(sel.reg(FAMILY_DWORD), ir::TYPE_FLOAT);
+      } else {
+        for (uint32_t dstID = 0; dstID < valueNum; ++dstID)
+          dst[dstID] = sel.selReg(insn.getValue(dstID), ir::TYPE_FLOAT);
+      }
+
+      sel.push();
+        if(isUniformDst)
+          sel.curr.noMask = 1;
+        sel.UNTYPED_READ(tmpAddr, dst.data(), valueNum, bti);
+      sel.pop();
+
+      if(isUniformDst) {
+        sel.push();
+          sel.curr.noMask = 1;
+          sel.curr.execWidth = 1;
+          for (uint32_t dstID = 0; dstID < valueNum; ++dstID)
+            sel.MOV(sel.selReg(insn.getValue(dstID), ir::TYPE_FLOAT), GenRegister::vec1(dst[dstID]));
+        sel.pop();
+      }
     }
 
     void emitDWordGather(Selection::Opaque &sel,
@@ -2715,21 +2760,32 @@ namespace gbe
                          uint32_t bti) const
     {
       using namespace ir;
-      const uint32_t simdWidth = sel.isScalarReg(insn.getValue(0)) ? 1 : sel.ctx.getSimdWidth();
+      const bool isUniformDst = sel.isScalarReg(insn.getValue(0));
+      const bool isUniformSrc = sel.isScalarReg(addr.reg());
+
       GBE_ASSERT(insn.getValueNum() == 1);
       GenRegister dst = GenRegister::retype(sel.selReg(insn.getValue(0)), GEN_TYPE_F);
-      // get dword based address
-      GenRegister addrDW = GenRegister::udxgrf(simdWidth, sel.reg(FAMILY_DWORD, simdWidth == 1));
+      // Get dword based address
+      GenRegister addrDW = GenRegister::udxgrf(sel.ctx.getSimdWidth(), sel.reg(FAMILY_DWORD));
 
       sel.push();
-        if (simdWidth == 1) {
+        if (isUniformSrc) {
           sel.curr.noMask = 1;
-          sel.curr.execWidth = 1;
         }
         sel.SHR(addrDW, GenRegister::retype(addr, GEN_TYPE_UD), GenRegister::immud(2));
       sel.pop();
 
-      sel.DWORD_GATHER(dst, addrDW, bti);
+      if(isUniformDst) {
+        GenRegister tmp = GenRegister::fxgrf(sel.ctx.getSimdWidth(), sel.reg(FAMILY_DWORD));
+        sel.push();
+          sel.curr.noMask = 1;
+          sel.DWORD_GATHER(tmp, addrDW, bti);
+          sel.curr.execWidth = 1;
+          sel.MOV(dst, GenRegister::vec1(tmp));
+        sel.pop();
+      } else
+        sel.DWORD_GATHER(dst, addrDW, bti);
+
     }
 
     void emitRead64(Selection::Opaque &sel,
@@ -2741,24 +2797,29 @@ namespace gbe
       const uint32_t valueNum = insn.getValueNum();
       /* XXX support scalar only right now. */
       GBE_ASSERT(valueNum == 1);
+      GenRegister tmpAddr = replaceUniformSource(sel, addr.reg(), ir::TYPE_FLOAT);
 
+      /// TODO support uniform load for int64
+      GBE_ASSERT(sel.isScalarReg(insn.getValue(0)) == false);
       GenRegister dst[valueNum];
       for ( uint32_t dstID = 0; dstID < valueNum; ++dstID)
         dst[dstID] = sel.selReg(insn.getValue(dstID), ir::TYPE_U64);
-      sel.READ64(addr, dst, valueNum, bti);
+      sel.READ64(tmpAddr, dst, valueNum, bti);
     }
 
     void emitByteGather(Selection::Opaque &sel,
                         const ir::LoadInstruction &insn,
                         const uint32_t elemSize,
-                        GenRegister address,
+                        GenRegister addr,
                         uint32_t bti) const
     {
       using namespace ir;
       const uint32_t valueNum = insn.getValueNum();
-      const uint32_t simdWidth = sel.isScalarReg(insn.getValue(0)) ?
-                                 1 : sel.ctx.getSimdWidth();
+      const bool isUniformDst = sel.isScalarReg(insn.getValue(0));
+
       if(valueNum > 1) {
+        GenRegister tmpAddr = replaceUniformSource(sel, addr.reg(), ir::TYPE_FLOAT);
+
         vector<GenRegister> dst(valueNum);
         const uint32_t typeSize = getFamilySize(getFamily(insn.getValueType()));
 
@@ -2773,10 +2834,13 @@ namespace gbe
         uint32_t tmpRegNum = typeSize*valueNum / 4;
         vector<GenRegister> tmp(tmpRegNum);
         for(uint32_t i = 0; i < tmpRegNum; i++) {
-          tmp[i] = GenRegister::udxgrf(simdWidth, sel.reg(FAMILY_DWORD));
+          tmp[i] = sel.selReg(sel.reg(FAMILY_DWORD), ir::TYPE_U32);
         }
-
-        sel.UNTYPED_READ(address, tmp.data(), tmpRegNum, bti);
+        sel.push();
+          if(isUniformDst)
+            sel.curr.noMask = 1;
+          sel.UNTYPED_READ(tmpAddr, tmp.data(), tmpRegNum, bti);
+        sel.pop();
 
         for(uint32_t i = 0; i < tmpRegNum; i++) {
           sel.UNPACK_BYTE(dst.data() + i * 4/typeSize, tmp[i], 4/typeSize);
@@ -2784,35 +2848,39 @@ namespace gbe
      } else {
         GBE_ASSERT(insn.getValueNum() == 1);
         const GenRegister value = sel.selReg(insn.getValue(0));
+        const bool isUniformAddr = sel.isScalarReg(addr.reg());
         GBE_ASSERT(elemSize == GEN_BYTE_SCATTER_WORD || elemSize == GEN_BYTE_SCATTER_BYTE);
 
-        Register tmpReg = sel.reg(FAMILY_DWORD, simdWidth == 1);
-        GenRegister tmpAddr = GenRegister::udxgrf(simdWidth, sel.reg(FAMILY_DWORD, simdWidth == 1));
-        GenRegister tmpData = GenRegister::udxgrf(simdWidth, tmpReg);
+        Register tmpReg = sel.reg(FAMILY_DWORD);
+        GenRegister tmpAddr = GenRegister::udxgrf(sel.ctx.getSimdWidth(), sel.reg(FAMILY_DWORD));
+        GenRegister tmpData = GenRegister::udxgrf(sel.ctx.getSimdWidth(), tmpReg);
         // Get dword aligned addr
         sel.push();
-          if (simdWidth == 1) {
-            sel.curr.execWidth = 1;
+          if (isUniformAddr)
             sel.curr.noMask = 1;
-          }
-          sel.AND(tmpAddr, GenRegister::retype(address,GEN_TYPE_UD), GenRegister::immud(0xfffffffc));
+          sel.AND(tmpAddr, addr, GenRegister::immud(0xfffffffc));
         sel.pop();
+
         sel.push();
-          if (simdWidth == 1)
+          if (isUniformDst)
             sel.curr.noMask = 1;
           sel.UNTYPED_READ(tmpAddr, &tmpData, 1, bti);
 
-          if (simdWidth == 1)
+          if (isUniformDst) {
             sel.curr.execWidth = 1;
+            addr = GenRegister::vec1(addr);
+            tmpAddr = GenRegister::vec1(tmpAddr);
+            tmpData = GenRegister::vec1(tmpData);
+          }
           // Get the remaining offset from aligned addr
-          sel.AND(tmpAddr, GenRegister::retype(address,GEN_TYPE_UD), GenRegister::immud(0x3));
+          sel.AND(tmpAddr, addr, GenRegister::immud(0x3));
           sel.SHL(tmpAddr, tmpAddr, GenRegister::immud(0x3));
           sel.SHR(tmpData, tmpData, tmpAddr);
 
           if (elemSize == GEN_BYTE_SCATTER_WORD)
-            sel.MOV(GenRegister::retype(value, GEN_TYPE_UW), sel.unpacked_uw(tmpReg));
+            sel.MOV(GenRegister::retype(value, GEN_TYPE_UW), GenRegister::unpacked_uw(tmpReg, isUniformDst));
           else if (elemSize == GEN_BYTE_SCATTER_BYTE)
-            sel.MOV(GenRegister::retype(value, GEN_TYPE_UB), sel.unpacked_ub(tmpReg));
+            sel.MOV(GenRegister::retype(value, GEN_TYPE_UB), GenRegister::unpacked_ub(tmpReg, isUniformDst));
         sel.pop();
       }
     }
@@ -2837,7 +2905,7 @@ namespace gbe
                  insn.getAddressSpace() == MEM_CONSTANT ||
                  insn.getAddressSpace() == MEM_PRIVATE ||
                  insn.getAddressSpace() == MEM_LOCAL);
-      //GBE_ASSERT(sel.isScalarReg(insn.getValue(0)) == false);
+
       const Type type = insn.getValueType();
       const uint32_t elemSize = getByteScatterGatherSize(type);
       if(space == MEM_LOCAL && sel.needPatchSLMAddr()) {
@@ -2880,11 +2948,12 @@ namespace gbe
       using namespace ir;
       const uint32_t valueNum = insn.getValueNum();
       vector<GenRegister> value(valueNum);
+      GenRegister tmpAddr = replaceUniformSource(sel, addr.reg(), ir::TYPE_FLOAT);
 
-      addr = GenRegister::retype(addr, GEN_TYPE_F);
       for (uint32_t valueID = 0; valueID < valueNum; ++valueID)
-        value[valueID] = GenRegister::retype(sel.selReg(insn.getValue(valueID)), GEN_TYPE_F);
-      sel.UNTYPED_WRITE(addr, value.data(), valueNum, bti);
+        value[valueID] = replaceUniformSource(sel, insn.getValue(valueID), ir::TYPE_FLOAT);
+
+      sel.UNTYPED_WRITE(tmpAddr, value.data(), valueNum, bti);
     }
 
     void emitWrite64(Selection::Opaque &sel,
@@ -2896,12 +2965,12 @@ namespace gbe
       const uint32_t valueNum = insn.getValueNum();
       /* XXX support scalar only right now. */
       GBE_ASSERT(valueNum == 1);
-      addr = GenRegister::retype(addr, GEN_TYPE_UD);
       GenRegister src[valueNum];
+      GenRegister tmpAddr = replaceUniformSource(sel, addr.reg(), TYPE_U32);
 
       for (uint32_t valueID = 0; valueID < valueNum; ++valueID)
-        src[valueID] = sel.selReg(insn.getValue(valueID), ir::TYPE_U64);
-      sel.WRITE64(addr, src, valueNum, bti);
+        src[valueID] = replaceUniformSource(sel, insn.getValue(valueID), ir::TYPE_U64);
+      sel.WRITE64(tmpAddr, src, valueNum, bti);
     }
 
     void emitByteScatter(Selection::Opaque &sel,
@@ -2914,6 +2983,8 @@ namespace gbe
       const uint32_t simdWidth = sel.ctx.getSimdWidth();
       uint32_t valueNum = insn.getValueNum();
 
+      GenRegister tmpAddr = replaceUniformSource(sel, addr.reg(), ir::TYPE_FLOAT);
+
       if(valueNum > 1) {
         const uint32_t typeSize = getFamilySize(getFamily(insn.getValueType()));
         vector<GenRegister> value(valueNum);
@@ -2933,7 +3004,7 @@ namespace gbe
           sel.PACK_BYTE(tmp[i], value.data() + i * 4/typeSize, 4/typeSize);
         }
 
-        sel.UNTYPED_WRITE(addr, tmp.data(), tmpRegNum, bti);
+        sel.UNTYPED_WRITE(tmpAddr, tmp.data(), tmpRegNum, bti);
       } else {
         const GenRegister value = sel.selReg(insn.getValue(0));
         GBE_ASSERT(insn.getValueNum() == 1);
@@ -2943,7 +3014,7 @@ namespace gbe
         } else if (elemSize == GEN_BYTE_SCATTER_BYTE) {
           sel.MOV(tmp, GenRegister::retype(value, GEN_TYPE_UB));
         }
-        sel.BYTE_SCATTER(addr, tmp, elemSize, bti);
+        sel.BYTE_SCATTER(tmpAddr, tmp, elemSize, bti);
       }
     }
 
@@ -3367,18 +3438,23 @@ namespace gbe
       const AddressSpace space = insn.getAddressSpace();
       const uint32_t bti = space == MEM_LOCAL ? 0xfe : 0x01;
       const uint32_t srcNum = insn.getSrcNum();
-      GenRegister src0 = sel.selReg(insn.getSrc(0), TYPE_U32);   //address
+      GenRegister src0 = replaceUniformSource(sel, insn.getSrc(0), TYPE_U32); //address
       GenRegister src1 = src0, src2 = src0;
-      if(srcNum > 1) src1 = sel.selReg(insn.getSrc(1), TYPE_U32);
-      if(srcNum > 2) src2 = sel.selReg(insn.getSrc(2), TYPE_U32);
+      if(srcNum > 1) src1 = replaceUniformSource(sel, insn.getSrc(1), TYPE_U32);
+      if(srcNum > 2) src2 = replaceUniformSource(sel, insn.getSrc(2), TYPE_U32);
+
       GenRegister dst  = sel.selReg(insn.getDst(0), TYPE_U32);
+      const bool isUniformDst = sel.isScalarReg(insn.getDst(0));
+      GenRegister tmpDst = isUniformDst ? sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32) : dst;
+
       GenAtomicOpCode genAtomicOp = (GenAtomicOpCode)atomicOp;
       if(space == MEM_LOCAL && sel.needPatchSLMAddr()){
         GenRegister temp = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32);
         sel.ADD(temp, src0, sel.selReg(ocl::slmoffset, ir::TYPE_U32));
         src0 = temp;
       }
-      sel.ATOMIC(dst, genAtomicOp, srcNum, src0, src1, src2, bti);
+      sel.ATOMIC(tmpDst, genAtomicOp, srcNum, src0, src1, src2, bti);
+      if(isUniformDst) moveVarying2Uniform(sel, dst, tmpDst);
       return true;
     }
     DECL_CTOR(AtomicInstruction, 1, 1);
@@ -3589,12 +3665,12 @@ namespace gbe
       if (insn.getSamplerOffset() != 0) {
         // U, lod, [V], [W]
         GBE_ASSERT(insn.getSrcType() != TYPE_FLOAT);
-        msgPayloads[0] = sel.selReg(insn.getSrc(0), insn.getSrcType());
+        msgPayloads[0] = replaceUniformSource(sel, insn.getSrc(0), insn.getSrcType());
         msgPayloads[1] = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32);
         if (srcNum > 1)
-          msgPayloads[2] = sel.selReg(insn.getSrc(1), insn.getSrcType());
+          msgPayloads[2] = replaceUniformSource(sel, insn.getSrc(1), insn.getSrcType());
         if (srcNum > 2)
-          msgPayloads[3] = sel.selReg(insn.getSrc(2), insn.getSrcType());
+          msgPayloads[3] = replaceUniformSource(sel, insn.getSrc(2), insn.getSrcType());
         // Clear the lod to zero.
         sel.MOV(msgPayloads[1], GenRegister::immud(0));
         msgLen = srcNum + 1;
@@ -3602,7 +3678,7 @@ namespace gbe
         // U, V, [W]
         GBE_ASSERT(insn.getSrcType() == TYPE_FLOAT);
         for (valueID = 0; valueID < srcNum; ++valueID)
-          msgPayloads[valueID] = sel.selReg(insn.getSrc(valueID), insn.getSrcType());
+          msgPayloads[valueID] = replaceUniformSource(sel, insn.getSrc(valueID), insn.getSrcType());
         msgLen = srcNum;
       }
       // We switch to a fixup bti for linear filter on a image1d array sampling.
@@ -3637,7 +3713,7 @@ namespace gbe
         uint32_t valueID = 0;
         msgs[0] = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32);
         for(uint32_t msgID = 1; msgID < 1 + coordNum; msgID++, valueID++)
-          msgs[msgID] = sel.selReg(insn.getSrc(msgID - 1), insn.getCoordType());
+          msgs[msgID] = replaceUniformSource(sel, insn.getSrc(msgID - 1), insn.getCoordType());
 
         // fake u.
         if (insn.getSrc(1) == ir::ocl::invalid)
@@ -3648,7 +3724,7 @@ namespace gbe
         // LOD.
         msgs[4] = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32);
         for(uint32_t msgID = 5; valueID < insn.getSrcNum(); msgID++, valueID++)
-          msgs[msgID] = sel.selReg(insn.getSrc(valueID), insn.getSrcType());
+          msgs[msgID] = replaceUniformSource(sel, insn.getSrc(valueID), insn.getSrcType());
       }
 
       sel.push();
diff --git a/backend/src/backend/gen_reg_allocation.cpp b/backend/src/backend/gen_reg_allocation.cpp
index b7fbc93..e1e5b93 100644
--- a/backend/src/backend/gen_reg_allocation.cpp
+++ b/backend/src/backend/gen_reg_allocation.cpp
@@ -315,6 +315,8 @@ namespace gbe
       else {
         ir::Register tmp;
         ir::Type type = getIRType(vector->reg[regID].type);
+        // We currently don't have scalar register in SelectionVector
+        GBE_ASSERT(ctx.sel->isScalarReg(reg) == false);
         tmp = this->replaceReg(selection, vector->insn, regID, vector->isSrc, type);
         const VectorLocation location = std::make_pair(vector, regID);
         this->vectorMap.insert(std::make_pair(tmp, location));
-- 
1.7.10.4



More information about the Beignet mailing list