[Beignet] [PATCH v2 1/2] Support 64-bit float

Homer Hsing homer.xing at intel.com
Thu Jun 6 18:59:41 PDT 2013


support 64-bit arithmetic, store, load, and immediate value

example:

  kernel void f(global double *src, global double *dst) {
    int i = get_global_id(0);
    double d = 1.234567890123456789;
    dst[i] = d * (src[i] + d);
  }

Signed-off-by: Homer Hsing <homer.xing at intel.com>
---
 backend/src/backend/gen_context.cpp        |  16 ++++-
 backend/src/backend/gen_defs.hpp           |   1 +
 backend/src/backend/gen_encoder.cpp        | 107 +++++++++++++++++++++++++++--
 backend/src/backend/gen_encoder.hpp        |   6 +-
 backend/src/backend/gen_insn_selection.cpp |  46 +++++++++----
 backend/src/backend/gen_insn_selection.hpp |   4 ++
 backend/src/backend/gen_reg_allocation.cpp |  10 ++-
 backend/src/backend/gen_register.hpp       |  80 +++++++++++++++++++++
 backend/src/ir/profile.cpp                 |   2 +
 backend/src/ir/profile.hpp                 |   3 +-
 10 files changed, 249 insertions(+), 26 deletions(-)

diff --git a/backend/src/backend/gen_context.cpp b/backend/src/backend/gen_context.cpp
index 055c8fc..e6080b2 100644
--- a/backend/src/backend/gen_context.cpp
+++ b/backend/src/backend/gen_context.cpp
@@ -138,7 +138,15 @@ namespace gbe
     const GenRegister dst = ra->genReg(insn.dst(0));
     const GenRegister src = ra->genReg(insn.src(0));
     switch (insn.opcode) {
-      case SEL_OP_MOV: p->MOV(dst, src); break;
+      case SEL_OP_MOV:
+        if (dst.isdf() && !src.isdf()) {
+          bool doubleio = true;
+          const GenRegister r = ra->genReg(GenRegister::f16grf(ir::ocl::doubleio));
+          p->MOV(dst, src, doubleio, r);
+        } else {
+          p->MOV(dst, src);
+        }
+        break;
       case SEL_OP_NOT: p->NOT(dst, src); break;
       case SEL_OP_RNDD: p->RNDD(dst, src); break;
       case SEL_OP_RNDU: p->RNDU(dst, src); break;
@@ -263,14 +271,16 @@ namespace gbe
     const GenRegister src = ra->genReg(insn.src(0));
     const uint32_t bti = insn.extra.function;
     const uint32_t elemNum = insn.extra.elem;
-    p->UNTYPED_READ(dst, src, bti, elemNum);
+    const GenRegister r = ra->genReg(GenRegister::ud16grf(ir::ocl::doubleio));
+    p->UNTYPED_READ(dst, src, bti, elemNum, insn.doubleio, r);
   }
 
   void GenContext::emitUntypedWriteInstruction(const SelectionInstruction &insn) {
     const GenRegister src = ra->genReg(insn.src(0));
     const uint32_t bti = insn.extra.function;
     const uint32_t elemNum = insn.extra.elem;
-    p->UNTYPED_WRITE(src, bti, elemNum);
+    const GenRegister r = ra->genReg(GenRegister::ud16grf(ir::ocl::doubleio));
+    p->UNTYPED_WRITE(src, bti, elemNum, insn.doubleio, r);
   }
 
   void GenContext::emitByteGatherInstruction(const SelectionInstruction &insn) {
diff --git a/backend/src/backend/gen_defs.hpp b/backend/src/backend/gen_defs.hpp
index c7a1581..63f98f5 100644
--- a/backend/src/backend/gen_defs.hpp
+++ b/backend/src/backend/gen_defs.hpp
@@ -215,6 +215,7 @@ enum GenMessageTarget {
 #define GEN_TYPE_VF  5 /* packed float vector, immediates only? */
 #define GEN_TYPE_HF  6
 #define GEN_TYPE_V   6 /* packed int vector, immediates only, uword dest only */
+#define GEN_TYPE_DF  6
 #define GEN_TYPE_F   7
 
 #define GEN_ARF_NULL                  0x00
diff --git a/backend/src/backend/gen_encoder.cpp b/backend/src/backend/gen_encoder.cpp
index b65cc94..db08e90 100644
--- a/backend/src/backend/gen_encoder.cpp
+++ b/backend/src/backend/gen_encoder.cpp
@@ -355,7 +355,29 @@ namespace gbe
     0
   };
 
-  void GenEncoder::UNTYPED_READ(GenRegister dst, GenRegister src, uint32_t bti, uint32_t elemNum) {
+  void GenEncoder::UNTYPED_READ(GenRegister dst, GenRegister src, uint32_t bti, uint32_t elemNum, bool doubleio, GenRegister r) {
+    if (doubleio) {
+      GenRegister hdr = GenRegister::h2(r);
+      GenRegister imm4 = GenRegister::immud(4);
+      push();
+        curr.execWidth = 8;
+        MOV(hdr,                            GenRegister::ud8grf(src.nr, 0));
+        ADD(GenRegister::offset(hdr, 0, 4), hdr, imm4);
+        MOV(GenRegister::offset(hdr, 1),    GenRegister::ud8grf(src.nr, 4));
+        ADD(GenRegister::offset(hdr, 1, 4), GenRegister::offset(hdr, 1), imm4);
+      pop();
+      UNTYPED_READ(dst, hdr, bti, elemNum, false, r);
+
+      push();
+        curr.execWidth = 8;
+        MOV(hdr,                            GenRegister::ud8grf(src.nr + 1, 0));
+        ADD(GenRegister::offset(hdr, 0, 4), hdr, imm4);
+        MOV(GenRegister::offset(hdr, 1),    GenRegister::ud8grf(src.nr + 1, 4));
+        ADD(GenRegister::offset(hdr, 1, 4), GenRegister::offset(hdr, 1), imm4);
+      pop();
+      UNTYPED_READ(GenRegister::offset(dst, 2), hdr, bti, elemNum, false, r);
+      return;
+    }
     GenInstruction *insn = this->next(GEN_OPCODE_SEND);
     assert(elemNum >= 1 || elemNum <= 4);
     uint32_t msg_length = 0;
@@ -382,7 +404,32 @@ namespace gbe
                    response_length);
   }
 
-  void GenEncoder::UNTYPED_WRITE(GenRegister msg, uint32_t bti, uint32_t elemNum) {
+  void GenEncoder::UNTYPED_WRITE(GenRegister msg, uint32_t bti, uint32_t elemNum, bool doubleio, GenRegister r) {
+    if (doubleio) {
+      GenRegister hdr = GenRegister::h2(r);
+      GenRegister data = GenRegister::offset(r, 2);
+      GenRegister imm4 = GenRegister::immud(4);
+      push();
+        curr.execWidth = 8;
+        MOV(hdr,                            GenRegister::ud8grf(msg.nr, 0));
+        ADD(GenRegister::offset(hdr, 0, 4), hdr, imm4);
+        MOV(GenRegister::offset(hdr, 1),    GenRegister::ud8grf(msg.nr, 4));
+        ADD(GenRegister::offset(hdr, 1, 4), GenRegister::offset(hdr, 1), imm4);
+      pop();
+      MOV(data, GenRegister::ud16grf(msg.nr+2, 0));
+      UNTYPED_WRITE(hdr, bti, elemNum, false, r);
+
+      push();
+        curr.execWidth = 8;
+        MOV(hdr,                            GenRegister::ud8grf(msg.nr+1, 0));
+        ADD(GenRegister::offset(hdr, 0, 4), hdr, imm4);
+        MOV(GenRegister::offset(hdr, 1),    GenRegister::ud8grf(msg.nr+1, 4));
+        ADD(GenRegister::offset(hdr, 1, 4), GenRegister::offset(hdr, 1), imm4);
+      pop();
+      MOV(data, GenRegister::ud16grf(msg.nr+4, 0));
+      UNTYPED_WRITE(hdr, bti, elemNum, false, r);
+      return;
+    }
     GenInstruction *insn = this->next(GEN_OPCODE_SEND);
     assert(elemNum >= 1 || elemNum <= 4);
     uint32_t msg_length = 0;
@@ -467,7 +514,17 @@ namespace gbe
   }
 
   INLINE void alu1(GenEncoder *p, uint32_t opcode, GenRegister dst, GenRegister src) {
-     if (needToSplitAlu1(p, dst, src) == false) {
+     if (opcode != GEN_OPCODE_MOV && dst.isdf() && src.isdf()) {
+       GenInstruction *insn = p->next(opcode);
+       p->setHeader(insn);
+       p->setDst(insn, dst);
+       p->setSrc0(insn, src);
+       int w = p->curr.execWidth / 4;
+       insn = p->next(opcode);
+       p->setHeader(insn);
+       p->setDst(insn, GenRegister::QnPhysical(dst, w));
+       p->setSrc0(insn, GenRegister::QnPhysical(src, w));
+     } else if (needToSplitAlu1(p, dst, src) == false) {
        GenInstruction *insn = p->next(opcode);
        p->setHeader(insn);
        p->setDst(insn, dst);
@@ -499,7 +556,19 @@ namespace gbe
                    GenRegister src0,
                    GenRegister src1)
   {
-    if (needToSplitAlu2(p, dst, src0, src1) == false) {
+    if (dst.isdf() && src0.isdf() && src1.isdf()) {
+       GenInstruction *insn = p->next(opcode);
+       p->setHeader(insn);
+       p->setDst(insn, dst);
+       p->setSrc0(insn, src0);
+       p->setSrc1(insn, src1);
+       int w = p->curr.execWidth / 4;
+       insn = p->next(opcode);
+       p->setHeader(insn);
+       p->setDst(insn, GenRegister::QnPhysical(dst, w));
+       p->setSrc0(insn, GenRegister::QnPhysical(src0, w));
+       p->setSrc1(insn, GenRegister::QnPhysical(src1, w));
+    } else if (needToSplitAlu2(p, dst, src0, src1) == false) {
        GenInstruction *insn = p->next(opcode);
        p->setHeader(insn);
        p->setDst(insn, dst);
@@ -620,7 +689,35 @@ namespace gbe
     alu3(this, GEN_OPCODE_##OP, dest, src0, src1, src2); \
   }
 
-  ALU1(MOV)
+  void GenEncoder::MOV(GenRegister dest, GenRegister src0, bool doubleio, GenRegister r) {
+    if (doubleio) {
+      GenRegister r0 = GenRegister::h2(r);
+      push();
+        curr.execWidth = 8;
+        if(src0.isimmdf()) {
+          union { double d; unsigned u[2]; } u;
+          u.d = src0.value.df;
+          r0 = GenRegister::retype(r0, GEN_TYPE_UD);
+          GenRegister imm0 = GenRegister::immud(u.u[0]);
+          GenRegister imm1 = GenRegister::immud(u.u[1]);
+          MOV(r0, imm0);
+          MOV(GenRegister::suboffset(r0, 1), imm1);
+          MOV(GenRegister::offset(r0, 1), imm0);
+          MOV(GenRegister::suboffset(GenRegister::offset(r0, 1), 1), imm1);
+        } else {
+          MOV(r0, src0);
+          MOV(GenRegister::offset(r0, 1), GenRegister::offset(src0, 0, 16));
+        }
+      pop();
+      if(src0.isimmdf())
+        r = GenRegister::retype(r, GEN_TYPE_DF);
+      MOV(dest, r);
+      MOV(GenRegister::offset(dest, 2), r);
+    } else {
+      alu1(this, GEN_OPCODE_MOV, dest, src0);
+    }
+  }
+
   ALU1(RNDZ)
   ALU1(RNDE)
   ALU1(RNDD)
diff --git a/backend/src/backend/gen_encoder.hpp b/backend/src/backend/gen_encoder.hpp
index 83d83d2..3dc55b6 100644
--- a/backend/src/backend/gen_encoder.hpp
+++ b/backend/src/backend/gen_encoder.hpp
@@ -89,7 +89,7 @@ namespace gbe
 #define ALU1(OP) void OP(GenRegister dest, GenRegister src0);
 #define ALU2(OP) void OP(GenRegister dest, GenRegister src0, GenRegister src1);
 #define ALU3(OP) void OP(GenRegister dest, GenRegister src0, GenRegister src1, GenRegister src2);
-    ALU1(MOV)
+    void MOV(GenRegister dest, GenRegister src0, bool doubleio = false, GenRegister r = GenRegister());
     ALU1(RNDZ)
     ALU1(RNDE)
     ALU1(RNDD)
@@ -131,9 +131,9 @@ namespace gbe
     /*! Wait instruction (used for the barrier) */
     void WAIT(void);
     /*! Untyped read (upto 4 channels) */
-    void UNTYPED_READ(GenRegister dst, GenRegister src, uint32_t bti, uint32_t elemNum);
+    void UNTYPED_READ(GenRegister dst, GenRegister src, uint32_t bti, uint32_t elemNum, bool doubleio, GenRegister r);
     /*! Untyped write (upto 4 channels) */
-    void UNTYPED_WRITE(GenRegister src, uint32_t bti, uint32_t elemNum);
+    void UNTYPED_WRITE(GenRegister src, uint32_t bti, uint32_t elemNum, bool doubleio, GenRegister r);
     /*! Byte gather (for unaligned bytes, shorts and ints) */
     void BYTE_GATHER(GenRegister dst, GenRegister src, uint32_t bti, uint32_t elemSize);
     /*! Byte scatter (for unaligned bytes, shorts and ints) */
diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp
index 88f9e94..6a845cf 100644
--- a/backend/src/backend/gen_insn_selection.cpp
+++ b/backend/src/backend/gen_insn_selection.cpp
@@ -129,6 +129,7 @@ namespace gbe
       case TYPE_S32: return GEN_TYPE_D;
       case TYPE_U32: return GEN_TYPE_UD;
       case TYPE_FLOAT: return GEN_TYPE_F;
+      case TYPE_DOUBLE: return GEN_TYPE_DF;
       default: NOT_SUPPORTED; return GEN_TYPE_F;
     }
   }
@@ -151,7 +152,7 @@ namespace gbe
   ///////////////////////////////////////////////////////////////////////////
 
   SelectionInstruction::SelectionInstruction(SelectionOpcode op, uint32_t dst, uint32_t src) :
-    parent(NULL), opcode(op), dstNum(dst), srcNum(src)
+    parent(NULL), opcode(op), dstNum(dst), srcNum(src), doubleio(false)
   {}
 
   void SelectionInstruction::prepend(SelectionInstruction &other) {
@@ -448,9 +449,9 @@ namespace gbe
     /*! Wait instruction (used for the barrier) */
     void WAIT(void);
     /*! Untyped read (up to 4 elements) */
-    void UNTYPED_READ(Reg addr, const GenRegister *dst, uint32_t elemNum, uint32_t bti);
+    void UNTYPED_READ(Reg addr, const GenRegister *dst, uint32_t elemNum, uint32_t bti, bool doubleio);
     /*! Untyped write (up to 4 elements) */
-    void UNTYPED_WRITE(Reg addr, const GenRegister *src, uint32_t elemNum, uint32_t bti);
+    void UNTYPED_WRITE(Reg addr, const GenRegister *src, uint32_t elemNum, uint32_t bti, bool doubleio);
     /*! Byte gather (for unaligned bytes, shorts and ints) */
     void BYTE_GATHER(Reg dst, Reg addr, uint32_t elemSize, uint32_t bti);
     /*! Byte scatter (for unaligned bytes, shorts and ints) */
@@ -655,6 +656,7 @@ namespace gbe
       case FAMILY_WORD: SEL_REG(uw16grf, uw8grf, uw1grf); break;
       case FAMILY_BYTE: SEL_REG(ub16grf, ub8grf, ub1grf); break;
       case FAMILY_DWORD: SEL_REG(f16grf, f8grf, f1grf); break;
+      case FAMILY_QWORD: SEL_REG(df16grf, df8grf, df1grf); break;
       default: NOT_SUPPORTED;
     }
     GBE_ASSERT(false);
@@ -715,7 +717,8 @@ namespace gbe
   void Selection::Opaque::UNTYPED_READ(Reg addr,
                                        const GenRegister *dst,
                                        uint32_t elemNum,
-                                       uint32_t bti)
+                                       uint32_t bti,
+                                       bool doubleio)
   {
     SelectionInstruction *insn = this->appendInsn(SEL_OP_UNTYPED_READ, elemNum, 1);
     SelectionVector *srcVector = this->appendVector();
@@ -727,11 +730,13 @@ namespace gbe
     insn->src(0) = addr;
     insn->extra.function = bti;
     insn->extra.elem = elemNum;
+    insn->doubleio = doubleio;
 
     // Sends require contiguous allocation
     dstVector->regNum = elemNum;
     dstVector->isSrc = 0;
     dstVector->reg = &insn->dst(0);
+    dstVector->doubleio = doubleio;
 
     // Source cannot be scalar (yet)
     srcVector->regNum = 1;
@@ -742,7 +747,8 @@ namespace gbe
   void Selection::Opaque::UNTYPED_WRITE(Reg addr,
                                         const GenRegister *src,
                                         uint32_t elemNum,
-                                        uint32_t bti)
+                                        uint32_t bti,
+                                        bool doubleio)
   {
     SelectionInstruction *insn = this->appendInsn(SEL_OP_UNTYPED_WRITE, 0, elemNum+1);
     SelectionVector *vector = this->appendVector();
@@ -753,11 +759,13 @@ namespace gbe
       insn->src(elemID+1) = src[elemID];
     insn->extra.function = bti;
     insn->extra.elem = elemNum;
+    insn->doubleio = doubleio;
 
     // Sends require contiguous allocation for the sources
     vector->regNum = elemNum+1;
     vector->reg = &insn->src(0);
     vector->isSrc = 1;
+    vector->doubleio = doubleio;
   }
 
   void Selection::Opaque::BYTE_GATHER(Reg dst, Reg addr, uint32_t elemSize, uint32_t bti) {
@@ -1085,6 +1093,15 @@ namespace gbe
   // Implementation of all patterns
   ///////////////////////////////////////////////////////////////////////////
 
+  bool canGetRegisterFromImmediate(const ir::Instruction &insn) {
+    using namespace ir;
+    const auto &childInsn = cast<LoadImmInstruction>(insn);
+    const auto &imm = childInsn.getImmediate();
+    if(imm.type != TYPE_DOUBLE)
+      return true;
+    return false;
+  }
+
   GenRegister getRegisterFromImmediate(ir::Immediate imm)
   {
     using namespace ir;
@@ -1096,6 +1113,7 @@ namespace gbe
       case TYPE_S16: return  GenRegister::immw(imm.data.s16);
       case TYPE_U8:  return GenRegister::immuw(imm.data.u8);
       case TYPE_S8:  return GenRegister::immw(imm.data.s8);
+      case TYPE_DOUBLE: return GenRegister::immdf(imm.data.f64);
       default: NOT_SUPPORTED; return GenRegister::immuw(0);
     }
   }
@@ -1218,14 +1236,14 @@ namespace gbe
       SelectionDAG *dag1 = dag.child[1];
 
       // Right source can always be an immediate
-      if (OCL_OPTIMIZE_IMMEDIATE && dag1 != NULL && dag1->insn.getOpcode() == OP_LOADI) {
+      if (OCL_OPTIMIZE_IMMEDIATE && dag1 != NULL && dag1->insn.getOpcode() == OP_LOADI && canGetRegisterFromImmediate(dag1->insn)) {
         const auto &childInsn = cast<LoadImmInstruction>(dag1->insn);
         src0 = sel.selReg(insn.getSrc(0), type);
         src1 = getRegisterFromImmediate(childInsn.getImmediate());
         if (dag0) dag0->isRoot = 1;
       }
       // Left source cannot be immediate but it is OK if we can commute
-      else if (OCL_OPTIMIZE_IMMEDIATE && dag0 != NULL && insn.commutes() && dag0->insn.getOpcode() == OP_LOADI) {
+      else if (OCL_OPTIMIZE_IMMEDIATE && dag0 != NULL && insn.commutes() && dag0->insn.getOpcode() == OP_LOADI && canGetRegisterFromImmediate(dag0->insn)) {
         const auto &childInsn = cast<LoadImmInstruction>(dag0->insn);
         src0 = sel.selReg(insn.getSrc(1), type);
         src1 = getRegisterFromImmediate(childInsn.getImmediate());
@@ -1261,7 +1279,7 @@ namespace gbe
         case OP_SHR: sel.SHR(dst, src0, src1); break;
         case OP_ASR: sel.ASR(dst, src0, src1); break;
         case OP_MUL:
-          if (type == TYPE_FLOAT)
+          if (type == TYPE_FLOAT || type == TYPE_DOUBLE)
             sel.MUL(dst, src0, src1);
           else if (type == TYPE_U32 || type == TYPE_S32) {
             sel.pop();
@@ -1592,6 +1610,7 @@ namespace gbe
         case TYPE_S16: sel.MOV(dst, GenRegister::immw(imm.data.s16)); break;
         case TYPE_U8:  sel.MOV(dst, GenRegister::immuw(imm.data.u8)); break;
         case TYPE_S8:  sel.MOV(dst, GenRegister::immw(imm.data.s8)); break;
+        case TYPE_DOUBLE: sel.MOV(dst, GenRegister::immdf(imm.data.f64)); break;
         default: NOT_SUPPORTED;
       }
       sel.pop();
@@ -1639,6 +1658,7 @@ namespace gbe
   INLINE uint32_t getByteScatterGatherSize(ir::Type type) {
     using namespace ir;
     switch (type) {
+      case TYPE_DOUBLE:
       case TYPE_FLOAT:
       case TYPE_U32:
       case TYPE_S32:
@@ -1665,9 +1685,10 @@ namespace gbe
       using namespace ir;
       const uint32_t valueNum = insn.getValueNum();
       vector<GenRegister> dst(valueNum);
+      bool doubleio = insn.getValueType() == TYPE_DOUBLE ? true : false;
       for (uint32_t dstID = 0; dstID < valueNum; ++dstID)
         dst[dstID] = GenRegister::retype(sel.selReg(insn.getValue(dstID)), GEN_TYPE_F);
-      sel.UNTYPED_READ(addr, dst.data(), valueNum, bti);
+      sel.UNTYPED_READ(addr, dst.data(), valueNum, bti, doubleio);
     }
 
     void emitByteGather(Selection::Opaque &sel,
@@ -1744,11 +1765,12 @@ namespace gbe
       const uint32_t addrID = ir::StoreInstruction::addressIndex;
       GenRegister addr;
       vector<GenRegister> value(valueNum);
+      bool doubleio = insn.getValueType() == TYPE_DOUBLE ? true : false;
 
       addr = GenRegister::retype(sel.selReg(insn.getSrc(addrID)), GEN_TYPE_F);;
       for (uint32_t valueID = 0; valueID < valueNum; ++valueID)
         value[valueID] = GenRegister::retype(sel.selReg(insn.getValue(valueID)), GEN_TYPE_F);
-      sel.UNTYPED_WRITE(addr, value.data(), valueNum, bti);
+      sel.UNTYPED_WRITE(addr, value.data(), valueNum, bti, doubleio);
     }
 
     void emitByteScatter(Selection::Opaque &sel,
@@ -1828,7 +1850,7 @@ namespace gbe
       SelectionDAG *dag1 = dag.child[1];
 
       // Right source can always be an immediate
-      if (OCL_OPTIMIZE_IMMEDIATE && dag1 != NULL && dag1->insn.getOpcode() == OP_LOADI) {
+      if (OCL_OPTIMIZE_IMMEDIATE && dag1 != NULL && dag1->insn.getOpcode() == OP_LOADI && canGetRegisterFromImmediate(dag1->insn)) {
         const auto &childInsn = cast<LoadImmInstruction>(dag1->insn);
         src0 = sel.selReg(insn.getSrc(0), type);
         src1 = getRegisterFromImmediate(childInsn.getImmediate());
@@ -1862,7 +1884,7 @@ namespace gbe
       const GenRegister src = sel.selReg(insn.getSrc(0), srcType);
 
       // We need two instructions to make the conversion
-      if (dstFamily != FAMILY_DWORD && srcFamily == FAMILY_DWORD) {
+      if (dstFamily != FAMILY_DWORD && dstFamily != FAMILY_QWORD && srcFamily == FAMILY_DWORD) {
         GenRegister unpacked;
         if (dstFamily == FAMILY_WORD) {
           const uint32_t type = TYPE_U16 ? GEN_TYPE_UW : GEN_TYPE_W;
diff --git a/backend/src/backend/gen_insn_selection.hpp b/backend/src/backend/gen_insn_selection.hpp
index 778eb1f..5f7289d 100644
--- a/backend/src/backend/gen_insn_selection.hpp
+++ b/backend/src/backend/gen_insn_selection.hpp
@@ -116,6 +116,8 @@ namespace gbe
     uint8_t srcNum:5;
     /*! To store various indices */
     uint16_t index;
+    /*! Double size I/O ? */
+    bool doubleio;
     /*! Variable sized. Destinations and sources go here */
     GenRegister regs[0];
   private:
@@ -138,6 +140,8 @@ namespace gbe
     uint16_t regNum;
     /*! Indicate if this a destination or a source vector */
     uint16_t isSrc;
+    /*! "double IO" requires two more register */
+    bool doubleio;
   };
 
   // Owns the selection block
diff --git a/backend/src/backend/gen_reg_allocation.cpp b/backend/src/backend/gen_reg_allocation.cpp
index 469be12..6b63c41 100644
--- a/backend/src/backend/gen_reg_allocation.cpp
+++ b/backend/src/backend/gen_reg_allocation.cpp
@@ -454,7 +454,6 @@ namespace gbe
   }
 
   bool GenRegAllocator::Opaque::allocateGRFs(Selection &selection) {
-
     // Perform the linear scan allocator
     const uint32_t regNum = ctx.sel->getRegNum();
     for (uint32_t startID = 0; startID < regNum; ++startID) {
@@ -472,7 +471,9 @@ namespace gbe
         const SelectionVector *vector = it->second.first;
         const uint32_t simdWidth = ctx.getSimdWidth();
         const uint32_t alignment = simdWidth * sizeof(uint32_t);
-        const uint32_t size = vector->regNum * alignment;
+        uint32_t size = vector->regNum * alignment;
+        if (vector->doubleio)
+          size += alignment;
         uint32_t grfOffset;
         while ((grfOffset = ctx.allocate(size, alignment)) == 0) {
           const bool success = this->expireGRF(interval);
@@ -667,6 +668,11 @@ namespace gbe
     // First we try to put all booleans registers into flags
     this->allocateFlags(selection);
 
+    int w = ctx.getSimdWidth();
+    int offst = ctx.allocate(w * sizeof(int) * 2, w * sizeof(int));
+    GBE_ASSERT(offst != 0);
+    RA.insert(std::make_pair(ocl::doubleio, offst));
+
     // Allocate all the GRFs now (regular register and boolean that are not in
     // flag registers)
     return this->allocateGRFs(selection);
diff --git a/backend/src/backend/gen_register.hpp b/backend/src/backend/gen_register.hpp
index d772b0d..5870b07 100644
--- a/backend/src/backend/gen_register.hpp
+++ b/backend/src/backend/gen_register.hpp
@@ -192,6 +192,7 @@ namespace gbe
 
     /*! For immediates or virtual register */
     union {
+      double df;
       float f;
       int32_t d;
       uint32_t ud;
@@ -211,6 +212,31 @@ namespace gbe
     uint32_t quarter:1;      //!< To choose which part we want (Q1 / Q2)
     uint32_t address_mode:1; //!< direct or indirect
 
+    static INLINE GenRegister offset(GenRegister reg, int nr, int subnr = 0) {
+      GenRegister r = reg;
+      r.nr += nr;
+      r.subnr += subnr;
+      return r;
+    }
+
+    INLINE bool isimmdf(void) const {
+      if (type == GEN_TYPE_DF && file == GEN_IMMEDIATE_VALUE)
+        return true;
+      return false;
+    }
+
+    INLINE bool isdf(void) const {
+      if (type == GEN_TYPE_DF && file == GEN_GENERAL_REGISTER_FILE)
+        return true;
+      return false;
+    }
+
+    static INLINE GenRegister h2(GenRegister reg) {
+      GenRegister r = reg;
+      r.hstride = GEN_HORIZONTAL_STRIDE_2;
+      return r;
+    }
+
     static INLINE GenRegister QnVirtual(GenRegister reg, uint32_t quarter) {
       GBE_ASSERT(reg.physical == 0);
       if (reg.hstride == GEN_HORIZONTAL_STRIDE_0) // scalar register
@@ -293,6 +319,18 @@ namespace gbe
       return reg;
     }
 
+    static INLINE GenRegister df16(uint32_t file, ir::Register reg) {
+      return retype(vec16(file, reg), GEN_TYPE_DF);
+    }
+
+    static INLINE GenRegister df8(uint32_t file, ir::Register reg) {
+      return retype(vec8(file, reg), GEN_TYPE_DF);
+    }
+
+    static INLINE GenRegister df1(uint32_t file, ir::Register reg) {
+      return retype(vec1(file, reg), GEN_TYPE_DF);
+    }
+
     static INLINE GenRegister ud16(uint32_t file, ir::Register reg) {
       return retype(vec16(file, reg), GEN_TYPE_UD);
     }
@@ -371,6 +409,12 @@ namespace gbe
                          GEN_HORIZONTAL_STRIDE_0);
     }
 
+    static INLINE GenRegister immdf(double df) {
+      GenRegister immediate = imm(GEN_TYPE_DF);
+      immediate.value.df = df;
+      return immediate;
+    }
+
     static INLINE GenRegister immf(float f) {
       GenRegister immediate = imm(GEN_TYPE_F);
       immediate.value.f = f;
@@ -448,6 +492,18 @@ namespace gbe
       return vec16(GEN_GENERAL_REGISTER_FILE, reg);
     }
 
+    static INLINE GenRegister df1grf(ir::Register reg) {
+      return df1(GEN_GENERAL_REGISTER_FILE, reg);
+    }
+
+    static INLINE GenRegister df8grf(ir::Register reg) {
+      return df8(GEN_GENERAL_REGISTER_FILE, reg);
+    }
+
+    static INLINE GenRegister df16grf(ir::Register reg) {
+      return df16(GEN_GENERAL_REGISTER_FILE, reg);
+    }
+
     static INLINE GenRegister ud16grf(ir::Register reg) {
       return ud16(GEN_GENERAL_REGISTER_FILE, reg);
     }
@@ -613,6 +669,18 @@ namespace gbe
       return reg;
     }
 
+    static INLINE GenRegister df16(uint32_t file, uint32_t nr, uint32_t subnr) {
+      return retype(vec16(file, nr, subnr), GEN_TYPE_DF);
+    }
+
+    static INLINE GenRegister df8(uint32_t file, uint32_t nr, uint32_t subnr) {
+      return retype(vec8(file, nr, subnr), GEN_TYPE_DF);
+    }
+
+    static INLINE GenRegister df1(uint32_t file, uint32_t nr, uint32_t subnr) {
+      return retype(vec1(file, nr, subnr), GEN_TYPE_DF);
+    }
+
     static INLINE GenRegister ud16(uint32_t file, uint32_t nr, uint32_t subnr) {
       return retype(vec16(file, nr, subnr), GEN_TYPE_UD);
     }
@@ -685,6 +753,18 @@ namespace gbe
       return vec16(GEN_GENERAL_REGISTER_FILE, nr, subnr);
     }
 
+    static INLINE GenRegister df16grf(uint32_t nr, uint32_t subnr) {
+      return df16(GEN_GENERAL_REGISTER_FILE, nr, subnr);
+    }
+
+    static INLINE GenRegister df8grf(uint32_t nr, uint32_t subnr) {
+      return df8(GEN_GENERAL_REGISTER_FILE, nr, subnr);
+    }
+
+    static INLINE GenRegister df1grf(uint32_t nr, uint32_t subnr) {
+      return df1(GEN_GENERAL_REGISTER_FILE, nr, subnr);
+    }
+
     static INLINE GenRegister ud16grf(uint32_t nr, uint32_t subnr) {
       return ud16(GEN_GENERAL_REGISTER_FILE, nr, subnr);
     }
diff --git a/backend/src/ir/profile.cpp b/backend/src/ir/profile.cpp
index 99cd06c..9a17f1d 100644
--- a/backend/src/ir/profile.cpp
+++ b/backend/src/ir/profile.cpp
@@ -41,6 +41,7 @@ namespace ir {
         "block_ip",
         "barrier_id", "thread_number",
         "const_curbe_offset",
+        "double_io",
     };
 
 #if GBE_DEBUG
@@ -77,6 +78,7 @@ namespace ir {
       DECL_NEW_REG(FAMILY_DWORD, threadn);
       DECL_NEW_REG(FAMILY_DWORD, constoffst);
       DECL_NEW_REG(FAMILY_DWORD, workdim);
+      DECL_NEW_REG(FAMILY_DWORD, doubleio);
     }
 #undef DECL_NEW_REG
 
diff --git a/backend/src/ir/profile.hpp b/backend/src/ir/profile.hpp
index 4b0ef5e..029b47c 100644
--- a/backend/src/ir/profile.hpp
+++ b/backend/src/ir/profile.hpp
@@ -65,7 +65,8 @@ namespace ir {
     static const Register threadn = Register(21);  // number of threads
     static const Register constoffst = Register(22); // offset of global constant array's curbe
     static const Register workdim = Register(23);  // work dimention.
-    static const uint32_t regNum = 24;             // number of special registers
+    static const Register doubleio = Register(24);  // work dimention.
+    static const uint32_t regNum = 25;             // number of special registers
     extern const char *specialRegMean[];           // special register name.
   } /* namespace ocl */
 
-- 
1.8.1.2



More information about the Beignet mailing list