[Beignet] [PATCH v6 1/2] Support 64-bit float

Homer Hsing homer.xing at intel.com
Tue Jun 18 21:40:35 PDT 2013


support arithmetic, store, load, and 64-bit float immediate

example:

  kernel void f(global double *src, global double *dst) {
    int i = get_global_id(0);
    double d = 1.234567890123456789;
    dst[i] = d * (src[i] + d);
  }

Signed-off-by: Homer Hsing <homer.xing at intel.com>
---
 backend/src/backend/gen_context.cpp                |  17 +++
 backend/src/backend/gen_context.hpp                |   2 +
 backend/src/backend/gen_defs.hpp                   |   2 +
 backend/src/backend/gen_encoder.cpp                | 140 +++++++++++++++++++-
 backend/src/backend/gen_encoder.hpp                |   6 +
 .../src/backend/gen_insn_gen7_schedule_info.hxx    |   2 +
 backend/src/backend/gen_insn_selection.cpp         | 146 +++++++++++++++++++--
 backend/src/backend/gen_insn_selection.hxx         |   4 +
 backend/src/backend/gen_reg_allocation.cpp         |   1 -
 backend/src/backend/gen_register.hpp               |  90 ++++++++++++-
 10 files changed, 393 insertions(+), 17 deletions(-)

diff --git a/backend/src/backend/gen_context.cpp b/backend/src/backend/gen_context.cpp
index af651e7..91ae72b 100644
--- a/backend/src/backend/gen_context.cpp
+++ b/backend/src/backend/gen_context.cpp
@@ -139,6 +139,7 @@ namespace gbe
     const GenRegister src = ra->genReg(insn.src(0));
     switch (insn.opcode) {
       case SEL_OP_MOV: p->MOV(dst, src); break;
+      case SEL_OP_LOAD_DF_IMM: p->LOAD_DF_IMM(dst, src); break;
       case SEL_OP_NOT: p->NOT(dst, src); break;
       case SEL_OP_RNDD: p->RNDD(dst, src); break;
       case SEL_OP_RNDU: p->RNDU(dst, src); break;
@@ -153,6 +154,7 @@ namespace gbe
     const GenRegister src0 = ra->genReg(insn.src(0));
     const GenRegister src1 = ra->genReg(insn.src(1));
     switch (insn.opcode) {
+      case SEL_OP_MOV_DF: p->MOV_DF(dst, src0, src1); break;
       case SEL_OP_SEL:  p->SEL(dst, src0, src1); break;
       case SEL_OP_AND:  p->AND(dst, src0, src1); break;
       case SEL_OP_OR:   p->OR (dst, src0, src1);  break;
@@ -264,6 +266,14 @@ namespace gbe
     p->pop();
   }
 
+  void GenContext::emitReadFloat64Instruction(const SelectionInstruction &insn) {
+    const GenRegister dst = ra->genReg(insn.dst(0));
+    const GenRegister src = ra->genReg(insn.src(0));
+    const uint32_t bti = insn.extra.function;
+    const uint32_t elemNum = insn.extra.elem;
+    p->READ_FLOAT64(dst, src, bti, elemNum);
+  }
+
   void GenContext::emitUntypedReadInstruction(const SelectionInstruction &insn) {
     const GenRegister dst = ra->genReg(insn.dst(0));
     const GenRegister src = ra->genReg(insn.src(0));
@@ -272,6 +282,13 @@ namespace gbe
     p->UNTYPED_READ(dst, src, bti, elemNum);
   }
 
+  void GenContext::emitWriteFloat64Instruction(const SelectionInstruction &insn) {
+    const GenRegister src = ra->genReg(insn.src(0));
+    const uint32_t bti = insn.extra.function;
+    const uint32_t elemNum = insn.extra.elem;
+    p->WRITE_FLOAT64(src, bti, elemNum);
+  }
+
   void GenContext::emitUntypedWriteInstruction(const SelectionInstruction &insn) {
     const GenRegister src = ra->genReg(insn.src(0));
     const uint32_t bti = insn.extra.function;
diff --git a/backend/src/backend/gen_context.hpp b/backend/src/backend/gen_context.hpp
index 1566cbb..804384d 100644
--- a/backend/src/backend/gen_context.hpp
+++ b/backend/src/backend/gen_context.hpp
@@ -87,6 +87,8 @@ namespace gbe
     void emitBarrierInstruction(const SelectionInstruction &insn);
     void emitFenceInstruction(const SelectionInstruction &insn);
     void emitMathInstruction(const SelectionInstruction &insn);
+    void emitReadFloat64Instruction(const SelectionInstruction &insn);
+    void emitWriteFloat64Instruction(const SelectionInstruction &insn);
     void emitUntypedReadInstruction(const SelectionInstruction &insn);
     void emitUntypedWriteInstruction(const SelectionInstruction &insn);
     void emitByteGatherInstruction(const SelectionInstruction &insn);
diff --git a/backend/src/backend/gen_defs.hpp b/backend/src/backend/gen_defs.hpp
index f4e4938..0bfd66c 100644
--- a/backend/src/backend/gen_defs.hpp
+++ b/backend/src/backend/gen_defs.hpp
@@ -215,6 +215,7 @@ enum GenMessageTarget {
 #define GEN_TYPE_VF  5 /* packed float vector, immediates only? */
 #define GEN_TYPE_HF  6
 #define GEN_TYPE_V   6 /* packed int vector, immediates only, uword dest only */
+#define GEN_TYPE_DF  6
 #define GEN_TYPE_F   7
 
 #define GEN_ARF_NULL                  0x00
@@ -303,6 +304,7 @@ enum GenMessageTarget {
 #define GEN_BYTE_SCATTER_BYTE   0
 #define GEN_BYTE_SCATTER_WORD   1
 #define GEN_BYTE_SCATTER_DWORD  2
+#define GEN_BYTE_SCATTER_QWORD  3
 
 #define GEN_SAMPLER_RETURN_FORMAT_FLOAT32     0
 #define GEN_SAMPLER_RETURN_FORMAT_UINT32      2
diff --git a/backend/src/backend/gen_encoder.cpp b/backend/src/backend/gen_encoder.cpp
index 859a1b9..d5bf001 100644
--- a/backend/src/backend/gen_encoder.cpp
+++ b/backend/src/backend/gen_encoder.cpp
@@ -355,6 +355,64 @@ namespace gbe
     0
   };
 
+  void GenEncoder::READ_FLOAT64(GenRegister dst, GenRegister src, uint32_t bti, uint32_t elemNum) {
+    int w = curr.execWidth;
+    GenRegister r = GenRegister::retype(GenRegister::suboffset(src, w*2), GEN_TYPE_UD);
+    GenRegister hdr = GenRegister::h2(r);
+    GenRegister imm4 = GenRegister::immud(4);
+    push();
+      curr.execWidth = 8;
+      MOV(hdr,                            GenRegister::ud8grf(src.nr, 0));
+      ADD(GenRegister::offset(hdr, 0, 4), hdr, imm4);
+      if (w == 16) {
+        MOV(GenRegister::offset(hdr, 1),    GenRegister::ud8grf(src.nr, 4));
+        ADD(GenRegister::offset(hdr, 1, 4), GenRegister::offset(hdr, 1), imm4);
+      }
+    pop();
+    UNTYPED_READ(dst, hdr, bti, 1);
+    push();
+      curr.execWidth = 8;
+      MOV(hdr, w == 16 ? GenRegister::ud8grf(src.nr+1, 0) : GenRegister::retype(GenRegister::offset(src, 0, 16), GEN_TYPE_UD));
+      ADD(GenRegister::offset(hdr, 0, 4), hdr, imm4);
+      if (w == 16) {
+        MOV(GenRegister::offset(hdr, 1),    GenRegister::ud8grf(src.nr + 1, 4));
+        ADD(GenRegister::offset(hdr, 1, 4), GenRegister::offset(hdr, 1), imm4);
+      }
+    pop();
+    UNTYPED_READ(GenRegister::offset(dst, w / 8), hdr, bti, 1);
+  }
+
+  void GenEncoder::WRITE_FLOAT64(GenRegister msg, uint32_t bti, uint32_t elemNum) {
+    int w = curr.execWidth;
+    GenRegister r = GenRegister::retype(GenRegister::suboffset(msg, w*3), GEN_TYPE_UD);
+    r.type = GEN_TYPE_UD;
+    GenRegister hdr = GenRegister::h2(r);
+    GenRegister data = GenRegister::offset(r, w / 8);
+    GenRegister imm4 = GenRegister::immud(4);
+    push();
+      curr.execWidth = 8;
+      MOV(hdr,                            GenRegister::ud8grf(msg.nr, 0));
+      ADD(GenRegister::offset(hdr, 0, 4), hdr, imm4);
+      if (w == 16) {
+        MOV(GenRegister::offset(hdr, 1),    GenRegister::ud8grf(msg.nr, 4));
+        ADD(GenRegister::offset(hdr, 1, 4), GenRegister::offset(hdr, 1), imm4);
+      }
+    pop();
+    MOV(data, GenRegister::ud16grf(msg.nr + w / 8, 0));
+    UNTYPED_WRITE(hdr, bti, 1);
+    push();
+      curr.execWidth = 8;
+      MOV(hdr, w == 16 ? GenRegister::ud8grf(msg.nr+1, 0) : GenRegister::retype(GenRegister::offset(msg, 0, 16), GEN_TYPE_UD));
+      ADD(GenRegister::offset(hdr, 0, 4), hdr, imm4);
+      if (w == 16) {
+        MOV(GenRegister::offset(hdr, 1),    GenRegister::ud8grf(msg.nr+1, 4));
+        ADD(GenRegister::offset(hdr, 1, 4), GenRegister::offset(hdr, 1), imm4);
+      }
+    pop();
+    MOV(data, GenRegister::ud16grf(msg.nr + w / 4, 0));
+    UNTYPED_WRITE(hdr, bti, 1);
+  }
+
   void GenEncoder::UNTYPED_READ(GenRegister dst, GenRegister src, uint32_t bti, uint32_t elemNum) {
     GenInstruction *insn = this->next(GEN_OPCODE_SEND);
     assert(elemNum >= 1 || elemNum <= 4);
@@ -467,7 +525,19 @@ namespace gbe
   }
 
   INLINE void alu1(GenEncoder *p, uint32_t opcode, GenRegister dst, GenRegister src) {
-     if (needToSplitAlu1(p, dst, src) == false) {
+     if (dst.isdf() && src.isdf()) {
+       int w = p->curr.execWidth;
+       p->push();
+       GenInstruction *insn = p->next(opcode);
+       p->setHeader(insn);
+       p->setDst(insn, dst);
+       p->setSrc0(insn, src);
+       insn = p->next(opcode);
+       p->setHeader(insn);
+       p->setDst(insn, GenRegister::suboffset(dst, w / 2));
+       p->setSrc0(insn, GenRegister::suboffset(src, w / 2));
+       p->pop();
+     } else if (needToSplitAlu1(p, dst, src) == false) {
        GenInstruction *insn = p->next(opcode);
        p->setHeader(insn);
        p->setDst(insn, dst);
@@ -499,7 +569,21 @@ namespace gbe
                    GenRegister src0,
                    GenRegister src1)
   {
-    if (needToSplitAlu2(p, dst, src0, src1) == false) {
+    if (dst.isdf() && src0.isdf() && src1.isdf()) {
+       int w = p->curr.execWidth;
+       p->push();
+       GenInstruction *insn = p->next(opcode);
+       p->setHeader(insn);
+       p->setDst(insn, dst);
+       p->setSrc0(insn, src0);
+       p->setSrc1(insn, src1);
+       insn = p->next(opcode);
+       p->setHeader(insn);
+       p->setDst(insn, GenRegister::suboffset(dst, w / 2));
+       p->setSrc0(insn, GenRegister::suboffset(src0, w / 2));
+       p->setSrc1(insn, GenRegister::suboffset(src1, w / 2));
+       p->pop();
+    } else if (needToSplitAlu2(p, dst, src0, src1) == false) {
        GenInstruction *insn = p->next(opcode);
        p->setHeader(insn);
        p->setDst(insn, dst);
@@ -620,6 +704,58 @@ namespace gbe
     alu3(this, GEN_OPCODE_##OP, dest, src0, src1, src2); \
   }
 
+  void GenEncoder::LOAD_DF_IMM(GenRegister dest, GenRegister src0) {
+    union { double d; unsigned u[2]; } u;
+    u.d = src0.value.df;
+    GenRegister r = GenRegister::retype(dest, GEN_TYPE_UD);
+    push();
+    curr.execWidth = 1;
+    MOV(r, GenRegister::immud(u.u[1]));
+    MOV(GenRegister::suboffset(r, 1), GenRegister::immud(u.u[0]));
+    pop();
+    r.type = GEN_TYPE_DF;
+    r.vstride = GEN_VERTICAL_STRIDE_0;
+    r.width = GEN_WIDTH_1;
+    r.hstride = GEN_HORIZONTAL_STRIDE_0;
+    push();
+    MOV(dest, r);
+    pop();
+  }
+
+  void GenEncoder::MOV_DF(GenRegister dest, GenRegister src0, GenRegister r) {
+    int w = curr.execWidth;
+    if (src0.isdf()) {
+      push();
+      curr.execWidth = 16;
+      MOV(dest, src0);
+      if (w == 16)
+        MOV(GenRegister::QnPhysical(dest, w / 4), GenRegister::QnPhysical(src0, w / 4));
+      pop();
+    } else {
+      GenRegister r0 = GenRegister::h2(r);
+      push();
+      curr.execWidth = 8;
+      MOV(r0, src0);
+      MOV(GenRegister::suboffset(r0, 8), GenRegister::suboffset(src0, 4));
+      pop();
+      push();
+      curr.execWidth = 16;
+      MOV(dest, r);
+      pop();
+      if (w == 16) {
+        push();
+        curr.execWidth = 8;
+        MOV(r0, GenRegister::suboffset(src0, 8));
+        MOV(GenRegister::suboffset(r0, 8), GenRegister::suboffset(src0, 12));
+        pop();
+        push();
+        curr.execWidth = 16;
+        MOV(GenRegister::suboffset(dest, 8), r);
+        pop();
+      }
+    }
+  }
+
   ALU1(MOV)
   ALU1(RNDZ)
   ALU1(RNDE)
diff --git a/backend/src/backend/gen_encoder.hpp b/backend/src/backend/gen_encoder.hpp
index c98774f..5e5c9e1 100644
--- a/backend/src/backend/gen_encoder.hpp
+++ b/backend/src/backend/gen_encoder.hpp
@@ -113,6 +113,8 @@ namespace gbe
     ALU2(LINE)
     ALU2(PLN)
     ALU3(MAD)
+    ALU1(LOAD_DF_IMM);
+    ALU2(MOV_DF);
 #undef ALU1
 #undef ALU2
 #undef ALU3
@@ -132,6 +134,10 @@ namespace gbe
     void NOP(void);
     /*! Wait instruction (used for the barrier) */
     void WAIT(void);
+    /*! Read 64-bits float arrays */
+    void READ_FLOAT64(GenRegister dst, GenRegister src, uint32_t bti, uint32_t elemNum);
+    /*! Write 64-bits float arrays */
+    void WRITE_FLOAT64(GenRegister src, uint32_t bti, uint32_t elemNum);
     /*! Untyped read (upto 4 channels) */
     void UNTYPED_READ(GenRegister dst, GenRegister src, uint32_t bti, uint32_t elemNum);
     /*! Untyped write (upto 4 channels) */
diff --git a/backend/src/backend/gen_insn_gen7_schedule_info.hxx b/backend/src/backend/gen_insn_gen7_schedule_info.hxx
index 098d9ec..a3b4621 100644
--- a/backend/src/backend/gen_insn_gen7_schedule_info.hxx
+++ b/backend/src/backend/gen_insn_gen7_schedule_info.hxx
@@ -12,6 +12,8 @@ DECL_GEN7_SCHEDULE(Wait,            20,        2,        2)
 DECL_GEN7_SCHEDULE(Math,            20,        4,        2)
 DECL_GEN7_SCHEDULE(Barrier,         80,        1,        1)
 DECL_GEN7_SCHEDULE(Fence,           80,        1,        1)
+DECL_GEN7_SCHEDULE(ReadFloat64,     80,        1,        1)
+DECL_GEN7_SCHEDULE(WriteFloat64,    80,        1,        1)
 DECL_GEN7_SCHEDULE(UntypedRead,     80,        1,        1)
 DECL_GEN7_SCHEDULE(UntypedWrite,    80,        1,        1)
 DECL_GEN7_SCHEDULE(ByteGather,      80,        1,        1)
diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp
index 4e7cebd..0bf9754 100644
--- a/backend/src/backend/gen_insn_selection.cpp
+++ b/backend/src/backend/gen_insn_selection.cpp
@@ -129,6 +129,7 @@ namespace gbe
       case TYPE_S32: return GEN_TYPE_D;
       case TYPE_U32: return GEN_TYPE_UD;
       case TYPE_FLOAT: return GEN_TYPE_F;
+      case TYPE_DOUBLE: return GEN_TYPE_DF;
       default: NOT_SUPPORTED; return GEN_TYPE_F;
     }
   }
@@ -166,11 +167,13 @@ namespace gbe
 
   bool SelectionInstruction::isRead(void) const {
     return this->opcode == SEL_OP_UNTYPED_READ ||
+           this->opcode == SEL_OP_READ_FLOAT64 ||
            this->opcode == SEL_OP_BYTE_GATHER;
   }
 
   bool SelectionInstruction::isWrite(void) const {
     return this->opcode == SEL_OP_UNTYPED_WRITE ||
+           this->opcode == SEL_OP_WRITE_FLOAT64 ||
            this->opcode == SEL_OP_BYTE_SCATTER;
   }
 
@@ -406,6 +409,8 @@ namespace gbe
 #define ALU3(OP) \
   INLINE void OP(Reg dst, Reg src0, Reg src1, Reg src2) { ALU3(SEL_OP_##OP, dst, src0, src1, src2); }
     ALU1(MOV)
+    ALU2(MOV_DF)
+    ALU1(LOAD_DF_IMM)
     ALU1(RNDZ)
     ALU1(RNDE)
     ALU2(SEL)
@@ -449,6 +454,10 @@ namespace gbe
     void NOP(void);
     /*! Wait instruction (used for the barrier) */
     void WAIT(void);
+    /*! Read 64 bits float array */
+    void READ_FLOAT64(Reg addr, const GenRegister *dst, uint32_t elemNum, uint32_t bti);
+    /*! Write 64 bits float array */
+    void WRITE_FLOAT64(Reg addr, const GenRegister *src, uint32_t elemNum, uint32_t bti);
     /*! Untyped read (up to 4 elements) */
     void UNTYPED_READ(Reg addr, const GenRegister *dst, uint32_t elemNum, uint32_t bti);
     /*! Untyped write (up to 4 elements) */
@@ -610,20 +619,23 @@ namespace gbe
 
   ir::Register Selection::Opaque::replaceDst(SelectionInstruction *insn, uint32_t regID) {
     SelectionBlock *block = insn->parent;
-    const uint32_t simdWidth = ctx.getSimdWidth();
+    uint32_t simdWidth = ctx.getSimdWidth();
     ir::Register tmp;
+    ir::RegisterFamily f = file.get(insn->dst(regID).reg()).family;
+    int genType = f == ir::FAMILY_QWORD ? GEN_TYPE_DF : GEN_TYPE_F;
+    GenRegister gr;
 
     // This will append the temporary register in the instruction block
     this->block = block;
-    tmp = this->reg(ir::FAMILY_DWORD);
+    tmp = this->reg(f);
 
     // Generate the MOV instruction and replace the register in the instruction
     SelectionInstruction *mov = this->create(SEL_OP_MOV, 1, 1);
-    mov->dst(0) = GenRegister::retype(insn->dst(regID), GEN_TYPE_F);
+    mov->dst(0) = GenRegister::retype(insn->dst(regID), genType);
     mov->state = GenInstructionState(simdWidth);
-    insn->dst(regID) = mov->src(0) = GenRegister::fxgrf(simdWidth, tmp);
+    gr = f == ir::FAMILY_QWORD ? GenRegister::dfxgrf(simdWidth, tmp) : GenRegister::fxgrf(simdWidth, tmp);
+    insn->dst(regID) = mov->src(0) = gr;
     insn->append(*mov);
-
     return tmp;
   }
 
@@ -657,6 +669,7 @@ namespace gbe
       case FAMILY_WORD: SEL_REG(uw16grf, uw8grf, uw1grf); break;
       case FAMILY_BYTE: SEL_REG(ub16grf, ub8grf, ub1grf); break;
       case FAMILY_DWORD: SEL_REG(f16grf, f8grf, f1grf); break;
+      case FAMILY_QWORD: SEL_REG(df16grf, df8grf, df1grf); break;
       default: NOT_SUPPORTED;
     }
     GBE_ASSERT(false);
@@ -719,6 +732,33 @@ namespace gbe
   void Selection::Opaque::NOP(void) { this->appendInsn(SEL_OP_NOP, 0, 0); }
   void Selection::Opaque::WAIT(void) { this->appendInsn(SEL_OP_WAIT, 0, 0); }
 
+  void Selection::Opaque::READ_FLOAT64(Reg addr,
+                                       const GenRegister *dst,
+                                       uint32_t elemNum,
+                                       uint32_t bti)
+  {
+    SelectionInstruction *insn = this->appendInsn(SEL_OP_READ_FLOAT64, elemNum, 1);
+    SelectionVector *srcVector = this->appendVector();
+    SelectionVector *dstVector = this->appendVector();
+
+    // Regular instruction to encode
+    for (uint32_t elemID = 0; elemID < elemNum; ++elemID)
+      insn->dst(elemID) = dst[elemID];
+    insn->src(0) = addr;
+    insn->extra.function = bti;
+    insn->extra.elem = elemNum;
+
+    // Sends require contiguous allocation
+    dstVector->regNum = elemNum;
+    dstVector->isSrc = 0;
+    dstVector->reg = &insn->dst(0);
+
+    // Source cannot be scalar (yet)
+    srcVector->regNum = 1;
+    srcVector->isSrc = 1;
+    srcVector->reg = &insn->src(0);
+  }
+
   void Selection::Opaque::UNTYPED_READ(Reg addr,
                                        const GenRegister *dst,
                                        uint32_t elemNum,
@@ -746,6 +786,27 @@ namespace gbe
     srcVector->reg = &insn->src(0);
   }
 
+  void Selection::Opaque::WRITE_FLOAT64(Reg addr,
+                                        const GenRegister *src,
+                                        uint32_t elemNum,
+                                        uint32_t bti)
+  {
+    SelectionInstruction *insn = this->appendInsn(SEL_OP_WRITE_FLOAT64, 0, elemNum+1);
+    SelectionVector *vector = this->appendVector();
+
+    // Regular instruction to encode
+    insn->src(0) = addr;
+    for (uint32_t elemID = 0; elemID < elemNum; ++elemID)
+      insn->src(elemID+1) = src[elemID];
+    insn->extra.function = bti;
+    insn->extra.elem = elemNum;
+
+    // Sends require contiguous allocation for the sources
+    vector->regNum = elemNum+1;
+    vector->reg = &insn->src(0);
+    vector->isSrc = 1;
+  }
+
   void Selection::Opaque::UNTYPED_WRITE(Reg addr,
                                         const GenRegister *src,
                                         uint32_t elemNum,
@@ -1092,6 +1153,15 @@ namespace gbe
   // Implementation of all patterns
   ///////////////////////////////////////////////////////////////////////////
 
+  bool canGetRegisterFromImmediate(const ir::Instruction &insn) {
+    using namespace ir;
+    const auto &childInsn = cast<LoadImmInstruction>(insn);
+    const auto &imm = childInsn.getImmediate();
+    if(imm.type != TYPE_DOUBLE)
+      return true;
+    return false;
+  }
+
   GenRegister getRegisterFromImmediate(ir::Immediate imm)
   {
     using namespace ir;
@@ -1103,6 +1173,7 @@ namespace gbe
       case TYPE_S16: return  GenRegister::immw(imm.data.s16);
       case TYPE_U8:  return GenRegister::immuw(imm.data.u8);
       case TYPE_S8:  return GenRegister::immw(imm.data.s8);
+      case TYPE_DOUBLE: return GenRegister::immdf(imm.data.f64);
       default: NOT_SUPPORTED; return GenRegister::immuw(0);
     }
   }
@@ -1146,7 +1217,13 @@ namespace gbe
       const GenRegister src = sel.selReg(insn.getSrc(0));
       switch (opcode) {
         case ir::OP_ABS: sel.MOV(dst, GenRegister::abs(src)); break;
-        case ir::OP_MOV: sel.MOV(dst, src); break;
+        case ir::OP_MOV:
+          if (dst.isdf()) {
+            ir::Register r = sel.reg(ir::RegisterFamily::FAMILY_QWORD);
+            sel.MOV_DF(dst, src, sel.selReg(r));
+          } else
+            sel.MOV(dst, src);
+          break;
         case ir::OP_RNDD: sel.RNDD(dst, src); break;
         case ir::OP_RNDE: sel.RNDE(dst, src); break;
         case ir::OP_RNDU: sel.RNDU(dst, src); break;
@@ -1225,14 +1302,14 @@ namespace gbe
       SelectionDAG *dag1 = dag.child[1];
 
       // Right source can always be an immediate
-      if (OCL_OPTIMIZE_IMMEDIATE && dag1 != NULL && dag1->insn.getOpcode() == OP_LOADI) {
+      if (OCL_OPTIMIZE_IMMEDIATE && dag1 != NULL && dag1->insn.getOpcode() == OP_LOADI && canGetRegisterFromImmediate(dag1->insn)) {
         const auto &childInsn = cast<LoadImmInstruction>(dag1->insn);
         src0 = sel.selReg(insn.getSrc(0), type);
         src1 = getRegisterFromImmediate(childInsn.getImmediate());
         if (dag0) dag0->isRoot = 1;
       }
       // Left source cannot be immediate but it is OK if we can commute
-      else if (OCL_OPTIMIZE_IMMEDIATE && dag0 != NULL && insn.commutes() && dag0->insn.getOpcode() == OP_LOADI) {
+      else if (OCL_OPTIMIZE_IMMEDIATE && dag0 != NULL && insn.commutes() && dag0->insn.getOpcode() == OP_LOADI && canGetRegisterFromImmediate(dag0->insn)) {
         const auto &childInsn = cast<LoadImmInstruction>(dag0->insn);
         src0 = sel.selReg(insn.getSrc(1), type);
         src1 = getRegisterFromImmediate(childInsn.getImmediate());
@@ -1268,7 +1345,7 @@ namespace gbe
         case OP_SHR: sel.SHR(dst, src0, src1); break;
         case OP_ASR: sel.ASR(dst, src0, src1); break;
         case OP_MUL:
-          if (type == TYPE_FLOAT)
+          if (type == TYPE_FLOAT || type == TYPE_DOUBLE)
             sel.MUL(dst, src0, src1);
           else if (type == TYPE_U32 || type == TYPE_S32) {
             sel.pop();
@@ -1599,6 +1676,7 @@ namespace gbe
         case TYPE_S16: sel.MOV(dst, GenRegister::immw(imm.data.s16)); break;
         case TYPE_U8:  sel.MOV(dst, GenRegister::immuw(imm.data.u8)); break;
         case TYPE_S8:  sel.MOV(dst, GenRegister::immw(imm.data.s8)); break;
+        case TYPE_DOUBLE: sel.LOAD_DF_IMM(dst, GenRegister::immdf(imm.data.f64)); break;
         default: NOT_SUPPORTED;
       }
       sel.pop();
@@ -1650,6 +1728,8 @@ namespace gbe
   INLINE uint32_t getByteScatterGatherSize(ir::Type type) {
     using namespace ir;
     switch (type) {
+      case TYPE_DOUBLE:
+        return GEN_BYTE_SCATTER_QWORD;
       case TYPE_FLOAT:
       case TYPE_U32:
       case TYPE_S32:
@@ -1681,6 +1761,22 @@ namespace gbe
       sel.UNTYPED_READ(addr, dst.data(), valueNum, bti);
     }
 
+    void emitReadFloat64(Selection::Opaque &sel,
+                         const ir::LoadInstruction &insn,
+                         GenRegister addr,
+                         uint32_t bti) const
+    {
+      using namespace ir;
+      const uint32_t valueNum = insn.getValueNum();
+      vector<GenRegister> dst(valueNum);
+      for (uint32_t dstID = 0; dstID < valueNum; ++dstID)
+        dst[dstID] = GenRegister::retype(sel.selReg(insn.getValue(dstID)), GEN_TYPE_F);
+      dst.push_back(sel.selReg(sel.reg(FAMILY_QWORD)));
+      if (sel.ctx.getSimdWidth() == 16)
+        dst.push_back(sel.selReg(sel.reg(FAMILY_QWORD)));
+      sel.READ_FLOAT64(addr, dst.data(), dst.size(), bti);
+    }
+
     void emitByteGather(Selection::Opaque &sel,
                         const ir::LoadInstruction &insn,
                         const uint32_t elemSize,
@@ -1732,6 +1828,8 @@ namespace gbe
       const uint32_t elemSize = getByteScatterGatherSize(type);
       if (insn.getAddressSpace() == MEM_CONSTANT)
         this->emitIndirectMove(sel, insn, address);
+      else if (insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_QWORD)
+        this->emitReadFloat64(sel, insn, address, space == MEM_LOCAL ? 0xfe : 0x00);
       else if (insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_DWORD)
         this->emitUntypedRead(sel, insn, address, space == MEM_LOCAL ? 0xfe : 0x00);
       else {
@@ -1762,6 +1860,25 @@ namespace gbe
       sel.UNTYPED_WRITE(addr, value.data(), valueNum, bti);
     }
 
+    void emitWriteFloat64(Selection::Opaque &sel,
+                          const ir::StoreInstruction &insn,
+                          uint32_t bti) const
+    {
+      using namespace ir;
+      const uint32_t valueNum = insn.getValueNum();
+      const uint32_t addrID = ir::StoreInstruction::addressIndex;
+      GenRegister addr;
+      vector<GenRegister> value(valueNum);
+
+      addr = GenRegister::retype(sel.selReg(insn.getSrc(addrID)), GEN_TYPE_F);
+      for (uint32_t valueID = 0; valueID < valueNum; ++valueID)
+        value[valueID] = GenRegister::retype(sel.selReg(insn.getValue(valueID)), GEN_TYPE_F);
+      value.push_back(sel.selReg(sel.reg(FAMILY_QWORD)));
+      if (sel.ctx.getSimdWidth() == 16)
+        value.push_back(sel.selReg(sel.reg(FAMILY_QWORD)));
+      sel.WRITE_FLOAT64(addr, value.data(), value.size(), bti);
+    }
+
     void emitByteScatter(Selection::Opaque &sel,
                          const ir::StoreInstruction &insn,
                          const uint32_t elemSize,
@@ -1791,7 +1908,9 @@ namespace gbe
       const uint32_t bti = space == MEM_LOCAL ? 0xfe : 0x01;
       const Type type = insn.getValueType();
       const uint32_t elemSize = getByteScatterGatherSize(type);
-      if (insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_DWORD)
+      if (insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_QWORD)
+        this->emitWriteFloat64(sel, insn, bti);
+      else if (insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_DWORD)
         this->emitUntypedWrite(sel, insn, bti);
       else {
         const GenRegister address = sel.selReg(insn.getAddress());
@@ -1839,7 +1958,7 @@ namespace gbe
       SelectionDAG *dag1 = dag.child[1];
 
       // Right source can always be an immediate
-      if (OCL_OPTIMIZE_IMMEDIATE && dag1 != NULL && dag1->insn.getOpcode() == OP_LOADI) {
+      if (OCL_OPTIMIZE_IMMEDIATE && dag1 != NULL && dag1->insn.getOpcode() == OP_LOADI && canGetRegisterFromImmediate(dag1->insn)) {
         const auto &childInsn = cast<LoadImmInstruction>(dag1->insn);
         src0 = sel.selReg(insn.getSrc(0), type);
         src1 = getRegisterFromImmediate(childInsn.getImmediate());
@@ -1873,7 +1992,7 @@ namespace gbe
       const GenRegister src = sel.selReg(insn.getSrc(0), srcType);
 
       // We need two instructions to make the conversion
-      if (dstFamily != FAMILY_DWORD && srcFamily == FAMILY_DWORD) {
+      if (dstFamily != FAMILY_DWORD && dstFamily != FAMILY_QWORD && srcFamily == FAMILY_DWORD) {
         GenRegister unpacked;
         if (dstFamily == FAMILY_WORD) {
           const uint32_t type = TYPE_U16 ? GEN_TYPE_UW : GEN_TYPE_W;
@@ -1886,6 +2005,9 @@ namespace gbe
         }
         sel.MOV(unpacked, src);
         sel.MOV(dst, unpacked);
+      } else if (dst.isdf()) {
+        ir::Register r = sel.reg(ir::RegisterFamily::FAMILY_QWORD);
+        sel.MOV_DF(dst, src, sel.selReg(r));
       } else
         sel.MOV(dst, src);
       return true;
diff --git a/backend/src/backend/gen_insn_selection.hxx b/backend/src/backend/gen_insn_selection.hxx
index 789c81c..167af09 100644
--- a/backend/src/backend/gen_insn_selection.hxx
+++ b/backend/src/backend/gen_insn_selection.hxx
@@ -1,5 +1,7 @@
 DECL_SELECTION_IR(LABEL, LabelInstruction)
 DECL_SELECTION_IR(MOV, UnaryInstruction)
+DECL_SELECTION_IR(MOV_DF, BinaryInstruction)
+DECL_SELECTION_IR(LOAD_DF_IMM, UnaryInstruction)
 DECL_SELECTION_IR(NOT, UnaryInstruction)
 DECL_SELECTION_IR(LZD, UnaryInstruction)
 DECL_SELECTION_IR(RNDZ, UnaryInstruction)
@@ -32,6 +34,8 @@ DECL_SELECTION_IR(BARRIER, BarrierInstruction)
 DECL_SELECTION_IR(FENCE, FenceInstruction)
 DECL_SELECTION_IR(UNTYPED_READ, UntypedReadInstruction)
 DECL_SELECTION_IR(UNTYPED_WRITE, UntypedWriteInstruction)
+DECL_SELECTION_IR(READ_FLOAT64, ReadFloat64Instruction)
+DECL_SELECTION_IR(WRITE_FLOAT64, WriteFloat64Instruction)
 DECL_SELECTION_IR(BYTE_GATHER, ByteGatherInstruction)
 DECL_SELECTION_IR(BYTE_SCATTER, ByteScatterInstruction)
 DECL_SELECTION_IR(SAMPLE, SampleInstruction)
diff --git a/backend/src/backend/gen_reg_allocation.cpp b/backend/src/backend/gen_reg_allocation.cpp
index 9765b02..e7c96ac 100644
--- a/backend/src/backend/gen_reg_allocation.cpp
+++ b/backend/src/backend/gen_reg_allocation.cpp
@@ -458,7 +458,6 @@ namespace gbe
   }
 
   bool GenRegAllocator::Opaque::allocateGRFs(Selection &selection) {
-
     // Perform the linear scan allocator
     const uint32_t regNum = ctx.sel->getRegNum();
     for (uint32_t startID = 0; startID < regNum; ++startID) {
diff --git a/backend/src/backend/gen_register.hpp b/backend/src/backend/gen_register.hpp
index d772b0d..ef0a2e4 100644
--- a/backend/src/backend/gen_register.hpp
+++ b/backend/src/backend/gen_register.hpp
@@ -69,11 +69,12 @@ namespace gbe
   /*! Type size in bytes for each Gen type */
   INLINE int typeSize(uint32_t type) {
     switch(type) {
+      case GEN_TYPE_DF:
+        return 8;
       case GEN_TYPE_UD:
       case GEN_TYPE_D:
       case GEN_TYPE_F:
         return 4;
-      case GEN_TYPE_HF:
       case GEN_TYPE_UW:
       case GEN_TYPE_W:
         return 2;
@@ -192,6 +193,7 @@ namespace gbe
 
     /*! For immediates or virtual register */
     union {
+      double df;
       float f;
       int32_t d;
       uint32_t ud;
@@ -211,6 +213,31 @@ namespace gbe
     uint32_t quarter:1;      //!< To choose which part we want (Q1 / Q2)
     uint32_t address_mode:1; //!< direct or indirect
 
+    static INLINE GenRegister offset(GenRegister reg, int nr, int subnr = 0) {
+      GenRegister r = reg;
+      r.nr += nr;
+      r.subnr += subnr;
+      return r;
+    }
+
+    INLINE bool isimmdf(void) const {
+      if (type == GEN_TYPE_DF && file == GEN_IMMEDIATE_VALUE)
+        return true;
+      return false;
+    }
+
+    INLINE bool isdf(void) const {
+      if (type == GEN_TYPE_DF && file == GEN_GENERAL_REGISTER_FILE)
+        return true;
+      return false;
+    }
+
+    static INLINE GenRegister h2(GenRegister reg) {
+      GenRegister r = reg;
+      r.hstride = GEN_HORIZONTAL_STRIDE_2;
+      return r;
+    }
+
     static INLINE GenRegister QnVirtual(GenRegister reg, uint32_t quarter) {
       GBE_ASSERT(reg.physical == 0);
       if (reg.hstride == GEN_HORIZONTAL_STRIDE_0) // scalar register
@@ -293,6 +320,18 @@ namespace gbe
       return reg;
     }
 
+    static INLINE GenRegister df16(uint32_t file, ir::Register reg) {
+      return retype(vec16(file, reg), GEN_TYPE_DF);
+    }
+
+    static INLINE GenRegister df8(uint32_t file, ir::Register reg) {
+      return retype(vec8(file, reg), GEN_TYPE_DF);
+    }
+
+    static INLINE GenRegister df1(uint32_t file, ir::Register reg) {
+      return retype(vec1(file, reg), GEN_TYPE_DF);
+    }
+
     static INLINE GenRegister ud16(uint32_t file, ir::Register reg) {
       return retype(vec16(file, reg), GEN_TYPE_UD);
     }
@@ -371,6 +410,12 @@ namespace gbe
                          GEN_HORIZONTAL_STRIDE_0);
     }
 
+    static INLINE GenRegister immdf(double df) {
+      GenRegister immediate = imm(GEN_TYPE_DF);
+      immediate.value.df = df;
+      return immediate;
+    }
+
     static INLINE GenRegister immf(float f) {
       GenRegister immediate = imm(GEN_TYPE_F);
       immediate.value.f = f;
@@ -448,6 +493,18 @@ namespace gbe
       return vec16(GEN_GENERAL_REGISTER_FILE, reg);
     }
 
+    static INLINE GenRegister df1grf(ir::Register reg) {
+      return df1(GEN_GENERAL_REGISTER_FILE, reg);
+    }
+
+    static INLINE GenRegister df8grf(ir::Register reg) {
+      return df8(GEN_GENERAL_REGISTER_FILE, reg);
+    }
+
+    static INLINE GenRegister df16grf(ir::Register reg) {
+      return df16(GEN_GENERAL_REGISTER_FILE, reg);
+    }
+
     static INLINE GenRegister ud16grf(ir::Register reg) {
       return ud16(GEN_GENERAL_REGISTER_FILE, reg);
     }
@@ -609,10 +666,26 @@ namespace gbe
     }
 
     static INLINE GenRegister suboffset(GenRegister reg, uint32_t delta) {
-      reg.subnr += delta * typeSize(reg.type);
+      if (reg.hstride != GEN_HORIZONTAL_STRIDE_0) {
+        reg.subnr += delta * typeSize(reg.type);
+        reg.nr += reg.subnr / 32;
+        reg.subnr %= 32;
+      }
       return reg;
     }
 
+    static INLINE GenRegister df16(uint32_t file, uint32_t nr, uint32_t subnr) {
+      return retype(vec16(file, nr, subnr), GEN_TYPE_DF);
+    }
+
+    static INLINE GenRegister df8(uint32_t file, uint32_t nr, uint32_t subnr) {
+      return retype(vec8(file, nr, subnr), GEN_TYPE_DF);
+    }
+
+    static INLINE GenRegister df1(uint32_t file, uint32_t nr, uint32_t subnr) {
+      return retype(vec1(file, nr, subnr), GEN_TYPE_DF);
+    }
+
     static INLINE GenRegister ud16(uint32_t file, uint32_t nr, uint32_t subnr) {
       return retype(vec16(file, nr, subnr), GEN_TYPE_UD);
     }
@@ -685,6 +758,18 @@ namespace gbe
       return vec16(GEN_GENERAL_REGISTER_FILE, nr, subnr);
     }
 
+    static INLINE GenRegister df16grf(uint32_t nr, uint32_t subnr) {
+      return df16(GEN_GENERAL_REGISTER_FILE, nr, subnr);
+    }
+
+    static INLINE GenRegister df8grf(uint32_t nr, uint32_t subnr) {
+      return df8(GEN_GENERAL_REGISTER_FILE, nr, subnr);
+    }
+
+    static INLINE GenRegister df1grf(uint32_t nr, uint32_t subnr) {
+      return df1(GEN_GENERAL_REGISTER_FILE, nr, subnr);
+    }
+
     static INLINE GenRegister ud16grf(uint32_t nr, uint32_t subnr) {
       return ud16(GEN_GENERAL_REGISTER_FILE, nr, subnr);
     }
@@ -790,6 +875,7 @@ namespace gbe
         return SIMD1(values...); \
       } \
     }
+    DECL_REG_ENCODER(dfxgrf, df16grf, df8grf, df1grf);
     DECL_REG_ENCODER(fxgrf, f16grf, f8grf, f1grf);
     DECL_REG_ENCODER(uwxgrf, uw16grf, uw8grf, uw1grf);
     DECL_REG_ENCODER(udxgrf, ud16grf, ud8grf, ud1grf);
-- 
1.8.1.2



More information about the Beignet mailing list