[Beignet] [PATCH 1/2] Support 64-bit float

Yang, Rong R rong.r.yang at intel.com
Thu Jun 20 20:24:40 PDT 2013


Hi, Homer,

 If not set OCL_SIMD_WIDTH, these tests all fail.
 If set OCL_SIMD_WIDTH=8, all pass.

My system is Ubuntu, 32bit

-----Original Message-----
From: beignet-bounces+rong.r.yang=intel.com at lists.freedesktop.org [mailto:beignet-bounces+rong.r.yang=intel.com at lists.freedesktop.org] On Behalf Of Homer Hsing
Sent: Friday, June 21, 2013 10:44 AM
To: beignet at lists.freedesktop.org
Cc: Xing, Homer
Subject: [Beignet] [PATCH 1/2] Support 64-bit float

support:
  arithmetic(+ - *)
  store load
  immediate_value
  if else
  select

not support:
  SIMD16

add "nib control" field in machine instruction format support "nib control"
fix "directly store after load". change hard coded store size (4) to flexible size (4 or 8)

example:

/* support arithmetic store load immediate_value */ kernel void f(global double *src, global double *dst) {
  int i = get_global_id(0);
  double d = 1.234567890123456789;
  dst[i] = d * (src[i] + d);
}

/* support if else */
kernel void f(global float *src, global double *dst) {
  int i = get_global_id(0);
  float d = 1.234567890123456789f;
  if (i < 14)
    dst[i] = d * (d + src[i]);
  else
    dst[i] = 14;
}

/* support select */
kernel void f(global float *src, global double *dst) {
  int i = get_global_id(0);
  float d = 1.234567890123456789f;
  dst[i] = i < 14 ? d : 14;
}

Signed-off-by: Homer Hsing <homer.xing at intel.com>
---
 backend/src/backend/gen_context.cpp                |  17 ++
 backend/src/backend/gen_context.hpp                |   2 +
 backend/src/backend/gen_defs.hpp                   |  10 +-
 backend/src/backend/gen_encoder.cpp                | 203 ++++++++++++++++++++-
 backend/src/backend/gen_encoder.hpp                |   6 +
 .../src/backend/gen_insn_gen7_schedule_info.hxx    |   2 +
 backend/src/backend/gen_insn_selection.cpp         | 148 +++++++++++++--
 backend/src/backend/gen_insn_selection.hxx         |   4 +
 backend/src/backend/gen_reg_allocation.cpp         |   1 -
 backend/src/backend/gen_register.hpp               | 102 ++++++++++-
 10 files changed, 473 insertions(+), 22 deletions(-)

diff --git a/backend/src/backend/gen_context.cpp b/backend/src/backend/gen_context.cpp
index 70c5bcf..53ba73c 100644
--- a/backend/src/backend/gen_context.cpp
+++ b/backend/src/backend/gen_context.cpp
@@ -153,6 +153,8 @@ namespace gbe
     const GenRegister src0 = ra->genReg(insn.src(0));
     const GenRegister src1 = ra->genReg(insn.src(1));
     switch (insn.opcode) {
+      case SEL_OP_LOAD_DF_IMM: p->LOAD_DF_IMM(dst, src1, src0.value.df); break;
+      case SEL_OP_MOV_DF: p->MOV_DF(dst, src0, src1); break;
       case SEL_OP_SEL:  p->SEL(dst, src0, src1); break;
       case SEL_OP_AND:  p->AND(dst, src0, src1); break;
       case SEL_OP_OR:   p->OR (dst, src0, src1);  break;
@@ -269,6 +271,14 @@ namespace gbe
     p->pop();
   }
 
+  void GenContext::emitReadFloat64Instruction(const SelectionInstruction &insn) {
+    const GenRegister dst = ra->genReg(insn.dst(0));
+    const GenRegister src = ra->genReg(insn.src(0));
+    const uint32_t bti = insn.extra.function;
+    const uint32_t elemNum = insn.extra.elem;
+    p->READ_FLOAT64(dst, src, bti, elemNum);  }
+
   void GenContext::emitUntypedReadInstruction(const SelectionInstruction &insn) {
     const GenRegister dst = ra->genReg(insn.dst(0));
     const GenRegister src = ra->genReg(insn.src(0)); @@ -277,6 +287,13 @@ namespace gbe
     p->UNTYPED_READ(dst, src, bti, elemNum);
   }
 
+  void GenContext::emitWriteFloat64Instruction(const SelectionInstruction &insn) {
+    const GenRegister src = ra->genReg(insn.src(0));
+    const uint32_t bti = insn.extra.function;
+    const uint32_t elemNum = insn.extra.elem;
+    p->WRITE_FLOAT64(src, bti, elemNum);  }
+
   void GenContext::emitUntypedWriteInstruction(const SelectionInstruction &insn) {
     const GenRegister src = ra->genReg(insn.src(0));
     const uint32_t bti = insn.extra.function; diff --git a/backend/src/backend/gen_context.hpp b/backend/src/backend/gen_context.hpp
index 1566cbb..804384d 100644
--- a/backend/src/backend/gen_context.hpp
+++ b/backend/src/backend/gen_context.hpp
@@ -87,6 +87,8 @@ namespace gbe
     void emitBarrierInstruction(const SelectionInstruction &insn);
     void emitFenceInstruction(const SelectionInstruction &insn);
     void emitMathInstruction(const SelectionInstruction &insn);
+    void emitReadFloat64Instruction(const SelectionInstruction &insn);
+    void emitWriteFloat64Instruction(const SelectionInstruction &insn);
     void emitUntypedReadInstruction(const SelectionInstruction &insn);
     void emitUntypedWriteInstruction(const SelectionInstruction &insn);
     void emitByteGatherInstruction(const SelectionInstruction &insn); diff --git a/backend/src/backend/gen_defs.hpp b/backend/src/backend/gen_defs.hpp
index f4e4938..9d8db5b 100644
--- a/backend/src/backend/gen_defs.hpp
+++ b/backend/src/backend/gen_defs.hpp
@@ -215,6 +215,7 @@ enum GenMessageTarget {  #define GEN_TYPE_VF  5 /* packed float vector, immediates only? */  #define GEN_TYPE_HF  6
 #define GEN_TYPE_V   6 /* packed int vector, immediates only, uword dest only */
+#define GEN_TYPE_DF  6
 #define GEN_TYPE_F   7
 
 #define GEN_ARF_NULL                  0x00
@@ -303,6 +304,7 @@ enum GenMessageTarget {
 #define GEN_BYTE_SCATTER_BYTE   0
 #define GEN_BYTE_SCATTER_WORD   1
 #define GEN_BYTE_SCATTER_DWORD  2
+#define GEN_BYTE_SCATTER_QWORD  3
 
 #define GEN_SAMPLER_RETURN_FORMAT_FLOAT32     0
 #define GEN_SAMPLER_RETURN_FORMAT_UINT32      2
@@ -418,7 +420,7 @@ struct GenInstruction
       uint32_t src0_reg_type:3;
       uint32_t src1_reg_file:2;
       uint32_t src1_reg_type:3;
-      uint32_t pad:1;
+      uint32_t nib_ctrl:1;
       uint32_t dest_subreg_nr:5;
       uint32_t dest_reg_nr:8;
       uint32_t dest_horiz_stride:2;
@@ -432,7 +434,7 @@ struct GenInstruction
       uint32_t src0_reg_type:3;
       uint32_t src1_reg_file:2;        /* 0x00000c00 */
       uint32_t src1_reg_type:3;        /* 0x00007000 */
-      uint32_t pad:1;
+      uint32_t nib_ctrl:1;
       int dest_indirect_offset:10;        /* offset against the deref'd address reg */
       uint32_t dest_subreg_nr:3; /* subnr for the address reg a0.x */
       uint32_t dest_horiz_stride:2;
@@ -446,7 +448,7 @@ struct GenInstruction
       uint32_t src0_reg_type:3;
       uint32_t src1_reg_file:2;
       uint32_t src1_reg_type:3;
-      uint32_t pad:1;
+      uint32_t nib_ctrl:1;
       uint32_t dest_writemask:4;
       uint32_t dest_subreg_nr:1;
       uint32_t dest_reg_nr:8;
@@ -459,7 +461,7 @@ struct GenInstruction
       uint32_t dest_reg_type:3;
       uint32_t src0_reg_file:2;
       uint32_t src0_reg_type:3;
-      uint32_t pad0:6;
+      uint32_t nib_ctrl:1;
       uint32_t dest_writemask:4;
       int dest_indirect_offset:6;
       uint32_t dest_subreg_nr:3;
diff --git a/backend/src/backend/gen_encoder.cpp b/backend/src/backend/gen_encoder.cpp
index 859a1b9..3d8afe8 100644
--- a/backend/src/backend/gen_encoder.cpp
+++ b/backend/src/backend/gen_encoder.cpp
@@ -235,6 +235,7 @@ namespace gbe
       NOT_IMPLEMENTED;
     insn->header.acc_wr_control = this->curr.accWrEnable;
     insn->header.quarter_control = this->curr.quarterControl;
+    insn->bits1.ia1.nib_ctrl = this->curr.nibControl;
     insn->header.mask_control = this->curr.noMask;
     insn->bits2.ia1.flag_reg_nr = this->curr.flag;
     insn->bits2.ia1.flag_sub_reg_nr = this->curr.subFlag; @@ -355,6 +356,105 @@ namespace gbe
     0
   };
 
+  static int dst_type(int exec_width) {
+    if (exec_width == 8)
+      return GEN_TYPE_UD;
+    if (exec_width == 16)
+      return GEN_TYPE_UW;
+    NOT_IMPLEMENTED;
+    return 0;
+  }
+
+  void GenEncoder::READ_FLOAT64(GenRegister dst, GenRegister src, uint32_t bti, uint32_t elemNum) {
+    int w = curr.execWidth;
+    dst = GenRegister::h2(dst);
+    dst.type = GEN_TYPE_UD;
+    src.type = GEN_TYPE_UD;
+    GenRegister r = GenRegister::retype(GenRegister::suboffset(src, w*2), GEN_TYPE_UD);
+    GenRegister imm4 = GenRegister::immud(4);
+    GenInstruction *insn;
+    insn = next(GEN_OPCODE_SEND);
+    setHeader(insn);
+    setDst(insn, GenRegister::uw16grf(r.nr, 0));
+    setSrc0(insn, GenRegister::ud8grf(src.nr, 0));
+    setSrc1(insn, GenRegister::immud(0));
+    setDPUntypedRW(this, insn, bti, untypedRWMask[1], GEN_UNTYPED_READ, curr.execWidth / 8, curr.execWidth / 8);
+    push();
+    curr.quarterControl = 0;
+    curr.nibControl = 0;
+    MOV(dst, r);
+    if (w == 8)
+      curr.nibControl = 1;
+    else
+      curr.quarterControl = 1;
+    MOV(GenRegister::suboffset(dst, w), GenRegister::suboffset(r, w / 2));
+    pop();
+    ADD(src, src, imm4);
+    insn = next(GEN_OPCODE_SEND);
+    setHeader(insn);
+    setDst(insn, GenRegister::uw16grf(r.nr, 0));
+    setSrc0(insn, GenRegister::ud8grf(src.nr, 0));
+    setSrc1(insn, GenRegister::immud(0));
+    setDPUntypedRW(this, insn, bti, untypedRWMask[1], GEN_UNTYPED_READ, curr.execWidth / 8, curr.execWidth / 8);
+    push();
+    curr.quarterControl = 0;
+    curr.nibControl = 0;
+    MOV(GenRegister::suboffset(dst, 1), r);
+    if (w == 8)
+      curr.nibControl = 1;
+    else
+      curr.quarterControl = 1;
+    MOV(GenRegister::suboffset(dst, w + 1), GenRegister::suboffset(r, w / 2));
+    pop();
+  }
+
+  void GenEncoder::WRITE_FLOAT64(GenRegister msg, uint32_t bti, uint32_t elemNum) {
+    int w = curr.execWidth;
+    GenRegister r = GenRegister::retype(GenRegister::suboffset(msg, w*3), GEN_TYPE_UD);
+    r.type = GEN_TYPE_UD;
+    GenRegister hdr = GenRegister::h2(r);
+    GenRegister src = GenRegister::ud16grf(msg.nr + w / 8, 0);
+    src.hstride = GEN_HORIZONTAL_STRIDE_2;
+    GenRegister data = GenRegister::offset(r, w / 8);
+    GenRegister imm4 = GenRegister::immud(4);
+    MOV(r, GenRegister::ud8grf(msg.nr, 0));
+    push();
+    curr.quarterControl = 0;
+    curr.nibControl = 0;
+    MOV(data, src);
+    if (w == 8)
+      curr.nibControl = 1;
+    else
+      curr.quarterControl = 1;
+    MOV(GenRegister::suboffset(data, w / 2), GenRegister::suboffset(src, w));
+    pop();
+    GenInstruction *insn;
+    insn = next(GEN_OPCODE_SEND);
+    setHeader(insn);
+    setDst(insn, GenRegister::retype(GenRegister::null(), dst_type(curr.execWidth)));
+    setSrc0(insn, GenRegister::ud8grf(hdr.nr, 0));
+    setSrc1(insn, GenRegister::immud(0));
+    setDPUntypedRW(this, insn, bti, untypedRWMask[1], 
+ GEN_UNTYPED_WRITE, curr.execWidth / 4, 0);
+
+    ADD(r, GenRegister::ud8grf(msg.nr, 0), imm4);
+    push();
+    curr.quarterControl = 0;
+    curr.nibControl = 0;
+    MOV(data, GenRegister::suboffset(src, 1));
+    if (w == 8)
+      curr.nibControl = 1;
+    else
+      curr.quarterControl = 1;
+    MOV(GenRegister::suboffset(data, w / 2), GenRegister::suboffset(src, w + 1));
+    pop();
+    insn = next(GEN_OPCODE_SEND);
+    setHeader(insn);
+    setDst(insn, GenRegister::retype(GenRegister::null(), dst_type(curr.execWidth)));
+    setSrc0(insn, GenRegister::ud8grf(hdr.nr, 0));
+    setSrc1(insn, GenRegister::immud(0));
+    setDPUntypedRW(this, insn, bti, untypedRWMask[1], 
+ GEN_UNTYPED_WRITE, curr.execWidth / 4, 0);  }
+
   void GenEncoder::UNTYPED_READ(GenRegister dst, GenRegister src, uint32_t bti, uint32_t elemNum) {
     GenInstruction *insn = this->next(GEN_OPCODE_SEND);
     assert(elemNum >= 1 || elemNum <= 4); @@ -467,7 +567,25 @@ namespace gbe
   }
 
   INLINE void alu1(GenEncoder *p, uint32_t opcode, GenRegister dst, GenRegister src) {
-     if (needToSplitAlu1(p, dst, src) == false) {
+     if (dst.isdf() && src.isdf()) {
+       int w = p->curr.execWidth;
+       p->push();
+       p->curr.quarterControl = 0;
+       p->curr.nibControl = 0;
+       GenInstruction *insn = p->next(opcode);
+       p->setHeader(insn);
+       p->setDst(insn, dst);
+       p->setSrc0(insn, src);
+       if (w == 8)
+         p->curr.nibControl = 1; // second 1/8 mask
+       else // w == 16
+         p->curr.quarterControl = 1; // second 1/4 mask
+       insn = p->next(opcode);
+       p->setHeader(insn);
+       p->setDst(insn, GenRegister::suboffset(dst, w / 2));
+       p->setSrc0(insn, GenRegister::suboffset(src, w / 2));
+       p->pop();
+     } else if (needToSplitAlu1(p, dst, src) == false) {
        GenInstruction *insn = p->next(opcode);
        p->setHeader(insn);
        p->setDst(insn, dst);
@@ -499,7 +617,27 @@ namespace gbe
                    GenRegister src0,
                    GenRegister src1)
   {
-    if (needToSplitAlu2(p, dst, src0, src1) == false) {
+    if (dst.isdf() && src0.isdf() && src1.isdf()) {
+       int w = p->curr.execWidth;
+       p->push();
+       p->curr.quarterControl = 0;
+       p->curr.nibControl = 0;
+       GenInstruction *insn = p->next(opcode);
+       p->setHeader(insn);
+       p->setDst(insn, dst);
+       p->setSrc0(insn, src0);
+       p->setSrc1(insn, src1);
+       if (w == 8)
+         p->curr.nibControl = 1; // second 1/8 mask
+       else // w == 16
+         p->curr.quarterControl = 1; // second 1/4 mask
+       insn = p->next(opcode);
+       p->setHeader(insn);
+       p->setDst(insn, GenRegister::suboffset(dst, w / 2));
+       p->setSrc0(insn, GenRegister::suboffset(src0, w / 2));
+       p->setSrc1(insn, GenRegister::suboffset(src1, w / 2));
+       p->pop();
+    } else if (needToSplitAlu2(p, dst, src0, src1) == false) {
        GenInstruction *insn = p->next(opcode);
        p->setHeader(insn);
        p->setDst(insn, dst);
@@ -620,6 +758,67 @@ namespace gbe
     alu3(this, GEN_OPCODE_##OP, dest, src0, src1, src2); \
   }
 
+  void GenEncoder::LOAD_DF_IMM(GenRegister dest, GenRegister tmp, double value) {
+    union { double d; unsigned u[2]; } u;
+    u.d = value;
+    GenRegister r = GenRegister::retype(tmp, GEN_TYPE_UD);
+    push();
+    curr.predicate = GEN_PREDICATE_NONE;
+    curr.execWidth = 1;
+    MOV(r, GenRegister::immud(u.u[1]));
+    MOV(GenRegister::suboffset(r, 1), GenRegister::immud(u.u[0]));
+    pop();
+    r.type = GEN_TYPE_DF;
+    r.vstride = GEN_VERTICAL_STRIDE_0;
+    r.width = GEN_WIDTH_1;
+    r.hstride = GEN_HORIZONTAL_STRIDE_0;
+    push();
+    MOV(dest, r);
+    pop();
+  }
+
+  void GenEncoder::MOV_DF(GenRegister dest, GenRegister src0, GenRegister r) {
+    int w = curr.execWidth;
+    if (src0.isdf()) {
+      push();
+      curr.execWidth = 16;
+      MOV(dest, src0);
+      if (w == 16) {
+        curr.quarterControl = 1;
+        MOV(GenRegister::QnPhysical(dest, w / 4), GenRegister::QnPhysical(src0, w / 4));
+      }
+      pop();
+    } else {
+      GenRegister r0 = GenRegister::h2(r);
+      push();
+      curr.execWidth = 8;
+      curr.predicate = GEN_PREDICATE_NONE;
+      MOV(r0, src0);
+      MOV(GenRegister::suboffset(r0, 8), GenRegister::suboffset(src0, 4));
+      curr.predicate = GEN_PREDICATE_NORMAL;
+      curr.quarterControl = 0;
+      curr.nibControl = 0;
+      MOV(dest, r);
+      curr.nibControl = 1;
+      MOV(GenRegister::suboffset(dest, 4), GenRegister::suboffset(r, 8));
+      pop();
+      if (w == 16) {
+        push();
+        curr.execWidth = 8;
+        curr.predicate = GEN_PREDICATE_NONE;
+        MOV(r0, GenRegister::suboffset(src0, 8));
+        MOV(GenRegister::suboffset(r0, 8), GenRegister::suboffset(src0, 12));
+        curr.predicate = GEN_PREDICATE_NORMAL;
+        curr.quarterControl = 1;
+        curr.nibControl = 0;
+        MOV(GenRegister::suboffset(dest, 8), r);
+        curr.nibControl = 1;
+        MOV(GenRegister::suboffset(dest, 12), GenRegister::suboffset(r, 8));
+        pop();
+      }
+    }
+  }
+
   ALU1(MOV)
   ALU1(RNDZ)
   ALU1(RNDE)
diff --git a/backend/src/backend/gen_encoder.hpp b/backend/src/backend/gen_encoder.hpp
index c98774f..1a5dcf9 100644
--- a/backend/src/backend/gen_encoder.hpp
+++ b/backend/src/backend/gen_encoder.hpp
@@ -113,9 +113,11 @@ namespace gbe
     ALU2(LINE)
     ALU2(PLN)
     ALU3(MAD)
+    ALU2(MOV_DF);
 #undef ALU1
 #undef ALU2
 #undef ALU3
+    void LOAD_DF_IMM(GenRegister dest, GenRegister tmp, double value);
     /*! Barrier message (to synchronize threads of a workgroup) */
     void BARRIER(GenRegister src);
     /*! Memory fence message (to order loads and stores between threads) */ @@ -132,6 +134,10 @@ namespace gbe
     void NOP(void);
     /*! Wait instruction (used for the barrier) */
     void WAIT(void);
+    /*! Read 64-bits float arrays */
+    void READ_FLOAT64(GenRegister dst, GenRegister src, uint32_t bti, uint32_t elemNum);
+    /*! Write 64-bits float arrays */
+    void WRITE_FLOAT64(GenRegister src, uint32_t bti, uint32_t 
+ elemNum);
     /*! Untyped read (upto 4 channels) */
     void UNTYPED_READ(GenRegister dst, GenRegister src, uint32_t bti, uint32_t elemNum);
     /*! Untyped write (upto 4 channels) */ diff --git a/backend/src/backend/gen_insn_gen7_schedule_info.hxx b/backend/src/backend/gen_insn_gen7_schedule_info.hxx
index 098d9ec..a3b4621 100644
--- a/backend/src/backend/gen_insn_gen7_schedule_info.hxx
+++ b/backend/src/backend/gen_insn_gen7_schedule_info.hxx
@@ -12,6 +12,8 @@ DECL_GEN7_SCHEDULE(Wait,            20,        2,        2)
 DECL_GEN7_SCHEDULE(Math,            20,        4,        2)
 DECL_GEN7_SCHEDULE(Barrier,         80,        1,        1)
 DECL_GEN7_SCHEDULE(Fence,           80,        1,        1)
+DECL_GEN7_SCHEDULE(ReadFloat64,     80,        1,        1)
+DECL_GEN7_SCHEDULE(WriteFloat64,    80,        1,        1)
 DECL_GEN7_SCHEDULE(UntypedRead,     80,        1,        1)
 DECL_GEN7_SCHEDULE(UntypedWrite,    80,        1,        1)
 DECL_GEN7_SCHEDULE(ByteGather,      80,        1,        1)
diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp
index 4e7cebd..5901419 100644
--- a/backend/src/backend/gen_insn_selection.cpp
+++ b/backend/src/backend/gen_insn_selection.cpp
@@ -129,6 +129,7 @@ namespace gbe
       case TYPE_S32: return GEN_TYPE_D;
       case TYPE_U32: return GEN_TYPE_UD;
       case TYPE_FLOAT: return GEN_TYPE_F;
+      case TYPE_DOUBLE: return GEN_TYPE_DF;
       default: NOT_SUPPORTED; return GEN_TYPE_F;
     }
   }
@@ -166,11 +167,13 @@ namespace gbe
 
   bool SelectionInstruction::isRead(void) const {
     return this->opcode == SEL_OP_UNTYPED_READ ||
+           this->opcode == SEL_OP_READ_FLOAT64 ||
            this->opcode == SEL_OP_BYTE_GATHER;
   }
 
   bool SelectionInstruction::isWrite(void) const {
     return this->opcode == SEL_OP_UNTYPED_WRITE ||
+           this->opcode == SEL_OP_WRITE_FLOAT64 ||
            this->opcode == SEL_OP_BYTE_SCATTER;
   }
 
@@ -406,6 +409,8 @@ namespace gbe
 #define ALU3(OP) \
   INLINE void OP(Reg dst, Reg src0, Reg src1, Reg src2) { ALU3(SEL_OP_##OP, dst, src0, src1, src2); }
     ALU1(MOV)
+    ALU2(MOV_DF)
+    ALU2(LOAD_DF_IMM)
     ALU1(RNDZ)
     ALU1(RNDE)
     ALU2(SEL)
@@ -449,6 +454,10 @@ namespace gbe
     void NOP(void);
     /*! Wait instruction (used for the barrier) */
     void WAIT(void);
+    /*! Read 64 bits float array */
+    void READ_FLOAT64(Reg addr, const GenRegister *dst, uint32_t elemNum, uint32_t bti);
+    /*! Write 64 bits float array */
+    void WRITE_FLOAT64(Reg addr, const GenRegister *src, uint32_t 
+ elemNum, uint32_t bti);
     /*! Untyped read (up to 4 elements) */
     void UNTYPED_READ(Reg addr, const GenRegister *dst, uint32_t elemNum, uint32_t bti);
     /*! Untyped write (up to 4 elements) */ @@ -610,20 +619,23 @@ namespace gbe
 
   ir::Register Selection::Opaque::replaceDst(SelectionInstruction *insn, uint32_t regID) {
     SelectionBlock *block = insn->parent;
-    const uint32_t simdWidth = ctx.getSimdWidth();
+    uint32_t simdWidth = ctx.getSimdWidth();
     ir::Register tmp;
+    ir::RegisterFamily f = file.get(insn->dst(regID).reg()).family;
+    int genType = f == ir::FAMILY_QWORD ? GEN_TYPE_DF : GEN_TYPE_F;
+    GenRegister gr;
 
     // This will append the temporary register in the instruction block
     this->block = block;
-    tmp = this->reg(ir::FAMILY_DWORD);
+    tmp = this->reg(f);
 
     // Generate the MOV instruction and replace the register in the instruction
     SelectionInstruction *mov = this->create(SEL_OP_MOV, 1, 1);
-    mov->dst(0) = GenRegister::retype(insn->dst(regID), GEN_TYPE_F);
+    mov->dst(0) = GenRegister::retype(insn->dst(regID), genType);
     mov->state = GenInstructionState(simdWidth);
-    insn->dst(regID) = mov->src(0) = GenRegister::fxgrf(simdWidth, tmp);
+    gr = f == ir::FAMILY_QWORD ? GenRegister::dfxgrf(simdWidth, tmp) : GenRegister::fxgrf(simdWidth, tmp);
+    insn->dst(regID) = mov->src(0) = gr;
     insn->append(*mov);
-
     return tmp;
   }
 
@@ -657,6 +669,7 @@ namespace gbe
       case FAMILY_WORD: SEL_REG(uw16grf, uw8grf, uw1grf); break;
       case FAMILY_BYTE: SEL_REG(ub16grf, ub8grf, ub1grf); break;
       case FAMILY_DWORD: SEL_REG(f16grf, f8grf, f1grf); break;
+      case FAMILY_QWORD: SEL_REG(df16grf, df8grf, df1grf); break;
       default: NOT_SUPPORTED;
     }
     GBE_ASSERT(false);
@@ -719,6 +732,33 @@ namespace gbe
   void Selection::Opaque::NOP(void) { this->appendInsn(SEL_OP_NOP, 0, 0); }
   void Selection::Opaque::WAIT(void) { this->appendInsn(SEL_OP_WAIT, 0, 0); }
 
+  void Selection::Opaque::READ_FLOAT64(Reg addr,
+                                       const GenRegister *dst,
+                                       uint32_t elemNum,
+                                       uint32_t bti)  {
+    SelectionInstruction *insn = this->appendInsn(SEL_OP_READ_FLOAT64, elemNum, 1);
+    SelectionVector *srcVector = this->appendVector();
+    SelectionVector *dstVector = this->appendVector();
+
+    // Regular instruction to encode
+    for (uint32_t elemID = 0; elemID < elemNum; ++elemID)
+      insn->dst(elemID) = dst[elemID];
+    insn->src(0) = addr;
+    insn->extra.function = bti;
+    insn->extra.elem = elemNum;
+
+    // Sends require contiguous allocation
+    dstVector->regNum = elemNum;
+    dstVector->isSrc = 0;
+    dstVector->reg = &insn->dst(0);
+
+    // Source cannot be scalar (yet)
+    srcVector->regNum = 1;
+    srcVector->isSrc = 1;
+    srcVector->reg = &insn->src(0);
+  }
+
   void Selection::Opaque::UNTYPED_READ(Reg addr,
                                        const GenRegister *dst,
                                        uint32_t elemNum, @@ -746,6 +786,27 @@ namespace gbe
     srcVector->reg = &insn->src(0);
   }
 
+  void Selection::Opaque::WRITE_FLOAT64(Reg addr,
+                                        const GenRegister *src,
+                                        uint32_t elemNum,
+                                        uint32_t bti)  {
+    SelectionInstruction *insn = this->appendInsn(SEL_OP_WRITE_FLOAT64, 0, elemNum+1);
+    SelectionVector *vector = this->appendVector();
+
+    // Regular instruction to encode
+    insn->src(0) = addr;
+    for (uint32_t elemID = 0; elemID < elemNum; ++elemID)
+      insn->src(elemID+1) = src[elemID];
+    insn->extra.function = bti;
+    insn->extra.elem = elemNum;
+
+    // Sends require contiguous allocation for the sources
+    vector->regNum = elemNum+1;
+    vector->reg = &insn->src(0);
+    vector->isSrc = 1;
+  }
+
   void Selection::Opaque::UNTYPED_WRITE(Reg addr,
                                         const GenRegister *src,
                                         uint32_t elemNum, @@ -1092,6 +1153,15 @@ namespace gbe
   // Implementation of all patterns
   ///////////////////////////////////////////////////////////////////////////
 
+  bool canGetRegisterFromImmediate(const ir::Instruction &insn) {
+    using namespace ir;
+    const auto &childInsn = cast<LoadImmInstruction>(insn);
+    const auto &imm = childInsn.getImmediate();
+    if(imm.type != TYPE_DOUBLE)
+      return true;
+    return false;
+  }
+
   GenRegister getRegisterFromImmediate(ir::Immediate imm)
   {
     using namespace ir;
@@ -1103,6 +1173,7 @@ namespace gbe
       case TYPE_S16: return  GenRegister::immw(imm.data.s16);
       case TYPE_U8:  return GenRegister::immuw(imm.data.u8);
       case TYPE_S8:  return GenRegister::immw(imm.data.s8);
+      case TYPE_DOUBLE: return GenRegister::immdf(imm.data.f64);
       default: NOT_SUPPORTED; return GenRegister::immuw(0);
     }
   }
@@ -1146,7 +1217,13 @@ namespace gbe
       const GenRegister src = sel.selReg(insn.getSrc(0));
       switch (opcode) {
         case ir::OP_ABS: sel.MOV(dst, GenRegister::abs(src)); break;
-        case ir::OP_MOV: sel.MOV(dst, src); break;
+        case ir::OP_MOV:
+          if (dst.isdf()) {
+            ir::Register r = sel.reg(ir::RegisterFamily::FAMILY_QWORD);
+            sel.MOV_DF(dst, src, sel.selReg(r));
+          } else
+            sel.MOV(dst, src);
+          break;
         case ir::OP_RNDD: sel.RNDD(dst, src); break;
         case ir::OP_RNDE: sel.RNDE(dst, src); break;
         case ir::OP_RNDU: sel.RNDU(dst, src); break; @@ -1225,14 +1302,14 @@ namespace gbe
       SelectionDAG *dag1 = dag.child[1];
 
       // Right source can always be an immediate
-      if (OCL_OPTIMIZE_IMMEDIATE && dag1 != NULL && dag1->insn.getOpcode() == OP_LOADI) {
+      if (OCL_OPTIMIZE_IMMEDIATE && dag1 != NULL && 
+ dag1->insn.getOpcode() == OP_LOADI && 
+ canGetRegisterFromImmediate(dag1->insn)) {
         const auto &childInsn = cast<LoadImmInstruction>(dag1->insn);
         src0 = sel.selReg(insn.getSrc(0), type);
         src1 = getRegisterFromImmediate(childInsn.getImmediate());
         if (dag0) dag0->isRoot = 1;
       }
       // Left source cannot be immediate but it is OK if we can commute
-      else if (OCL_OPTIMIZE_IMMEDIATE && dag0 != NULL && insn.commutes() && dag0->insn.getOpcode() == OP_LOADI) {
+      else if (OCL_OPTIMIZE_IMMEDIATE && dag0 != NULL && 
+ insn.commutes() && dag0->insn.getOpcode() == OP_LOADI && 
+ canGetRegisterFromImmediate(dag0->insn)) {
         const auto &childInsn = cast<LoadImmInstruction>(dag0->insn);
         src0 = sel.selReg(insn.getSrc(1), type);
         src1 = getRegisterFromImmediate(childInsn.getImmediate());
@@ -1268,7 +1345,7 @@ namespace gbe
         case OP_SHR: sel.SHR(dst, src0, src1); break;
         case OP_ASR: sel.ASR(dst, src0, src1); break;
         case OP_MUL:
-          if (type == TYPE_FLOAT)
+          if (type == TYPE_FLOAT || type == TYPE_DOUBLE)
             sel.MUL(dst, src0, src1);
           else if (type == TYPE_U32 || type == TYPE_S32) {
             sel.pop();
@@ -1599,6 +1676,7 @@ namespace gbe
         case TYPE_S16: sel.MOV(dst, GenRegister::immw(imm.data.s16)); break;
         case TYPE_U8:  sel.MOV(dst, GenRegister::immuw(imm.data.u8)); break;
         case TYPE_S8:  sel.MOV(dst, GenRegister::immw(imm.data.s8)); break;
+        case TYPE_DOUBLE: sel.LOAD_DF_IMM(dst, 
+ GenRegister::immdf(imm.data.f64), sel.selReg(sel.reg(FAMILY_QWORD))); 
+ break;
         default: NOT_SUPPORTED;
       }
       sel.pop();
@@ -1650,6 +1728,8 @@ namespace gbe
   INLINE uint32_t getByteScatterGatherSize(ir::Type type) {
     using namespace ir;
     switch (type) {
+      case TYPE_DOUBLE:
+        return GEN_BYTE_SCATTER_QWORD;
       case TYPE_FLOAT:
       case TYPE_U32:
       case TYPE_S32:
@@ -1681,6 +1761,22 @@ namespace gbe
       sel.UNTYPED_READ(addr, dst.data(), valueNum, bti);
     }
 
+    void emitReadFloat64(Selection::Opaque &sel,
+                         const ir::LoadInstruction &insn,
+                         GenRegister addr,
+                         uint32_t bti) const
+    {
+      using namespace ir;
+      const uint32_t valueNum = insn.getValueNum();
+      vector<GenRegister> dst(valueNum);
+      for (uint32_t dstID = 0; dstID < valueNum; ++dstID)
+        dst[dstID] = GenRegister::retype(sel.selReg(insn.getValue(dstID)), GEN_TYPE_F);
+      dst.push_back(sel.selReg(sel.reg(FAMILY_QWORD)));
+      if (sel.ctx.getSimdWidth() == 16)
+        dst.push_back(sel.selReg(sel.reg(FAMILY_QWORD)));
+      sel.READ_FLOAT64(addr, dst.data(), dst.size(), bti);
+    }
+
     void emitByteGather(Selection::Opaque &sel,
                         const ir::LoadInstruction &insn,
                         const uint32_t elemSize, @@ -1732,6 +1828,8 @@ namespace gbe
       const uint32_t elemSize = getByteScatterGatherSize(type);
       if (insn.getAddressSpace() == MEM_CONSTANT)
         this->emitIndirectMove(sel, insn, address);
+      else if (insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_QWORD)
+        this->emitReadFloat64(sel, insn, address, space == MEM_LOCAL ? 
+ 0xfe : 0x00);
       else if (insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_DWORD)
         this->emitUntypedRead(sel, insn, address, space == MEM_LOCAL ? 0xfe : 0x00);
       else {
@@ -1762,6 +1860,25 @@ namespace gbe
       sel.UNTYPED_WRITE(addr, value.data(), valueNum, bti);
     }
 
+    void emitWriteFloat64(Selection::Opaque &sel,
+                          const ir::StoreInstruction &insn,
+                          uint32_t bti) const
+    {
+      using namespace ir;
+      const uint32_t valueNum = insn.getValueNum();
+      const uint32_t addrID = ir::StoreInstruction::addressIndex;
+      GenRegister addr;
+      vector<GenRegister> value(valueNum);
+
+      addr = GenRegister::retype(sel.selReg(insn.getSrc(addrID)), GEN_TYPE_F);
+      for (uint32_t valueID = 0; valueID < valueNum; ++valueID)
+        value[valueID] = GenRegister::retype(sel.selReg(insn.getValue(valueID)), GEN_TYPE_F);
+      value.push_back(sel.selReg(sel.reg(FAMILY_QWORD)));
+      if (sel.ctx.getSimdWidth() == 16)
+        value.push_back(sel.selReg(sel.reg(FAMILY_QWORD)));
+      sel.WRITE_FLOAT64(addr, value.data(), value.size(), bti);
+    }
+
     void emitByteScatter(Selection::Opaque &sel,
                          const ir::StoreInstruction &insn,
                          const uint32_t elemSize, @@ -1791,7 +1908,9 @@ namespace gbe
       const uint32_t bti = space == MEM_LOCAL ? 0xfe : 0x01;
       const Type type = insn.getValueType();
       const uint32_t elemSize = getByteScatterGatherSize(type);
-      if (insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_DWORD)
+      if (insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_QWORD)
+        this->emitWriteFloat64(sel, insn, bti);
+      else if (insn.isAligned() == true && elemSize == 
+ GEN_BYTE_SCATTER_DWORD)
         this->emitUntypedWrite(sel, insn, bti);
       else {
         const GenRegister address = sel.selReg(insn.getAddress()); @@ -1839,7 +1958,7 @@ namespace gbe
       SelectionDAG *dag1 = dag.child[1];
 
       // Right source can always be an immediate
-      if (OCL_OPTIMIZE_IMMEDIATE && dag1 != NULL && dag1->insn.getOpcode() == OP_LOADI) {
+      if (OCL_OPTIMIZE_IMMEDIATE && dag1 != NULL && 
+ dag1->insn.getOpcode() == OP_LOADI && 
+ canGetRegisterFromImmediate(dag1->insn)) {
         const auto &childInsn = cast<LoadImmInstruction>(dag1->insn);
         src0 = sel.selReg(insn.getSrc(0), type);
         src1 = getRegisterFromImmediate(childInsn.getImmediate());
@@ -1873,7 +1992,7 @@ namespace gbe
       const GenRegister src = sel.selReg(insn.getSrc(0), srcType);
 
       // We need two instructions to make the conversion
-      if (dstFamily != FAMILY_DWORD && srcFamily == FAMILY_DWORD) {
+      if (dstFamily != FAMILY_DWORD && dstFamily != FAMILY_QWORD && 
+ srcFamily == FAMILY_DWORD) {
         GenRegister unpacked;
         if (dstFamily == FAMILY_WORD) {
           const uint32_t type = TYPE_U16 ? GEN_TYPE_UW : GEN_TYPE_W; @@ -1886,6 +2005,9 @@ namespace gbe
         }
         sel.MOV(unpacked, src);
         sel.MOV(dst, unpacked);
+      } else if (dst.isdf()) {
+        ir::Register r = sel.reg(ir::RegisterFamily::FAMILY_QWORD);
+        sel.MOV_DF(dst, src, sel.selReg(r));
       } else
         sel.MOV(dst, src);
       return true;
@@ -1919,7 +2041,7 @@ namespace gbe
       SelectionDAG *dag2 = dag.child[2];
 
       // Right source can always be an immediate
-      if (OCL_OPTIMIZE_IMMEDIATE && dag2 != NULL && dag2->insn.getOpcode() == OP_LOADI) {
+      if (OCL_OPTIMIZE_IMMEDIATE && dag2 != NULL && 
+ dag2->insn.getOpcode() == OP_LOADI && 
+ canGetRegisterFromImmediate(dag2->insn)) {
         const auto &childInsn = cast<LoadImmInstruction>(dag2->insn);
         src0 = sel.selReg(insn.getSrc(SelectInstruction::src0Index), type);
         src1 = getRegisterFromImmediate(childInsn.getImmediate());
diff --git a/backend/src/backend/gen_insn_selection.hxx b/backend/src/backend/gen_insn_selection.hxx
index 789c81c..4b5525b 100644
--- a/backend/src/backend/gen_insn_selection.hxx
+++ b/backend/src/backend/gen_insn_selection.hxx
@@ -1,5 +1,7 @@
 DECL_SELECTION_IR(LABEL, LabelInstruction)  DECL_SELECTION_IR(MOV, UnaryInstruction)
+DECL_SELECTION_IR(MOV_DF, BinaryInstruction) 
+DECL_SELECTION_IR(LOAD_DF_IMM, BinaryInstruction)
 DECL_SELECTION_IR(NOT, UnaryInstruction)  DECL_SELECTION_IR(LZD, UnaryInstruction)  DECL_SELECTION_IR(RNDZ, UnaryInstruction) @@ -32,6 +34,8 @@ DECL_SELECTION_IR(BARRIER, BarrierInstruction)  DECL_SELECTION_IR(FENCE, FenceInstruction)  DECL_SELECTION_IR(UNTYPED_READ, UntypedReadInstruction)  DECL_SELECTION_IR(UNTYPED_WRITE, UntypedWriteInstruction)
+DECL_SELECTION_IR(READ_FLOAT64, ReadFloat64Instruction) 
+DECL_SELECTION_IR(WRITE_FLOAT64, WriteFloat64Instruction)
 DECL_SELECTION_IR(BYTE_GATHER, ByteGatherInstruction)  DECL_SELECTION_IR(BYTE_SCATTER, ByteScatterInstruction)  DECL_SELECTION_IR(SAMPLE, SampleInstruction) diff --git a/backend/src/backend/gen_reg_allocation.cpp b/backend/src/backend/gen_reg_allocation.cpp
index 9765b02..e7c96ac 100644
--- a/backend/src/backend/gen_reg_allocation.cpp
+++ b/backend/src/backend/gen_reg_allocation.cpp
@@ -458,7 +458,6 @@ namespace gbe
   }
 
   bool GenRegAllocator::Opaque::allocateGRFs(Selection &selection) {
-
     // Perform the linear scan allocator
     const uint32_t regNum = ctx.sel->getRegNum();
     for (uint32_t startID = 0; startID < regNum; ++startID) { diff --git a/backend/src/backend/gen_register.hpp b/backend/src/backend/gen_register.hpp
index d772b0d..fedb743 100644
--- a/backend/src/backend/gen_register.hpp
+++ b/backend/src/backend/gen_register.hpp
@@ -69,11 +69,12 @@ namespace gbe
   /*! Type size in bytes for each Gen type */
   INLINE int typeSize(uint32_t type) {
     switch(type) {
+      case GEN_TYPE_DF:
+        return 8;
       case GEN_TYPE_UD:
       case GEN_TYPE_D:
       case GEN_TYPE_F:
         return 4;
-      case GEN_TYPE_HF:
       case GEN_TYPE_UW:
       case GEN_TYPE_W:
         return 2;
@@ -110,6 +111,7 @@ namespace gbe
     INLINE GenInstructionState(uint32_t simdWidth = 8) {
       this->execWidth = simdWidth;
       this->quarterControl = GEN_COMPRESSION_Q1;
+      this->nibControl = 0;
       this->accWrEnable = 0;
       this->noMask = 0;
       this->flag = 0;
@@ -126,6 +128,7 @@ namespace gbe
     uint32_t flagIndex:16;   //!< Only if virtual flag (index of the register)
     uint32_t execWidth:5;
     uint32_t quarterControl:1;
+    uint32_t nibControl:1;
     uint32_t accWrEnable:1;
     uint32_t noMask:1;
     uint32_t predicate:4;
@@ -192,6 +195,7 @@ namespace gbe
 
     /*! For immediates or virtual register */
     union {
+      double df;
       float f;
       int32_t d;
       uint32_t ud;
@@ -211,6 +215,31 @@ namespace gbe
     uint32_t quarter:1;      //!< To choose which part we want (Q1 / Q2)
     uint32_t address_mode:1; //!< direct or indirect
 
+    static INLINE GenRegister offset(GenRegister reg, int nr, int subnr = 0) {
+      GenRegister r = reg;
+      r.nr += nr;
+      r.subnr += subnr;
+      return r;
+    }
+
+    INLINE bool isimmdf(void) const {
+      if (type == GEN_TYPE_DF && file == GEN_IMMEDIATE_VALUE)
+        return true;
+      return false;
+    }
+
+    INLINE bool isdf(void) const {
+      if (type == GEN_TYPE_DF && file == GEN_GENERAL_REGISTER_FILE)
+        return true;
+      return false;
+    }
+
+    static INLINE GenRegister h2(GenRegister reg) {
+      GenRegister r = reg;
+      r.hstride = GEN_HORIZONTAL_STRIDE_2;
+      return r;
+    }
+
     static INLINE GenRegister QnVirtual(GenRegister reg, uint32_t quarter) {
       GBE_ASSERT(reg.physical == 0);
       if (reg.hstride == GEN_HORIZONTAL_STRIDE_0) // scalar register @@ -293,6 +322,18 @@ namespace gbe
       return reg;
     }
 
+    static INLINE GenRegister df16(uint32_t file, ir::Register reg) {
+      return retype(vec16(file, reg), GEN_TYPE_DF);
+    }
+
+    static INLINE GenRegister df8(uint32_t file, ir::Register reg) {
+      return retype(vec8(file, reg), GEN_TYPE_DF);
+    }
+
+    static INLINE GenRegister df1(uint32_t file, ir::Register reg) {
+      return retype(vec1(file, reg), GEN_TYPE_DF);
+    }
+
     static INLINE GenRegister ud16(uint32_t file, ir::Register reg) {
       return retype(vec16(file, reg), GEN_TYPE_UD);
     }
@@ -371,6 +412,12 @@ namespace gbe
                          GEN_HORIZONTAL_STRIDE_0);
     }
 
+    static INLINE GenRegister immdf(double df) {
+      GenRegister immediate = imm(GEN_TYPE_DF);
+      immediate.value.df = df;
+      return immediate;
+    }
+
     static INLINE GenRegister immf(float f) {
       GenRegister immediate = imm(GEN_TYPE_F);
       immediate.value.f = f;
@@ -448,6 +495,18 @@ namespace gbe
       return vec16(GEN_GENERAL_REGISTER_FILE, reg);
     }
 
+    static INLINE GenRegister df1grf(ir::Register reg) {
+      return df1(GEN_GENERAL_REGISTER_FILE, reg);
+    }
+
+    static INLINE GenRegister df8grf(ir::Register reg) {
+      return df8(GEN_GENERAL_REGISTER_FILE, reg);
+    }
+
+    static INLINE GenRegister df16grf(ir::Register reg) {
+      return df16(GEN_GENERAL_REGISTER_FILE, reg);
+    }
+
     static INLINE GenRegister ud16grf(ir::Register reg) {
       return ud16(GEN_GENERAL_REGISTER_FILE, reg);
     }
@@ -608,11 +667,37 @@ namespace gbe
                     GEN_HORIZONTAL_STRIDE_0);
     }
 
+    static INLINE int hstride_size(GenRegister reg) {
+      switch (reg.hstride) {
+        case GEN_HORIZONTAL_STRIDE_0: return 0;
+        case GEN_HORIZONTAL_STRIDE_1: return 1;
+        case GEN_HORIZONTAL_STRIDE_2: return 2;
+        case GEN_HORIZONTAL_STRIDE_4: return 4;
+        default: NOT_IMPLEMENTED; return 0;
+      }
+    }
+
     static INLINE GenRegister suboffset(GenRegister reg, uint32_t delta) {
-      reg.subnr += delta * typeSize(reg.type);
+      if (reg.hstride != GEN_HORIZONTAL_STRIDE_0) {
+        reg.subnr += delta * typeSize(reg.type);
+        reg.nr += reg.subnr / 32;
+        reg.subnr %= 32;
+      }
       return reg;
     }
 
+    static INLINE GenRegister df16(uint32_t file, uint32_t nr, uint32_t subnr) {
+      return retype(vec16(file, nr, subnr), GEN_TYPE_DF);
+    }
+
+    static INLINE GenRegister df8(uint32_t file, uint32_t nr, uint32_t subnr) {
+      return retype(vec8(file, nr, subnr), GEN_TYPE_DF);
+    }
+
+    static INLINE GenRegister df1(uint32_t file, uint32_t nr, uint32_t subnr) {
+      return retype(vec1(file, nr, subnr), GEN_TYPE_DF);
+    }
+
     static INLINE GenRegister ud16(uint32_t file, uint32_t nr, uint32_t subnr) {
       return retype(vec16(file, nr, subnr), GEN_TYPE_UD);
     }
@@ -685,6 +770,18 @@ namespace gbe
       return vec16(GEN_GENERAL_REGISTER_FILE, nr, subnr);
     }
 
+    static INLINE GenRegister df16grf(uint32_t nr, uint32_t subnr) {
+      return df16(GEN_GENERAL_REGISTER_FILE, nr, subnr);
+    }
+
+    static INLINE GenRegister df8grf(uint32_t nr, uint32_t subnr) {
+      return df8(GEN_GENERAL_REGISTER_FILE, nr, subnr);
+    }
+
+    static INLINE GenRegister df1grf(uint32_t nr, uint32_t subnr) {
+      return df1(GEN_GENERAL_REGISTER_FILE, nr, subnr);
+    }
+
     static INLINE GenRegister ud16grf(uint32_t nr, uint32_t subnr) {
       return ud16(GEN_GENERAL_REGISTER_FILE, nr, subnr);
     }
@@ -790,6 +887,7 @@ namespace gbe
         return SIMD1(values...); \
       } \
     }
+    DECL_REG_ENCODER(dfxgrf, df16grf, df8grf, df1grf);
     DECL_REG_ENCODER(fxgrf, f16grf, f8grf, f1grf);
     DECL_REG_ENCODER(uwxgrf, uw16grf, uw8grf, uw1grf);
     DECL_REG_ENCODER(udxgrf, ud16grf, ud8grf, ud1grf);
--
1.8.1.2

_______________________________________________
Beignet mailing list
Beignet at lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/beignet


More information about the Beignet mailing list