[Beignet] [PATCH 1/2] Support 64-bit float

Zhigang Gong zhigang.gong at linux.intel.com
Thu Jun 20 21:04:39 PDT 2013


Rong,

Thanks for the test report.

Homer,

This is a really good progress. And from my point of view, to support SIMD16
could be next step goal with lower priority.
But at this step, we still need to let it work with default environment
which means the OCL_SIMD_WIDTH is unset.
The best solution is to force the OCL_SIMD_WIDTH to 8 at the IR translation
stage at proper place.

For example, if the only instruction which doesn't support SIMD16 is
LoadStore instruction. Then you can do something as below:

+  extern int OCL_SIMD_WIDTH;
  template <bool isLoad, typename T>
  INLINE void GenWriter::emitLoadOrStore(T &I)
  {
    unsigned int llvmSpace = I.getPointerAddressSpace();
    Value *llvmPtr = I.getPointerOperand();
    Value *llvmValues = getLoadOrStoreValue(I);
    Type *llvmType = llvmValues->getType();
    const bool dwAligned = (I.getAlignment() % 4) == 0;
    const ir::AddressSpace addrSpace = addressSpaceLLVMToGen(llvmSpace);
    const ir::Register ptr = this->getRegister(llvmPtr);

    // Scalar is easy. We neednot build register tuples
    if (isScalarType(llvmType) == true) {
      const ir::Type type = getType(ctx, llvmType);
+      if (type == ir::TYPE_DOUBLE)
+        OCL_SIMD_WIDTH=8;
      const ir::Register values = this->getRegister(llvmValues);
      if (isLoad)
        ctx.LOAD(type, ptr, addrSpace, dwAligned, values);
      else
        ctx.STORE(type, ptr, addrSpace, dwAligned, values);
    }

If there are also other instructions doesn't support SIMD16, then you need
to do the same thing on all of them.

> -----Original Message-----
> From: beignet-bounces+zhigang.gong=linux.intel.com at lists.freedesktop.org
>
[mailto:beignet-bounces+zhigang.gong=linux.intel.com at lists.freedesktop.org]
> On Behalf Of Yang, Rong R
> Sent: Friday, June 21, 2013 11:25 AM
> To: Xing, Homer; beignet at lists.freedesktop.org
> Cc: Xing, Homer
> Subject: Re: [Beignet] [PATCH 1/2] Support 64-bit float
> 
> Hi, Homer,
> 
>  If not set OCL_SIMD_WIDTH, these tests all fail.
>  If set OCL_SIMD_WIDTH=8, all pass.
> 
> My system is Ubuntu, 32bit
> 
> -----Original Message-----
> From: beignet-bounces+rong.r.yang=intel.com at lists.freedesktop.org
> [mailto:beignet-bounces+rong.r.yang=intel.com at lists.freedesktop.org] On
> Behalf Of Homer Hsing
> Sent: Friday, June 21, 2013 10:44 AM
> To: beignet at lists.freedesktop.org
> Cc: Xing, Homer
> Subject: [Beignet] [PATCH 1/2] Support 64-bit float
> 
> support:
>   arithmetic(+ - *)
>   store load
>   immediate_value
>   if else
>   select
> 
> not support:
>   SIMD16
> 
> add "nib control" field in machine instruction format support "nib
control"
> fix "directly store after load". change hard coded store size (4) to
flexible size (4
> or 8)
> 
> example:
> 
> /* support arithmetic store load immediate_value */ kernel void f(global
double
> *src, global double *dst) {
>   int i = get_global_id(0);
>   double d = 1.234567890123456789;
>   dst[i] = d * (src[i] + d);
> }
> 
> /* support if else */
> kernel void f(global float *src, global double *dst) {
>   int i = get_global_id(0);
>   float d = 1.234567890123456789f;
>   if (i < 14)
>     dst[i] = d * (d + src[i]);
>   else
>     dst[i] = 14;
> }
> 
> /* support select */
> kernel void f(global float *src, global double *dst) {
>   int i = get_global_id(0);
>   float d = 1.234567890123456789f;
>   dst[i] = i < 14 ? d : 14;
> }
> 
> Signed-off-by: Homer Hsing <homer.xing at intel.com>
> ---
>  backend/src/backend/gen_context.cpp                |  17 ++
>  backend/src/backend/gen_context.hpp                |   2 +
>  backend/src/backend/gen_defs.hpp                   |  10 +-
>  backend/src/backend/gen_encoder.cpp                | 203
> ++++++++++++++++++++-
>  backend/src/backend/gen_encoder.hpp                |   6 +
>  .../src/backend/gen_insn_gen7_schedule_info.hxx    |   2 +
>  backend/src/backend/gen_insn_selection.cpp         | 148
> +++++++++++++--
>  backend/src/backend/gen_insn_selection.hxx         |   4 +
>  backend/src/backend/gen_reg_allocation.cpp         |   1 -
>  backend/src/backend/gen_register.hpp               | 102 ++++++++++-
>  10 files changed, 473 insertions(+), 22 deletions(-)
> 
> diff --git a/backend/src/backend/gen_context.cpp
> b/backend/src/backend/gen_context.cpp
> index 70c5bcf..53ba73c 100644
> --- a/backend/src/backend/gen_context.cpp
> +++ b/backend/src/backend/gen_context.cpp
> @@ -153,6 +153,8 @@ namespace gbe
>      const GenRegister src0 = ra->genReg(insn.src(0));
>      const GenRegister src1 = ra->genReg(insn.src(1));
>      switch (insn.opcode) {
> +      case SEL_OP_LOAD_DF_IMM: p->LOAD_DF_IMM(dst, src1,
> src0.value.df); break;
> +      case SEL_OP_MOV_DF: p->MOV_DF(dst, src0, src1); break;
>        case SEL_OP_SEL:  p->SEL(dst, src0, src1); break;
>        case SEL_OP_AND:  p->AND(dst, src0, src1); break;
>        case SEL_OP_OR:   p->OR (dst, src0, src1);  break;
> @@ -269,6 +271,14 @@ namespace gbe
>      p->pop();
>    }
> 
> +  void GenContext::emitReadFloat64Instruction(const SelectionInstruction
> &insn) {
> +    const GenRegister dst = ra->genReg(insn.dst(0));
> +    const GenRegister src = ra->genReg(insn.src(0));
> +    const uint32_t bti = insn.extra.function;
> +    const uint32_t elemNum = insn.extra.elem;
> +    p->READ_FLOAT64(dst, src, bti, elemNum);  }
> +
>    void GenContext::emitUntypedReadInstruction(const SelectionInstruction
> &insn) {
>      const GenRegister dst = ra->genReg(insn.dst(0));
>      const GenRegister src = ra->genReg(insn.src(0)); @@ -277,6 +287,13
> @@ namespace gbe
>      p->UNTYPED_READ(dst, src, bti, elemNum);
>    }
> 
> +  void GenContext::emitWriteFloat64Instruction(const SelectionInstruction
> &insn) {
> +    const GenRegister src = ra->genReg(insn.src(0));
> +    const uint32_t bti = insn.extra.function;
> +    const uint32_t elemNum = insn.extra.elem;
> +    p->WRITE_FLOAT64(src, bti, elemNum);  }
> +
>    void GenContext::emitUntypedWriteInstruction(const SelectionInstruction
> &insn) {
>      const GenRegister src = ra->genReg(insn.src(0));
>      const uint32_t bti = insn.extra.function; diff --git
> a/backend/src/backend/gen_context.hpp
> b/backend/src/backend/gen_context.hpp
> index 1566cbb..804384d 100644
> --- a/backend/src/backend/gen_context.hpp
> +++ b/backend/src/backend/gen_context.hpp
> @@ -87,6 +87,8 @@ namespace gbe
>      void emitBarrierInstruction(const SelectionInstruction &insn);
>      void emitFenceInstruction(const SelectionInstruction &insn);
>      void emitMathInstruction(const SelectionInstruction &insn);
> +    void emitReadFloat64Instruction(const SelectionInstruction &insn);
> +    void emitWriteFloat64Instruction(const SelectionInstruction &insn);
>      void emitUntypedReadInstruction(const SelectionInstruction &insn);
>      void emitUntypedWriteInstruction(const SelectionInstruction &insn);
>      void emitByteGatherInstruction(const SelectionInstruction &insn);
diff
> --git a/backend/src/backend/gen_defs.hpp
> b/backend/src/backend/gen_defs.hpp
> index f4e4938..9d8db5b 100644
> --- a/backend/src/backend/gen_defs.hpp
> +++ b/backend/src/backend/gen_defs.hpp
> @@ -215,6 +215,7 @@ enum GenMessageTarget {  #define GEN_TYPE_VF
> 5 /* packed float vector, immediates only? */  #define GEN_TYPE_HF  6
>  #define GEN_TYPE_V   6 /* packed int vector, immediates only, uword dest
> only */
> +#define GEN_TYPE_DF  6
>  #define GEN_TYPE_F   7
> 
>  #define GEN_ARF_NULL                  0x00
> @@ -303,6 +304,7 @@ enum GenMessageTarget {
>  #define GEN_BYTE_SCATTER_BYTE   0
>  #define GEN_BYTE_SCATTER_WORD   1
>  #define GEN_BYTE_SCATTER_DWORD  2
> +#define GEN_BYTE_SCATTER_QWORD  3
> 
>  #define GEN_SAMPLER_RETURN_FORMAT_FLOAT32     0
>  #define GEN_SAMPLER_RETURN_FORMAT_UINT32      2
> @@ -418,7 +420,7 @@ struct GenInstruction
>        uint32_t src0_reg_type:3;
>        uint32_t src1_reg_file:2;
>        uint32_t src1_reg_type:3;
> -      uint32_t pad:1;
> +      uint32_t nib_ctrl:1;
>        uint32_t dest_subreg_nr:5;
>        uint32_t dest_reg_nr:8;
>        uint32_t dest_horiz_stride:2;
> @@ -432,7 +434,7 @@ struct GenInstruction
>        uint32_t src0_reg_type:3;
>        uint32_t src1_reg_file:2;        /* 0x00000c00 */
>        uint32_t src1_reg_type:3;        /* 0x00007000 */
> -      uint32_t pad:1;
> +      uint32_t nib_ctrl:1;
>        int dest_indirect_offset:10;        /* offset against the deref'd
> address reg */
>        uint32_t dest_subreg_nr:3; /* subnr for the address reg a0.x */
>        uint32_t dest_horiz_stride:2;
> @@ -446,7 +448,7 @@ struct GenInstruction
>        uint32_t src0_reg_type:3;
>        uint32_t src1_reg_file:2;
>        uint32_t src1_reg_type:3;
> -      uint32_t pad:1;
> +      uint32_t nib_ctrl:1;
>        uint32_t dest_writemask:4;
>        uint32_t dest_subreg_nr:1;
>        uint32_t dest_reg_nr:8;
> @@ -459,7 +461,7 @@ struct GenInstruction
>        uint32_t dest_reg_type:3;
>        uint32_t src0_reg_file:2;
>        uint32_t src0_reg_type:3;
> -      uint32_t pad0:6;
> +      uint32_t nib_ctrl:1;
>        uint32_t dest_writemask:4;
>        int dest_indirect_offset:6;
>        uint32_t dest_subreg_nr:3;
> diff --git a/backend/src/backend/gen_encoder.cpp
> b/backend/src/backend/gen_encoder.cpp
> index 859a1b9..3d8afe8 100644
> --- a/backend/src/backend/gen_encoder.cpp
> +++ b/backend/src/backend/gen_encoder.cpp
> @@ -235,6 +235,7 @@ namespace gbe
>        NOT_IMPLEMENTED;
>      insn->header.acc_wr_control = this->curr.accWrEnable;
>      insn->header.quarter_control = this->curr.quarterControl;
> +    insn->bits1.ia1.nib_ctrl = this->curr.nibControl;
>      insn->header.mask_control = this->curr.noMask;
>      insn->bits2.ia1.flag_reg_nr = this->curr.flag;
>      insn->bits2.ia1.flag_sub_reg_nr = this->curr.subFlag; @@ -355,6
> +356,105 @@ namespace gbe
>      0
>    };
> 
> +  static int dst_type(int exec_width) {
> +    if (exec_width == 8)
> +      return GEN_TYPE_UD;
> +    if (exec_width == 16)
> +      return GEN_TYPE_UW;
> +    NOT_IMPLEMENTED;
> +    return 0;
> +  }
> +
> +  void GenEncoder::READ_FLOAT64(GenRegister dst, GenRegister src,
> uint32_t bti, uint32_t elemNum) {
> +    int w = curr.execWidth;
> +    dst = GenRegister::h2(dst);
> +    dst.type = GEN_TYPE_UD;
> +    src.type = GEN_TYPE_UD;
> +    GenRegister r = GenRegister::retype(GenRegister::suboffset(src, w*2),
> GEN_TYPE_UD);
> +    GenRegister imm4 = GenRegister::immud(4);
> +    GenInstruction *insn;
> +    insn = next(GEN_OPCODE_SEND);
> +    setHeader(insn);
> +    setDst(insn, GenRegister::uw16grf(r.nr, 0));
> +    setSrc0(insn, GenRegister::ud8grf(src.nr, 0));
> +    setSrc1(insn, GenRegister::immud(0));
> +    setDPUntypedRW(this, insn, bti, untypedRWMask[1],
> GEN_UNTYPED_READ, curr.execWidth / 8, curr.execWidth / 8);
> +    push();
> +    curr.quarterControl = 0;
> +    curr.nibControl = 0;
> +    MOV(dst, r);
> +    if (w == 8)
> +      curr.nibControl = 1;
> +    else
> +      curr.quarterControl = 1;
> +    MOV(GenRegister::suboffset(dst, w), GenRegister::suboffset(r, w /
2));
> +    pop();
> +    ADD(src, src, imm4);
> +    insn = next(GEN_OPCODE_SEND);
> +    setHeader(insn);
> +    setDst(insn, GenRegister::uw16grf(r.nr, 0));
> +    setSrc0(insn, GenRegister::ud8grf(src.nr, 0));
> +    setSrc1(insn, GenRegister::immud(0));
> +    setDPUntypedRW(this, insn, bti, untypedRWMask[1],
> GEN_UNTYPED_READ, curr.execWidth / 8, curr.execWidth / 8);
> +    push();
> +    curr.quarterControl = 0;
> +    curr.nibControl = 0;
> +    MOV(GenRegister::suboffset(dst, 1), r);
> +    if (w == 8)
> +      curr.nibControl = 1;
> +    else
> +      curr.quarterControl = 1;
> +    MOV(GenRegister::suboffset(dst, w + 1), GenRegister::suboffset(r, w /
> 2));
> +    pop();
> +  }
> +
> +  void GenEncoder::WRITE_FLOAT64(GenRegister msg, uint32_t bti, uint32_t
> elemNum) {
> +    int w = curr.execWidth;
> +    GenRegister r = GenRegister::retype(GenRegister::suboffset(msg, w*3),
> GEN_TYPE_UD);
> +    r.type = GEN_TYPE_UD;
> +    GenRegister hdr = GenRegister::h2(r);
> +    GenRegister src = GenRegister::ud16grf(msg.nr + w / 8, 0);
> +    src.hstride = GEN_HORIZONTAL_STRIDE_2;
> +    GenRegister data = GenRegister::offset(r, w / 8);
> +    GenRegister imm4 = GenRegister::immud(4);
> +    MOV(r, GenRegister::ud8grf(msg.nr, 0));
> +    push();
> +    curr.quarterControl = 0;
> +    curr.nibControl = 0;
> +    MOV(data, src);
> +    if (w == 8)
> +      curr.nibControl = 1;
> +    else
> +      curr.quarterControl = 1;
> +    MOV(GenRegister::suboffset(data, w / 2), GenRegister::suboffset(src,
> w));
> +    pop();
> +    GenInstruction *insn;
> +    insn = next(GEN_OPCODE_SEND);
> +    setHeader(insn);
> +    setDst(insn, GenRegister::retype(GenRegister::null(),
> dst_type(curr.execWidth)));
> +    setSrc0(insn, GenRegister::ud8grf(hdr.nr, 0));
> +    setSrc1(insn, GenRegister::immud(0));
> +    setDPUntypedRW(this, insn, bti, untypedRWMask[1],
> + GEN_UNTYPED_WRITE, curr.execWidth / 4, 0);
> +
> +    ADD(r, GenRegister::ud8grf(msg.nr, 0), imm4);
> +    push();
> +    curr.quarterControl = 0;
> +    curr.nibControl = 0;
> +    MOV(data, GenRegister::suboffset(src, 1));
> +    if (w == 8)
> +      curr.nibControl = 1;
> +    else
> +      curr.quarterControl = 1;
> +    MOV(GenRegister::suboffset(data, w / 2), GenRegister::suboffset(src,
w
> + 1));
> +    pop();
> +    insn = next(GEN_OPCODE_SEND);
> +    setHeader(insn);
> +    setDst(insn, GenRegister::retype(GenRegister::null(),
> dst_type(curr.execWidth)));
> +    setSrc0(insn, GenRegister::ud8grf(hdr.nr, 0));
> +    setSrc1(insn, GenRegister::immud(0));
> +    setDPUntypedRW(this, insn, bti, untypedRWMask[1],
> + GEN_UNTYPED_WRITE, curr.execWidth / 4, 0);  }
> +
>    void GenEncoder::UNTYPED_READ(GenRegister dst, GenRegister src,
> uint32_t bti, uint32_t elemNum) {
>      GenInstruction *insn = this->next(GEN_OPCODE_SEND);
>      assert(elemNum >= 1 || elemNum <= 4); @@ -467,7 +567,25 @@
> namespace gbe
>    }
> 
>    INLINE void alu1(GenEncoder *p, uint32_t opcode, GenRegister dst,
> GenRegister src) {
> -     if (needToSplitAlu1(p, dst, src) == false) {
> +     if (dst.isdf() && src.isdf()) {
> +       int w = p->curr.execWidth;
> +       p->push();
> +       p->curr.quarterControl = 0;
> +       p->curr.nibControl = 0;
> +       GenInstruction *insn = p->next(opcode);
> +       p->setHeader(insn);
> +       p->setDst(insn, dst);
> +       p->setSrc0(insn, src);
> +       if (w == 8)
> +         p->curr.nibControl = 1; // second 1/8 mask
> +       else // w == 16
> +         p->curr.quarterControl = 1; // second 1/4 mask
> +       insn = p->next(opcode);
> +       p->setHeader(insn);
> +       p->setDst(insn, GenRegister::suboffset(dst, w / 2));
> +       p->setSrc0(insn, GenRegister::suboffset(src, w / 2));
> +       p->pop();
> +     } else if (needToSplitAlu1(p, dst, src) == false) {
>         GenInstruction *insn = p->next(opcode);
>         p->setHeader(insn);
>         p->setDst(insn, dst);
> @@ -499,7 +617,27 @@ namespace gbe
>                     GenRegister src0,
>                     GenRegister src1)
>    {
> -    if (needToSplitAlu2(p, dst, src0, src1) == false) {
> +    if (dst.isdf() && src0.isdf() && src1.isdf()) {
> +       int w = p->curr.execWidth;
> +       p->push();
> +       p->curr.quarterControl = 0;
> +       p->curr.nibControl = 0;
> +       GenInstruction *insn = p->next(opcode);
> +       p->setHeader(insn);
> +       p->setDst(insn, dst);
> +       p->setSrc0(insn, src0);
> +       p->setSrc1(insn, src1);
> +       if (w == 8)
> +         p->curr.nibControl = 1; // second 1/8 mask
> +       else // w == 16
> +         p->curr.quarterControl = 1; // second 1/4 mask
> +       insn = p->next(opcode);
> +       p->setHeader(insn);
> +       p->setDst(insn, GenRegister::suboffset(dst, w / 2));
> +       p->setSrc0(insn, GenRegister::suboffset(src0, w / 2));
> +       p->setSrc1(insn, GenRegister::suboffset(src1, w / 2));
> +       p->pop();
> +    } else if (needToSplitAlu2(p, dst, src0, src1) == false) {
>         GenInstruction *insn = p->next(opcode);
>         p->setHeader(insn);
>         p->setDst(insn, dst);
> @@ -620,6 +758,67 @@ namespace gbe
>      alu3(this, GEN_OPCODE_##OP, dest, src0, src1, src2); \
>    }
> 
> +  void GenEncoder::LOAD_DF_IMM(GenRegister dest, GenRegister tmp,
> double value) {
> +    union { double d; unsigned u[2]; } u;
> +    u.d = value;
> +    GenRegister r = GenRegister::retype(tmp, GEN_TYPE_UD);
> +    push();
> +    curr.predicate = GEN_PREDICATE_NONE;
> +    curr.execWidth = 1;
> +    MOV(r, GenRegister::immud(u.u[1]));
> +    MOV(GenRegister::suboffset(r, 1), GenRegister::immud(u.u[0]));
> +    pop();
> +    r.type = GEN_TYPE_DF;
> +    r.vstride = GEN_VERTICAL_STRIDE_0;
> +    r.width = GEN_WIDTH_1;
> +    r.hstride = GEN_HORIZONTAL_STRIDE_0;
> +    push();
> +    MOV(dest, r);
> +    pop();
> +  }
> +
> +  void GenEncoder::MOV_DF(GenRegister dest, GenRegister src0,
> GenRegister r) {
> +    int w = curr.execWidth;
> +    if (src0.isdf()) {
> +      push();
> +      curr.execWidth = 16;
> +      MOV(dest, src0);
> +      if (w == 16) {
> +        curr.quarterControl = 1;
> +        MOV(GenRegister::QnPhysical(dest, w / 4),
> GenRegister::QnPhysical(src0, w / 4));
> +      }
> +      pop();
> +    } else {
> +      GenRegister r0 = GenRegister::h2(r);
> +      push();
> +      curr.execWidth = 8;
> +      curr.predicate = GEN_PREDICATE_NONE;
> +      MOV(r0, src0);
> +      MOV(GenRegister::suboffset(r0, 8), GenRegister::suboffset(src0,
4));
> +      curr.predicate = GEN_PREDICATE_NORMAL;
> +      curr.quarterControl = 0;
> +      curr.nibControl = 0;
> +      MOV(dest, r);
> +      curr.nibControl = 1;
> +      MOV(GenRegister::suboffset(dest, 4), GenRegister::suboffset(r, 8));
> +      pop();
> +      if (w == 16) {
> +        push();
> +        curr.execWidth = 8;
> +        curr.predicate = GEN_PREDICATE_NONE;
> +        MOV(r0, GenRegister::suboffset(src0, 8));
> +        MOV(GenRegister::suboffset(r0, 8), GenRegister::suboffset(src0,
> 12));
> +        curr.predicate = GEN_PREDICATE_NORMAL;
> +        curr.quarterControl = 1;
> +        curr.nibControl = 0;
> +        MOV(GenRegister::suboffset(dest, 8), r);
> +        curr.nibControl = 1;
> +        MOV(GenRegister::suboffset(dest, 12), GenRegister::suboffset(r,
> 8));
> +        pop();
> +      }
> +    }
> +  }
> +
>    ALU1(MOV)
>    ALU1(RNDZ)
>    ALU1(RNDE)
> diff --git a/backend/src/backend/gen_encoder.hpp
> b/backend/src/backend/gen_encoder.hpp
> index c98774f..1a5dcf9 100644
> --- a/backend/src/backend/gen_encoder.hpp
> +++ b/backend/src/backend/gen_encoder.hpp
> @@ -113,9 +113,11 @@ namespace gbe
>      ALU2(LINE)
>      ALU2(PLN)
>      ALU3(MAD)
> +    ALU2(MOV_DF);
>  #undef ALU1
>  #undef ALU2
>  #undef ALU3
> +    void LOAD_DF_IMM(GenRegister dest, GenRegister tmp, double value);
>      /*! Barrier message (to synchronize threads of a workgroup) */
>      void BARRIER(GenRegister src);
>      /*! Memory fence message (to order loads and stores between threads)
> */ @@ -132,6 +134,10 @@ namespace gbe
>      void NOP(void);
>      /*! Wait instruction (used for the barrier) */
>      void WAIT(void);
> +    /*! Read 64-bits float arrays */
> +    void READ_FLOAT64(GenRegister dst, GenRegister src, uint32_t bti,
> uint32_t elemNum);
> +    /*! Write 64-bits float arrays */
> +    void WRITE_FLOAT64(GenRegister src, uint32_t bti, uint32_t
> + elemNum);
>      /*! Untyped read (upto 4 channels) */
>      void UNTYPED_READ(GenRegister dst, GenRegister src, uint32_t bti,
> uint32_t elemNum);
>      /*! Untyped write (upto 4 channels) */ diff --git
> a/backend/src/backend/gen_insn_gen7_schedule_info.hxx
> b/backend/src/backend/gen_insn_gen7_schedule_info.hxx
> index 098d9ec..a3b4621 100644
> --- a/backend/src/backend/gen_insn_gen7_schedule_info.hxx
> +++ b/backend/src/backend/gen_insn_gen7_schedule_info.hxx
> @@ -12,6 +12,8 @@ DECL_GEN7_SCHEDULE(Wait,            20,
> 2,        2)
>  DECL_GEN7_SCHEDULE(Math,            20,        4,        2)
>  DECL_GEN7_SCHEDULE(Barrier,         80,        1,        1)
>  DECL_GEN7_SCHEDULE(Fence,           80,        1,        1)
> +DECL_GEN7_SCHEDULE(ReadFloat64,     80,        1,        1)
> +DECL_GEN7_SCHEDULE(WriteFloat64,    80,        1,        1)
>  DECL_GEN7_SCHEDULE(UntypedRead,     80,        1,        1)
>  DECL_GEN7_SCHEDULE(UntypedWrite,    80,        1,        1)
>  DECL_GEN7_SCHEDULE(ByteGather,      80,        1,        1)
> diff --git a/backend/src/backend/gen_insn_selection.cpp
> b/backend/src/backend/gen_insn_selection.cpp
> index 4e7cebd..5901419 100644
> --- a/backend/src/backend/gen_insn_selection.cpp
> +++ b/backend/src/backend/gen_insn_selection.cpp
> @@ -129,6 +129,7 @@ namespace gbe
>        case TYPE_S32: return GEN_TYPE_D;
>        case TYPE_U32: return GEN_TYPE_UD;
>        case TYPE_FLOAT: return GEN_TYPE_F;
> +      case TYPE_DOUBLE: return GEN_TYPE_DF;
>        default: NOT_SUPPORTED; return GEN_TYPE_F;
>      }
>    }
> @@ -166,11 +167,13 @@ namespace gbe
> 
>    bool SelectionInstruction::isRead(void) const {
>      return this->opcode == SEL_OP_UNTYPED_READ ||
> +           this->opcode == SEL_OP_READ_FLOAT64 ||
>             this->opcode == SEL_OP_BYTE_GATHER;
>    }
> 
>    bool SelectionInstruction::isWrite(void) const {
>      return this->opcode == SEL_OP_UNTYPED_WRITE ||
> +           this->opcode == SEL_OP_WRITE_FLOAT64 ||
>             this->opcode == SEL_OP_BYTE_SCATTER;
>    }
> 
> @@ -406,6 +409,8 @@ namespace gbe
>  #define ALU3(OP) \
>    INLINE void OP(Reg dst, Reg src0, Reg src1, Reg src2) {
ALU3(SEL_OP_##OP,
> dst, src0, src1, src2); }
>      ALU1(MOV)
> +    ALU2(MOV_DF)
> +    ALU2(LOAD_DF_IMM)
>      ALU1(RNDZ)
>      ALU1(RNDE)
>      ALU2(SEL)
> @@ -449,6 +454,10 @@ namespace gbe
>      void NOP(void);
>      /*! Wait instruction (used for the barrier) */
>      void WAIT(void);
> +    /*! Read 64 bits float array */
> +    void READ_FLOAT64(Reg addr, const GenRegister *dst, uint32_t
> elemNum, uint32_t bti);
> +    /*! Write 64 bits float array */
> +    void WRITE_FLOAT64(Reg addr, const GenRegister *src, uint32_t
> + elemNum, uint32_t bti);
>      /*! Untyped read (up to 4 elements) */
>      void UNTYPED_READ(Reg addr, const GenRegister *dst, uint32_t
> elemNum, uint32_t bti);
>      /*! Untyped write (up to 4 elements) */ @@ -610,20 +619,23 @@
> namespace gbe
> 
>    ir::Register Selection::Opaque::replaceDst(SelectionInstruction *insn,
> uint32_t regID) {
>      SelectionBlock *block = insn->parent;
> -    const uint32_t simdWidth = ctx.getSimdWidth();
> +    uint32_t simdWidth = ctx.getSimdWidth();
>      ir::Register tmp;
> +    ir::RegisterFamily f = file.get(insn->dst(regID).reg()).family;
> +    int genType = f == ir::FAMILY_QWORD ? GEN_TYPE_DF : GEN_TYPE_F;
> +    GenRegister gr;
> 
>      // This will append the temporary register in the instruction block
>      this->block = block;
> -    tmp = this->reg(ir::FAMILY_DWORD);
> +    tmp = this->reg(f);
> 
>      // Generate the MOV instruction and replace the register in the
> instruction
>      SelectionInstruction *mov = this->create(SEL_OP_MOV, 1, 1);
> -    mov->dst(0) = GenRegister::retype(insn->dst(regID), GEN_TYPE_F);
> +    mov->dst(0) = GenRegister::retype(insn->dst(regID), genType);
>      mov->state = GenInstructionState(simdWidth);
> -    insn->dst(regID) = mov->src(0) = GenRegister::fxgrf(simdWidth, tmp);
> +    gr = f == ir::FAMILY_QWORD ? GenRegister::dfxgrf(simdWidth, tmp) :
> GenRegister::fxgrf(simdWidth, tmp);
> +    insn->dst(regID) = mov->src(0) = gr;
>      insn->append(*mov);
> -
>      return tmp;
>    }
> 
> @@ -657,6 +669,7 @@ namespace gbe
>        case FAMILY_WORD: SEL_REG(uw16grf, uw8grf, uw1grf); break;
>        case FAMILY_BYTE: SEL_REG(ub16grf, ub8grf, ub1grf); break;
>        case FAMILY_DWORD: SEL_REG(f16grf, f8grf, f1grf); break;
> +      case FAMILY_QWORD: SEL_REG(df16grf, df8grf, df1grf); break;
>        default: NOT_SUPPORTED;
>      }
>      GBE_ASSERT(false);
> @@ -719,6 +732,33 @@ namespace gbe
>    void Selection::Opaque::NOP(void) { this->appendInsn(SEL_OP_NOP, 0, 0);
}
>    void Selection::Opaque::WAIT(void) { this->appendInsn(SEL_OP_WAIT, 0,
> 0); }
> 
> +  void Selection::Opaque::READ_FLOAT64(Reg addr,
> +                                       const GenRegister *dst,
> +                                       uint32_t elemNum,
> +                                       uint32_t bti)  {
> +    SelectionInstruction *insn = this->appendInsn(SEL_OP_READ_FLOAT64,
> elemNum, 1);
> +    SelectionVector *srcVector = this->appendVector();
> +    SelectionVector *dstVector = this->appendVector();
> +
> +    // Regular instruction to encode
> +    for (uint32_t elemID = 0; elemID < elemNum; ++elemID)
> +      insn->dst(elemID) = dst[elemID];
> +    insn->src(0) = addr;
> +    insn->extra.function = bti;
> +    insn->extra.elem = elemNum;
> +
> +    // Sends require contiguous allocation
> +    dstVector->regNum = elemNum;
> +    dstVector->isSrc = 0;
> +    dstVector->reg = &insn->dst(0);
> +
> +    // Source cannot be scalar (yet)
> +    srcVector->regNum = 1;
> +    srcVector->isSrc = 1;
> +    srcVector->reg = &insn->src(0);
> +  }
> +
>    void Selection::Opaque::UNTYPED_READ(Reg addr,
>                                         const GenRegister *dst,
>                                         uint32_t elemNum, @@
> -746,6 +786,27 @@ namespace gbe
>      srcVector->reg = &insn->src(0);
>    }
> 
> +  void Selection::Opaque::WRITE_FLOAT64(Reg addr,
> +                                        const GenRegister *src,
> +                                        uint32_t elemNum,
> +                                        uint32_t bti)  {
> +    SelectionInstruction *insn = this->appendInsn(SEL_OP_WRITE_FLOAT64,
> 0, elemNum+1);
> +    SelectionVector *vector = this->appendVector();
> +
> +    // Regular instruction to encode
> +    insn->src(0) = addr;
> +    for (uint32_t elemID = 0; elemID < elemNum; ++elemID)
> +      insn->src(elemID+1) = src[elemID];
> +    insn->extra.function = bti;
> +    insn->extra.elem = elemNum;
> +
> +    // Sends require contiguous allocation for the sources
> +    vector->regNum = elemNum+1;
> +    vector->reg = &insn->src(0);
> +    vector->isSrc = 1;
> +  }
> +
>    void Selection::Opaque::UNTYPED_WRITE(Reg addr,
>                                          const GenRegister *src,
>                                          uint32_t elemNum, @@
> -1092,6 +1153,15 @@ namespace gbe
>    // Implementation of all patterns
>
///////////////////////////////////////////////////////////////////////////
> 
> +  bool canGetRegisterFromImmediate(const ir::Instruction &insn) {
> +    using namespace ir;
> +    const auto &childInsn = cast<LoadImmInstruction>(insn);
> +    const auto &imm = childInsn.getImmediate();
> +    if(imm.type != TYPE_DOUBLE)
> +      return true;
> +    return false;
> +  }
> +
>    GenRegister getRegisterFromImmediate(ir::Immediate imm)
>    {
>      using namespace ir;
> @@ -1103,6 +1173,7 @@ namespace gbe
>        case TYPE_S16: return  GenRegister::immw(imm.data.s16);
>        case TYPE_U8:  return GenRegister::immuw(imm.data.u8);
>        case TYPE_S8:  return GenRegister::immw(imm.data.s8);
> +      case TYPE_DOUBLE: return GenRegister::immdf(imm.data.f64);
>        default: NOT_SUPPORTED; return GenRegister::immuw(0);
>      }
>    }
> @@ -1146,7 +1217,13 @@ namespace gbe
>        const GenRegister src = sel.selReg(insn.getSrc(0));
>        switch (opcode) {
>          case ir::OP_ABS: sel.MOV(dst, GenRegister::abs(src)); break;
> -        case ir::OP_MOV: sel.MOV(dst, src); break;
> +        case ir::OP_MOV:
> +          if (dst.isdf()) {
> +            ir::Register r = sel.reg(ir::RegisterFamily::FAMILY_QWORD);
> +            sel.MOV_DF(dst, src, sel.selReg(r));
> +          } else
> +            sel.MOV(dst, src);
> +          break;
>          case ir::OP_RNDD: sel.RNDD(dst, src); break;
>          case ir::OP_RNDE: sel.RNDE(dst, src); break;
>          case ir::OP_RNDU: sel.RNDU(dst, src); break; @@ -1225,14
> +1302,14 @@ namespace gbe
>        SelectionDAG *dag1 = dag.child[1];
> 
>        // Right source can always be an immediate
> -      if (OCL_OPTIMIZE_IMMEDIATE && dag1 != NULL &&
> dag1->insn.getOpcode() == OP_LOADI) {
> +      if (OCL_OPTIMIZE_IMMEDIATE && dag1 != NULL &&
> + dag1->insn.getOpcode() == OP_LOADI &&
> + canGetRegisterFromImmediate(dag1->insn)) {
>          const auto &childInsn = cast<LoadImmInstruction>(dag1->insn);
>          src0 = sel.selReg(insn.getSrc(0), type);
>          src1 = getRegisterFromImmediate(childInsn.getImmediate());
>          if (dag0) dag0->isRoot = 1;
>        }
>        // Left source cannot be immediate but it is OK if we can commute
> -      else if (OCL_OPTIMIZE_IMMEDIATE && dag0 != NULL &&
> insn.commutes() && dag0->insn.getOpcode() == OP_LOADI) {
> +      else if (OCL_OPTIMIZE_IMMEDIATE && dag0 != NULL &&
> + insn.commutes() && dag0->insn.getOpcode() == OP_LOADI &&
> + canGetRegisterFromImmediate(dag0->insn)) {
>          const auto &childInsn = cast<LoadImmInstruction>(dag0->insn);
>          src0 = sel.selReg(insn.getSrc(1), type);
>          src1 = getRegisterFromImmediate(childInsn.getImmediate());
> @@ -1268,7 +1345,7 @@ namespace gbe
>          case OP_SHR: sel.SHR(dst, src0, src1); break;
>          case OP_ASR: sel.ASR(dst, src0, src1); break;
>          case OP_MUL:
> -          if (type == TYPE_FLOAT)
> +          if (type == TYPE_FLOAT || type == TYPE_DOUBLE)
>              sel.MUL(dst, src0, src1);
>            else if (type == TYPE_U32 || type == TYPE_S32) {
>              sel.pop();
> @@ -1599,6 +1676,7 @@ namespace gbe
>          case TYPE_S16: sel.MOV(dst, GenRegister::immw(imm.data.s16));
> break;
>          case TYPE_U8:  sel.MOV(dst, GenRegister::immuw(imm.data.u8));
> break;
>          case TYPE_S8:  sel.MOV(dst, GenRegister::immw(imm.data.s8));
> break;
> +        case TYPE_DOUBLE: sel.LOAD_DF_IMM(dst,
> + GenRegister::immdf(imm.data.f64), sel.selReg(sel.reg(FAMILY_QWORD)));
> + break;
>          default: NOT_SUPPORTED;
>        }
>        sel.pop();
> @@ -1650,6 +1728,8 @@ namespace gbe
>    INLINE uint32_t getByteScatterGatherSize(ir::Type type) {
>      using namespace ir;
>      switch (type) {
> +      case TYPE_DOUBLE:
> +        return GEN_BYTE_SCATTER_QWORD;
>        case TYPE_FLOAT:
>        case TYPE_U32:
>        case TYPE_S32:
> @@ -1681,6 +1761,22 @@ namespace gbe
>        sel.UNTYPED_READ(addr, dst.data(), valueNum, bti);
>      }
> 
> +    void emitReadFloat64(Selection::Opaque &sel,
> +                         const ir::LoadInstruction &insn,
> +                         GenRegister addr,
> +                         uint32_t bti) const
> +    {
> +      using namespace ir;
> +      const uint32_t valueNum = insn.getValueNum();
> +      vector<GenRegister> dst(valueNum);
> +      for (uint32_t dstID = 0; dstID < valueNum; ++dstID)
> +        dst[dstID] =
GenRegister::retype(sel.selReg(insn.getValue(dstID)),
> GEN_TYPE_F);
> +      dst.push_back(sel.selReg(sel.reg(FAMILY_QWORD)));
> +      if (sel.ctx.getSimdWidth() == 16)
> +        dst.push_back(sel.selReg(sel.reg(FAMILY_QWORD)));
> +      sel.READ_FLOAT64(addr, dst.data(), dst.size(), bti);
> +    }
> +
>      void emitByteGather(Selection::Opaque &sel,
>                          const ir::LoadInstruction &insn,
>                          const uint32_t elemSize, @@ -1732,6 +1828,8
> @@ namespace gbe
>        const uint32_t elemSize = getByteScatterGatherSize(type);
>        if (insn.getAddressSpace() == MEM_CONSTANT)
>          this->emitIndirectMove(sel, insn, address);
> +      else if (insn.isAligned() == true && elemSize ==
> GEN_BYTE_SCATTER_QWORD)
> +        this->emitReadFloat64(sel, insn, address, space == MEM_LOCAL ?
> + 0xfe : 0x00);
>        else if (insn.isAligned() == true && elemSize ==
> GEN_BYTE_SCATTER_DWORD)
>          this->emitUntypedRead(sel, insn, address, space == MEM_LOCAL ?
> 0xfe : 0x00);
>        else {
> @@ -1762,6 +1860,25 @@ namespace gbe
>        sel.UNTYPED_WRITE(addr, value.data(), valueNum, bti);
>      }
> 
> +    void emitWriteFloat64(Selection::Opaque &sel,
> +                          const ir::StoreInstruction &insn,
> +                          uint32_t bti) const
> +    {
> +      using namespace ir;
> +      const uint32_t valueNum = insn.getValueNum();
> +      const uint32_t addrID = ir::StoreInstruction::addressIndex;
> +      GenRegister addr;
> +      vector<GenRegister> value(valueNum);
> +
> +      addr = GenRegister::retype(sel.selReg(insn.getSrc(addrID)),
> GEN_TYPE_F);
> +      for (uint32_t valueID = 0; valueID < valueNum; ++valueID)
> +        value[valueID] =
> GenRegister::retype(sel.selReg(insn.getValue(valueID)), GEN_TYPE_F);
> +      value.push_back(sel.selReg(sel.reg(FAMILY_QWORD)));
> +      if (sel.ctx.getSimdWidth() == 16)
> +        value.push_back(sel.selReg(sel.reg(FAMILY_QWORD)));
> +      sel.WRITE_FLOAT64(addr, value.data(), value.size(), bti);
> +    }
> +
>      void emitByteScatter(Selection::Opaque &sel,
>                           const ir::StoreInstruction &insn,
>                           const uint32_t elemSize, @@ -1791,7 +1908,9
> @@ namespace gbe
>        const uint32_t bti = space == MEM_LOCAL ? 0xfe : 0x01;
>        const Type type = insn.getValueType();
>        const uint32_t elemSize = getByteScatterGatherSize(type);
> -      if (insn.isAligned() == true && elemSize ==
> GEN_BYTE_SCATTER_DWORD)
> +      if (insn.isAligned() == true && elemSize ==
> GEN_BYTE_SCATTER_QWORD)
> +        this->emitWriteFloat64(sel, insn, bti);
> +      else if (insn.isAligned() == true && elemSize ==
> + GEN_BYTE_SCATTER_DWORD)
>          this->emitUntypedWrite(sel, insn, bti);
>        else {
>          const GenRegister address = sel.selReg(insn.getAddress()); @@
> -1839,7 +1958,7 @@ namespace gbe
>        SelectionDAG *dag1 = dag.child[1];
> 
>        // Right source can always be an immediate
> -      if (OCL_OPTIMIZE_IMMEDIATE && dag1 != NULL &&
> dag1->insn.getOpcode() == OP_LOADI) {
> +      if (OCL_OPTIMIZE_IMMEDIATE && dag1 != NULL &&
> + dag1->insn.getOpcode() == OP_LOADI &&
> + canGetRegisterFromImmediate(dag1->insn)) {
>          const auto &childInsn = cast<LoadImmInstruction>(dag1->insn);
>          src0 = sel.selReg(insn.getSrc(0), type);
>          src1 = getRegisterFromImmediate(childInsn.getImmediate());
> @@ -1873,7 +1992,7 @@ namespace gbe
>        const GenRegister src = sel.selReg(insn.getSrc(0), srcType);
> 
>        // We need two instructions to make the conversion
> -      if (dstFamily != FAMILY_DWORD && srcFamily == FAMILY_DWORD) {
> +      if (dstFamily != FAMILY_DWORD && dstFamily != FAMILY_QWORD &&
> + srcFamily == FAMILY_DWORD) {
>          GenRegister unpacked;
>          if (dstFamily == FAMILY_WORD) {
>            const uint32_t type = TYPE_U16 ? GEN_TYPE_UW : GEN_TYPE_W;
> @@ -1886,6 +2005,9 @@ namespace gbe
>          }
>          sel.MOV(unpacked, src);
>          sel.MOV(dst, unpacked);
> +      } else if (dst.isdf()) {
> +        ir::Register r = sel.reg(ir::RegisterFamily::FAMILY_QWORD);
> +        sel.MOV_DF(dst, src, sel.selReg(r));
>        } else
>          sel.MOV(dst, src);
>        return true;
> @@ -1919,7 +2041,7 @@ namespace gbe
>        SelectionDAG *dag2 = dag.child[2];
> 
>        // Right source can always be an immediate
> -      if (OCL_OPTIMIZE_IMMEDIATE && dag2 != NULL &&
> dag2->insn.getOpcode() == OP_LOADI) {
> +      if (OCL_OPTIMIZE_IMMEDIATE && dag2 != NULL &&
> + dag2->insn.getOpcode() == OP_LOADI &&
> + canGetRegisterFromImmediate(dag2->insn)) {
>          const auto &childInsn = cast<LoadImmInstruction>(dag2->insn);
>          src0 = sel.selReg(insn.getSrc(SelectInstruction::src0Index),
type);
>          src1 = getRegisterFromImmediate(childInsn.getImmediate());
> diff --git a/backend/src/backend/gen_insn_selection.hxx
> b/backend/src/backend/gen_insn_selection.hxx
> index 789c81c..4b5525b 100644
> --- a/backend/src/backend/gen_insn_selection.hxx
> +++ b/backend/src/backend/gen_insn_selection.hxx
> @@ -1,5 +1,7 @@
>  DECL_SELECTION_IR(LABEL, LabelInstruction)  DECL_SELECTION_IR(MOV,
> UnaryInstruction)
> +DECL_SELECTION_IR(MOV_DF, BinaryInstruction)
> +DECL_SELECTION_IR(LOAD_DF_IMM, BinaryInstruction)
>  DECL_SELECTION_IR(NOT, UnaryInstruction)  DECL_SELECTION_IR(LZD,
> UnaryInstruction)  DECL_SELECTION_IR(RNDZ, UnaryInstruction) @@ -32,6
> +34,8 @@ DECL_SELECTION_IR(BARRIER, BarrierInstruction)
> DECL_SELECTION_IR(FENCE, FenceInstruction)
> DECL_SELECTION_IR(UNTYPED_READ, UntypedReadInstruction)
> DECL_SELECTION_IR(UNTYPED_WRITE, UntypedWriteInstruction)
> +DECL_SELECTION_IR(READ_FLOAT64, ReadFloat64Instruction)
> +DECL_SELECTION_IR(WRITE_FLOAT64, WriteFloat64Instruction)
>  DECL_SELECTION_IR(BYTE_GATHER, ByteGatherInstruction)
> DECL_SELECTION_IR(BYTE_SCATTER, ByteScatterInstruction)
> DECL_SELECTION_IR(SAMPLE, SampleInstruction) diff --git
> a/backend/src/backend/gen_reg_allocation.cpp
> b/backend/src/backend/gen_reg_allocation.cpp
> index 9765b02..e7c96ac 100644
> --- a/backend/src/backend/gen_reg_allocation.cpp
> +++ b/backend/src/backend/gen_reg_allocation.cpp
> @@ -458,7 +458,6 @@ namespace gbe
>    }
> 
>    bool GenRegAllocator::Opaque::allocateGRFs(Selection &selection) {
> -
>      // Perform the linear scan allocator
>      const uint32_t regNum = ctx.sel->getRegNum();
>      for (uint32_t startID = 0; startID < regNum; ++startID) { diff --git
> a/backend/src/backend/gen_register.hpp
> b/backend/src/backend/gen_register.hpp
> index d772b0d..fedb743 100644
> --- a/backend/src/backend/gen_register.hpp
> +++ b/backend/src/backend/gen_register.hpp
> @@ -69,11 +69,12 @@ namespace gbe
>    /*! Type size in bytes for each Gen type */
>    INLINE int typeSize(uint32_t type) {
>      switch(type) {
> +      case GEN_TYPE_DF:
> +        return 8;
>        case GEN_TYPE_UD:
>        case GEN_TYPE_D:
>        case GEN_TYPE_F:
>          return 4;
> -      case GEN_TYPE_HF:
>        case GEN_TYPE_UW:
>        case GEN_TYPE_W:
>          return 2;
> @@ -110,6 +111,7 @@ namespace gbe
>      INLINE GenInstructionState(uint32_t simdWidth = 8) {
>        this->execWidth = simdWidth;
>        this->quarterControl = GEN_COMPRESSION_Q1;
> +      this->nibControl = 0;
>        this->accWrEnable = 0;
>        this->noMask = 0;
>        this->flag = 0;
> @@ -126,6 +128,7 @@ namespace gbe
>      uint32_t flagIndex:16;   //!< Only if virtual flag (index of the
register)
>      uint32_t execWidth:5;
>      uint32_t quarterControl:1;
> +    uint32_t nibControl:1;
>      uint32_t accWrEnable:1;
>      uint32_t noMask:1;
>      uint32_t predicate:4;
> @@ -192,6 +195,7 @@ namespace gbe
> 
>      /*! For immediates or virtual register */
>      union {
> +      double df;
>        float f;
>        int32_t d;
>        uint32_t ud;
> @@ -211,6 +215,31 @@ namespace gbe
>      uint32_t quarter:1;      //!< To choose which part we want (Q1 / Q2)
>      uint32_t address_mode:1; //!< direct or indirect
> 
> +    static INLINE GenRegister offset(GenRegister reg, int nr, int subnr =
0) {
> +      GenRegister r = reg;
> +      r.nr += nr;
> +      r.subnr += subnr;
> +      return r;
> +    }
> +
> +    INLINE bool isimmdf(void) const {
> +      if (type == GEN_TYPE_DF && file == GEN_IMMEDIATE_VALUE)
> +        return true;
> +      return false;
> +    }
> +
> +    INLINE bool isdf(void) const {
> +      if (type == GEN_TYPE_DF && file == GEN_GENERAL_REGISTER_FILE)
> +        return true;
> +      return false;
> +    }
> +
> +    static INLINE GenRegister h2(GenRegister reg) {
> +      GenRegister r = reg;
> +      r.hstride = GEN_HORIZONTAL_STRIDE_2;
> +      return r;
> +    }
> +
>      static INLINE GenRegister QnVirtual(GenRegister reg, uint32_t
quarter) {
>        GBE_ASSERT(reg.physical == 0);
>        if (reg.hstride == GEN_HORIZONTAL_STRIDE_0) // scalar register @@
> -293,6 +322,18 @@ namespace gbe
>        return reg;
>      }
> 
> +    static INLINE GenRegister df16(uint32_t file, ir::Register reg) {
> +      return retype(vec16(file, reg), GEN_TYPE_DF);
> +    }
> +
> +    static INLINE GenRegister df8(uint32_t file, ir::Register reg) {
> +      return retype(vec8(file, reg), GEN_TYPE_DF);
> +    }
> +
> +    static INLINE GenRegister df1(uint32_t file, ir::Register reg) {
> +      return retype(vec1(file, reg), GEN_TYPE_DF);
> +    }
> +
>      static INLINE GenRegister ud16(uint32_t file, ir::Register reg) {
>        return retype(vec16(file, reg), GEN_TYPE_UD);
>      }
> @@ -371,6 +412,12 @@ namespace gbe
>                           GEN_HORIZONTAL_STRIDE_0);
>      }
> 
> +    static INLINE GenRegister immdf(double df) {
> +      GenRegister immediate = imm(GEN_TYPE_DF);
> +      immediate.value.df = df;
> +      return immediate;
> +    }
> +
>      static INLINE GenRegister immf(float f) {
>        GenRegister immediate = imm(GEN_TYPE_F);
>        immediate.value.f = f;
> @@ -448,6 +495,18 @@ namespace gbe
>        return vec16(GEN_GENERAL_REGISTER_FILE, reg);
>      }
> 
> +    static INLINE GenRegister df1grf(ir::Register reg) {
> +      return df1(GEN_GENERAL_REGISTER_FILE, reg);
> +    }
> +
> +    static INLINE GenRegister df8grf(ir::Register reg) {
> +      return df8(GEN_GENERAL_REGISTER_FILE, reg);
> +    }
> +
> +    static INLINE GenRegister df16grf(ir::Register reg) {
> +      return df16(GEN_GENERAL_REGISTER_FILE, reg);
> +    }
> +
>      static INLINE GenRegister ud16grf(ir::Register reg) {
>        return ud16(GEN_GENERAL_REGISTER_FILE, reg);
>      }
> @@ -608,11 +667,37 @@ namespace gbe
>                      GEN_HORIZONTAL_STRIDE_0);
>      }
> 
> +    static INLINE int hstride_size(GenRegister reg) {
> +      switch (reg.hstride) {
> +        case GEN_HORIZONTAL_STRIDE_0: return 0;
> +        case GEN_HORIZONTAL_STRIDE_1: return 1;
> +        case GEN_HORIZONTAL_STRIDE_2: return 2;
> +        case GEN_HORIZONTAL_STRIDE_4: return 4;
> +        default: NOT_IMPLEMENTED; return 0;
> +      }
> +    }
> +
>      static INLINE GenRegister suboffset(GenRegister reg, uint32_t delta)
{
> -      reg.subnr += delta * typeSize(reg.type);
> +      if (reg.hstride != GEN_HORIZONTAL_STRIDE_0) {
> +        reg.subnr += delta * typeSize(reg.type);
> +        reg.nr += reg.subnr / 32;
> +        reg.subnr %= 32;
> +      }
>        return reg;
>      }
> 
> +    static INLINE GenRegister df16(uint32_t file, uint32_t nr, uint32_t
subnr)
> {
> +      return retype(vec16(file, nr, subnr), GEN_TYPE_DF);
> +    }
> +
> +    static INLINE GenRegister df8(uint32_t file, uint32_t nr, uint32_t
subnr) {
> +      return retype(vec8(file, nr, subnr), GEN_TYPE_DF);
> +    }
> +
> +    static INLINE GenRegister df1(uint32_t file, uint32_t nr, uint32_t
subnr) {
> +      return retype(vec1(file, nr, subnr), GEN_TYPE_DF);
> +    }
> +
>      static INLINE GenRegister ud16(uint32_t file, uint32_t nr, uint32_t
subnr)
> {
>        return retype(vec16(file, nr, subnr), GEN_TYPE_UD);
>      }
> @@ -685,6 +770,18 @@ namespace gbe
>        return vec16(GEN_GENERAL_REGISTER_FILE, nr, subnr);
>      }
> 
> +    static INLINE GenRegister df16grf(uint32_t nr, uint32_t subnr) {
> +      return df16(GEN_GENERAL_REGISTER_FILE, nr, subnr);
> +    }
> +
> +    static INLINE GenRegister df8grf(uint32_t nr, uint32_t subnr) {
> +      return df8(GEN_GENERAL_REGISTER_FILE, nr, subnr);
> +    }
> +
> +    static INLINE GenRegister df1grf(uint32_t nr, uint32_t subnr) {
> +      return df1(GEN_GENERAL_REGISTER_FILE, nr, subnr);
> +    }
> +
>      static INLINE GenRegister ud16grf(uint32_t nr, uint32_t subnr) {
>        return ud16(GEN_GENERAL_REGISTER_FILE, nr, subnr);
>      }
> @@ -790,6 +887,7 @@ namespace gbe
>          return SIMD1(values...); \
>        } \
>      }
> +    DECL_REG_ENCODER(dfxgrf, df16grf, df8grf, df1grf);
>      DECL_REG_ENCODER(fxgrf, f16grf, f8grf, f1grf);
>      DECL_REG_ENCODER(uwxgrf, uw16grf, uw8grf, uw1grf);
>      DECL_REG_ENCODER(udxgrf, ud16grf, ud8grf, ud1grf);
> --
> 1.8.1.2
> 
> _______________________________________________
> Beignet mailing list
> Beignet at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/beignet
> _______________________________________________
> Beignet mailing list
> Beignet at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/beignet



More information about the Beignet mailing list