[Beignet] [PATCH 1/2] Support 64-bit float

Thu Jun 6 01:07:56 PDT 2013

I will refine the commit log, and check the problems.

If you have better idea about how to implement 64-bit float function please tell me. Thanks.

-----Original Message-----
From: Zhigang Gong [mailto:zhigang.gong at linux.intel.com] 
Sent: Thursday, June 06, 2013 3:59 PM
To: Xing, Homer; beignet at lists.freedesktop.org
Subject: RE: [Beignet] [PATCH 1/2] Support 64-bit float

The commit log only says it support 64-bit float immediate? Is it the only goal of this commit? I'm afraid not, could you refine the log to introduce this commit more accurate?

And I still haven't review the whole commit. I just tried to run the test case on my machine (32 bit with llvm 3.3), And I found the following
problems:

By default, it ran into one failure:
compiler_menger_sponge_no_shadow:
  compiler_menger_sponge_no_shadow()    [FAILED]
    Error: image mismatch
  at file /home/gong/git/fdo/beignet/utests/compiler_shader_toy.cpp,
function run_kernel, line 63

By set the OCL_SIMD_WIDTH=8, it got another failure:
compiler_double:
utest_run:
/home/gong/git/fdo/beignet/backend/src/backend/gen_encoder.cpp:250: void gbe::GenEncoder::setDst(GenInstruction*, gbe::GenRegister): Assertion `dest.nr < 128' failed.
Aborted (core dumped)
But it seems the first failure get passed with SIMD8 mode.

Could you look at the above problems?

> -----Original Message-----
> From: 
> beignet-bounces+zhigang.gong=linux.intel.com at lists.freedesktop.org
>
[mailto:beignet-bounces+zhigang.gong=linux.intel.com at lists.freedesktop.org]
> On Behalf Of Homer Hsing
> Sent: Thursday, June 06, 2013 2:16 PM
> To: beignet at lists.freedesktop.org
> Cc: Homer Hsing
> Subject: [Beignet] [PATCH 1/2] Support 64-bit float
> 
> support 64-bit float immediate
> 
> example:
> 
>   kernel void f(global double *src, global double *dst) {
>     int i = get_global_id(0);
>     double d = 1.234567890123456789;
>     dst[i] = d * (src[i] + d);
>   }
> 
> Signed-off-by: Homer Hsing <homer.xing at intel.com>
> ---
>  backend/src/backend/gen_context.cpp        |  16 ++++-
>  backend/src/backend/gen_defs.hpp           |   1 +
>  backend/src/backend/gen_encoder.cpp        | 107
> +++++++++++++++++++++++++++--
>  backend/src/backend/gen_encoder.hpp        |   6 +-
>  backend/src/backend/gen_insn_selection.cpp |  46 +++++++++----
>  backend/src/backend/gen_insn_selection.hpp |   4 ++
>  backend/src/backend/gen_reg_allocation.cpp |  10 ++-
>  backend/src/backend/gen_register.hpp       |  80
> +++++++++++++++++++++
>  backend/src/ir/profile.cpp                 |   2 +
>  backend/src/ir/profile.hpp                 |   3 +-
>  10 files changed, 249 insertions(+), 26 deletions(-)
> 
> diff --git a/backend/src/backend/gen_context.cpp
> b/backend/src/backend/gen_context.cpp
> index 055c8fc..e6080b2 100644
> --- a/backend/src/backend/gen_context.cpp
> +++ b/backend/src/backend/gen_context.cpp
> @@ -138,7 +138,15 @@ namespace gbe
>      const GenRegister dst = ra->genReg(insn.dst(0));
>      const GenRegister src = ra->genReg(insn.src(0));
>      switch (insn.opcode) {
> -      case SEL_OP_MOV: p->MOV(dst, src); break;
> +      case SEL_OP_MOV:
> +        if (dst.isdf() && !src.isdf()) {
> +          bool doubleio = true;
> +          const GenRegister r =
> ra->genReg(GenRegister::f16grf(ir::ocl::doubleio));
> +          p->MOV(dst, src, doubleio, r);
> +        } else {
> +          p->MOV(dst, src);
> +        }
> +        break;
>        case SEL_OP_NOT: p->NOT(dst, src); break;
>        case SEL_OP_RNDD: p->RNDD(dst, src); break;
>        case SEL_OP_RNDU: p->RNDU(dst, src); break; @@ -263,14 +271,16 
> @@ namespace gbe
>      const GenRegister src = ra->genReg(insn.src(0));
>      const uint32_t bti = insn.extra.function;
>      const uint32_t elemNum = insn.extra.elem;
> -    p->UNTYPED_READ(dst, src, bti, elemNum);
> +    const GenRegister r =
> ra->genReg(GenRegister::ud16grf(ir::ocl::doubleio));
> +    p->UNTYPED_READ(dst, src, bti, elemNum, insn.doubleio, r);
>    }
> 
>    void GenContext::emitUntypedWriteInstruction(const 
> SelectionInstruction
> &insn) {
>      const GenRegister src = ra->genReg(insn.src(0));
>      const uint32_t bti = insn.extra.function;
>      const uint32_t elemNum = insn.extra.elem;
> -    p->UNTYPED_WRITE(src, bti, elemNum);
> +    const GenRegister r =
> ra->genReg(GenRegister::ud16grf(ir::ocl::doubleio));
> +    p->UNTYPED_WRITE(src, bti, elemNum, insn.doubleio, r);
>    }
> 
>    void GenContext::emitByteGatherInstruction(const 
> SelectionInstruction
> &insn) { diff --git a/backend/src/backend/gen_defs.hpp
> b/backend/src/backend/gen_defs.hpp
> index c7a1581..63f98f5 100644
> --- a/backend/src/backend/gen_defs.hpp
> +++ b/backend/src/backend/gen_defs.hpp
> @@ -215,6 +215,7 @@ enum GenMessageTarget {  #define GEN_TYPE_VF
> 5 /* packed float vector, immediates only? */  #define GEN_TYPE_HF  6
>  #define GEN_TYPE_V   6 /* packed int vector, immediates only, uword dest
> only */
> +#define GEN_TYPE_DF  6
>  #define GEN_TYPE_F   7
> 
>  #define GEN_ARF_NULL                  0x00
> diff --git a/backend/src/backend/gen_encoder.cpp
> b/backend/src/backend/gen_encoder.cpp
> index b65cc94..db08e90 100644
> --- a/backend/src/backend/gen_encoder.cpp
> +++ b/backend/src/backend/gen_encoder.cpp
> @@ -355,7 +355,29 @@ namespace gbe
>      0
>    };
> 
> -  void GenEncoder::UNTYPED_READ(GenRegister dst, GenRegister src, 
> uint32_t bti, uint32_t elemNum) {
> +  void GenEncoder::UNTYPED_READ(GenRegister dst, GenRegister src,
> uint32_t bti, uint32_t elemNum, bool doubleio, GenRegister r) {
> +    if (doubleio) {
> +      GenRegister hdr = GenRegister::h2(r);
> +      GenRegister imm4 = GenRegister::immud(4);
> +      push();
> +        curr.execWidth = 8;
> +        MOV(hdr,
> GenRegister::ud8grf(src.nr, 0));
> +        ADD(GenRegister::offset(hdr, 0, 4), hdr, imm4);
> +        MOV(GenRegister::offset(hdr, 1),    GenRegister::ud8grf(src.nr,
> 4));
> +        ADD(GenRegister::offset(hdr, 1, 4), GenRegister::offset(hdr, 
> + 1),
> imm4);
> +      pop();
> +      UNTYPED_READ(dst, hdr, bti, elemNum, false, r);
> +
> +      push();
> +        curr.execWidth = 8;
> +        MOV(hdr,
> GenRegister::ud8grf(src.nr + 1, 0));
> +        ADD(GenRegister::offset(hdr, 0, 4), hdr, imm4);
> +        MOV(GenRegister::offset(hdr, 1),    GenRegister::ud8grf(src.nr +
> 1, 4));
> +        ADD(GenRegister::offset(hdr, 1, 4), GenRegister::offset(hdr, 
> + 1),
> imm4);
> +      pop();
> +      UNTYPED_READ(GenRegister::offset(dst, 2), hdr, bti, elemNum, 
> + false,
> r);
> +      return;
> +    }
>      GenInstruction *insn = this->next(GEN_OPCODE_SEND);
>      assert(elemNum >= 1 || elemNum <= 4);
>      uint32_t msg_length = 0;
> @@ -382,7 +404,32 @@ namespace gbe
>                     response_length);
>    }
> 
> -  void GenEncoder::UNTYPED_WRITE(GenRegister msg, uint32_t bti, 
> uint32_t elemNum) {
> +  void GenEncoder::UNTYPED_WRITE(GenRegister msg, uint32_t bti,
> uint32_t elemNum, bool doubleio, GenRegister r) {
> +    if (doubleio) {
> +      GenRegister hdr = GenRegister::h2(r);
> +      GenRegister data = GenRegister::offset(r, 2);
> +      GenRegister imm4 = GenRegister::immud(4);
> +      push();
> +        curr.execWidth = 8;
> +        MOV(hdr,
> GenRegister::ud8grf(msg.nr, 0));
> +        ADD(GenRegister::offset(hdr, 0, 4), hdr, imm4);
> +        MOV(GenRegister::offset(hdr, 1),    GenRegister::ud8grf(msg.nr,
> 4));
> +        ADD(GenRegister::offset(hdr, 1, 4), GenRegister::offset(hdr, 
> + 1),
> imm4);
> +      pop();
> +      MOV(data, GenRegister::ud16grf(msg.nr+2, 0));
> +      UNTYPED_WRITE(hdr, bti, elemNum, false, r);
> +
> +      push();
> +        curr.execWidth = 8;
> +        MOV(hdr,
> GenRegister::ud8grf(msg.nr+1, 0));
> +        ADD(GenRegister::offset(hdr, 0, 4), hdr, imm4);
> +        MOV(GenRegister::offset(hdr, 1),
> GenRegister::ud8grf(msg.nr+1, 4));
> +        ADD(GenRegister::offset(hdr, 1, 4), GenRegister::offset(hdr, 
> + 1),
> imm4);
> +      pop();
> +      MOV(data, GenRegister::ud16grf(msg.nr+4, 0));
> +      UNTYPED_WRITE(hdr, bti, elemNum, false, r);
> +      return;
> +    }
>      GenInstruction *insn = this->next(GEN_OPCODE_SEND);
>      assert(elemNum >= 1 || elemNum <= 4);
>      uint32_t msg_length = 0;
> @@ -467,7 +514,17 @@ namespace gbe
>    }
> 
>    INLINE void alu1(GenEncoder *p, uint32_t opcode, GenRegister dst, 
> GenRegister src) {
> -     if (needToSplitAlu1(p, dst, src) == false) {
> +     if (opcode != GEN_OPCODE_MOV && dst.isdf() && src.isdf()) {
> +       GenInstruction *insn = p->next(opcode);
> +       p->setHeader(insn);
> +       p->setDst(insn, dst);
> +       p->setSrc0(insn, src);
> +       int w = p->curr.execWidth / 4;
> +       insn = p->next(opcode);
> +       p->setHeader(insn);
> +       p->setDst(insn, GenRegister::QnPhysical(dst, w));
> +       p->setSrc0(insn, GenRegister::QnPhysical(src, w));
> +     } else if (needToSplitAlu1(p, dst, src) == false) {
>         GenInstruction *insn = p->next(opcode);
>         p->setHeader(insn);
>         p->setDst(insn, dst);
> @@ -499,7 +556,19 @@ namespace gbe
>                     GenRegister src0,
>                     GenRegister src1)
>    {
> -    if (needToSplitAlu2(p, dst, src0, src1) == false) {
> +    if (dst.isdf() && src0.isdf() && src1.isdf()) {
> +       GenInstruction *insn = p->next(opcode);
> +       p->setHeader(insn);
> +       p->setDst(insn, dst);
> +       p->setSrc0(insn, src0);
> +       p->setSrc1(insn, src1);
> +       int w = p->curr.execWidth / 4;
> +       insn = p->next(opcode);
> +       p->setHeader(insn);
> +       p->setDst(insn, GenRegister::QnPhysical(dst, w));
> +       p->setSrc0(insn, GenRegister::QnPhysical(src0, w));
> +       p->setSrc1(insn, GenRegister::QnPhysical(src1, w));
> +    } else if (needToSplitAlu2(p, dst, src0, src1) == false) {
>         GenInstruction *insn = p->next(opcode);
>         p->setHeader(insn);
>         p->setDst(insn, dst);
> @@ -620,7 +689,35 @@ namespace gbe
>      alu3(this, GEN_OPCODE_##OP, dest, src0, src1, src2); \
>    }
> 
> -  ALU1(MOV)
> +  void GenEncoder::MOV(GenRegister dest, GenRegister src0, bool 
> + doubleio,
> GenRegister r) {
> +    if (doubleio) {
> +      GenRegister r0 = GenRegister::h2(r);
> +      push();
> +        curr.execWidth = 8;
> +        if(src0.isimmdf()) {
> +          union { double d; unsigned u[2]; } u;
> +          u.d = src0.value.df;
> +          r0 = GenRegister::retype(r0, GEN_TYPE_UD);
> +          GenRegister imm0 = GenRegister::immud(u.u[0]);
> +          GenRegister imm1 = GenRegister::immud(u.u[1]);
> +          MOV(r0, imm0);
> +          MOV(GenRegister::suboffset(r0, 1), imm1);
> +          MOV(GenRegister::offset(r0, 1), imm0);
> +          MOV(GenRegister::suboffset(GenRegister::offset(r0, 1), 1),
> imm1);
> +        } else {
> +          MOV(r0, src0);
> +          MOV(GenRegister::offset(r0, 1), GenRegister::offset(src0, 
> + 0,
> 16));
> +        }
> +      pop();
> +      if(src0.isimmdf())
> +        r = GenRegister::retype(r, GEN_TYPE_DF);
> +      MOV(dest, r);
> +      MOV(GenRegister::offset(dest, 2), r);
> +    } else {
> +      alu1(this, GEN_OPCODE_MOV, dest, src0);
> +    }
> +  }
> +
>    ALU1(RNDZ)
>    ALU1(RNDE)
>    ALU1(RNDD)
> diff --git a/backend/src/backend/gen_encoder.hpp
> b/backend/src/backend/gen_encoder.hpp
> index 83d83d2..3dc55b6 100644
> --- a/backend/src/backend/gen_encoder.hpp
> +++ b/backend/src/backend/gen_encoder.hpp
> @@ -89,7 +89,7 @@ namespace gbe
>  #define ALU1(OP) void OP(GenRegister dest, GenRegister src0);  
> #define
> ALU2(OP) void OP(GenRegister dest, GenRegister src0, GenRegister 
> src1); #define ALU3(OP) void OP(GenRegister dest, GenRegister src0, 
> GenRegister src1, GenRegister src2);
> -    ALU1(MOV)
> +    void MOV(GenRegister dest, GenRegister src0, bool doubleio = 
> + false, GenRegister r = GenRegister());
>      ALU1(RNDZ)
>      ALU1(RNDE)
>      ALU1(RNDD)
> @@ -131,9 +131,9 @@ namespace gbe
>      /*! Wait instruction (used for the barrier) */
>      void WAIT(void);
>      /*! Untyped read (upto 4 channels) */
> -    void UNTYPED_READ(GenRegister dst, GenRegister src, uint32_t bti,
> uint32_t elemNum);
> +    void UNTYPED_READ(GenRegister dst, GenRegister src, uint32_t bti, 
> + uint32_t elemNum, bool doubleio, GenRegister r);
>      /*! Untyped write (upto 4 channels) */
> -    void UNTYPED_WRITE(GenRegister src, uint32_t bti, uint32_t elemNum);
> +    void UNTYPED_WRITE(GenRegister src, uint32_t bti, uint32_t 
> + elemNum, bool doubleio, GenRegister r);
>      /*! Byte gather (for unaligned bytes, shorts and ints) */
>      void BYTE_GATHER(GenRegister dst, GenRegister src, uint32_t bti, 
> uint32_t elemSize);
>      /*! Byte scatter (for unaligned bytes, shorts and ints) */ diff 
> --git a/backend/src/backend/gen_insn_selection.cpp
> b/backend/src/backend/gen_insn_selection.cpp
> index 88f9e94..6a845cf 100644
> --- a/backend/src/backend/gen_insn_selection.cpp
> +++ b/backend/src/backend/gen_insn_selection.cpp
> @@ -129,6 +129,7 @@ namespace gbe
>        case TYPE_S32: return GEN_TYPE_D;
>        case TYPE_U32: return GEN_TYPE_UD;
>        case TYPE_FLOAT: return GEN_TYPE_F;
> +      case TYPE_DOUBLE: return GEN_TYPE_DF;
>        default: NOT_SUPPORTED; return GEN_TYPE_F;
>      }
>    }
> @@ -151,7 +152,7 @@ namespace gbe
>
///////////////////////////////////////////////////////////////////////////
> 
>    SelectionInstruction::SelectionInstruction(SelectionOpcode op, 
> uint32_t
dst,
> uint32_t src) :
> -    parent(NULL), opcode(op), dstNum(dst), srcNum(src)
> +    parent(NULL), opcode(op), dstNum(dst), srcNum(src), 
> + doubleio(false)
>    {}
> 
>    void SelectionInstruction::prepend(SelectionInstruction &other) { 
> @@
> -448,9 +449,9 @@ namespace gbe
>      /*! Wait instruction (used for the barrier) */
>      void WAIT(void);
>      /*! Untyped read (up to 4 elements) */
> -    void UNTYPED_READ(Reg addr, const GenRegister *dst, uint32_t
> elemNum, uint32_t bti);
> +    void UNTYPED_READ(Reg addr, const GenRegister *dst, uint32_t 
> + elemNum, uint32_t bti, bool doubleio);
>      /*! Untyped write (up to 4 elements) */
> -    void UNTYPED_WRITE(Reg addr, const GenRegister *src, uint32_t
> elemNum, uint32_t bti);
> +    void UNTYPED_WRITE(Reg addr, const GenRegister *src, uint32_t 
> + elemNum, uint32_t bti, bool doubleio);
>      /*! Byte gather (for unaligned bytes, shorts and ints) */
>      void BYTE_GATHER(Reg dst, Reg addr, uint32_t elemSize, uint32_t bti);
>      /*! Byte scatter (for unaligned bytes, shorts and ints) */ @@ 
> -655,6
> +656,7 @@ namespace gbe
>        case FAMILY_WORD: SEL_REG(uw16grf, uw8grf, uw1grf); break;
>        case FAMILY_BYTE: SEL_REG(ub16grf, ub8grf, ub1grf); break;
>        case FAMILY_DWORD: SEL_REG(f16grf, f8grf, f1grf); break;
> +      case FAMILY_QWORD: SEL_REG(df16grf, df8grf, df1grf); break;
>        default: NOT_SUPPORTED;
>      }
>      GBE_ASSERT(false);
> @@ -715,7 +717,8 @@ namespace gbe
>    void Selection::Opaque::UNTYPED_READ(Reg addr,
>                                         const GenRegister *dst,
>                                         uint32_t elemNum,
> -                                       uint32_t bti)
> +                                       uint32_t bti,
> +                                       bool doubleio)
>    {
>      SelectionInstruction *insn = 
> this->appendInsn(SEL_OP_UNTYPED_READ,
> elemNum, 1);
>      SelectionVector *srcVector = this->appendVector(); @@ -727,11 
> +730,13 @@ namespace gbe
>      insn->src(0) = addr;
>      insn->extra.function = bti;
>      insn->extra.elem = elemNum;
> +    insn->doubleio = doubleio;
> 
>      // Sends require contiguous allocation
>      dstVector->regNum = elemNum;
>      dstVector->isSrc = 0;
>      dstVector->reg = &insn->dst(0);
> +    dstVector->doubleio = doubleio;
> 
>      // Source cannot be scalar (yet)
>      srcVector->regNum = 1;
> @@ -742,7 +747,8 @@ namespace gbe
>    void Selection::Opaque::UNTYPED_WRITE(Reg addr,
>                                          const GenRegister *src,
>                                          uint32_t elemNum,
> -                                        uint32_t bti)
> +                                        uint32_t bti,
> +                                        bool doubleio)
>    {
>      SelectionInstruction *insn = 
> this->appendInsn(SEL_OP_UNTYPED_WRITE,
> 0, elemNum+1);
>      SelectionVector *vector = this->appendVector(); @@ -753,11 
> +759,13 @@ namespace gbe
>        insn->src(elemID+1) = src[elemID];
>      insn->extra.function = bti;
>      insn->extra.elem = elemNum;
> +    insn->doubleio = doubleio;
> 
>      // Sends require contiguous allocation for the sources
>      vector->regNum = elemNum+1;
>      vector->reg = &insn->src(0);
>      vector->isSrc = 1;
> +    vector->doubleio = doubleio;
>    }
> 
>    void Selection::Opaque::BYTE_GATHER(Reg dst, Reg addr, uint32_t 
> elemSize, uint32_t bti) { @@ -1085,6 +1093,15 @@ namespace gbe
>    // Implementation of all patterns
>
///////////////////////////////////////////////////////////////////////////
> 
> +  bool canGetRegisterFromImmediate(const ir::Instruction &insn) {
> +    using namespace ir;
> +    const auto &childInsn = cast<LoadImmInstruction>(insn);
> +    const auto &imm = childInsn.getImmediate();
> +    if(imm.type != TYPE_DOUBLE)
> +      return true;
> +    return false;
> +  }
> +
>    GenRegister getRegisterFromImmediate(ir::Immediate imm)
>    {
>      using namespace ir;
> @@ -1096,6 +1113,7 @@ namespace gbe
>        case TYPE_S16: return  GenRegister::immw(imm.data.s16);
>        case TYPE_U8:  return GenRegister::immuw(imm.data.u8);
>        case TYPE_S8:  return GenRegister::immw(imm.data.s8);
> +      case TYPE_DOUBLE: return GenRegister::immdf(imm.data.f64);
>        default: NOT_SUPPORTED; return GenRegister::immuw(0);
>      }
>    }
> @@ -1218,14 +1236,14 @@ namespace gbe
>        SelectionDAG *dag1 = dag.child[1];
> 
>        // Right source can always be an immediate
> -      if (OCL_OPTIMIZE_IMMEDIATE && dag1 != NULL &&
> dag1->insn.getOpcode() == OP_LOADI) {
> +      if (OCL_OPTIMIZE_IMMEDIATE && dag1 != NULL &&
> + dag1->insn.getOpcode() == OP_LOADI &&
> + canGetRegisterFromImmediate(dag1->insn)) {
>          const auto &childInsn = cast<LoadImmInstruction>(dag1->insn);
>          src0 = sel.selReg(insn.getSrc(0), type);
>          src1 = getRegisterFromImmediate(childInsn.getImmediate());
>          if (dag0) dag0->isRoot = 1;
>        }
>        // Left source cannot be immediate but it is OK if we can commute
> -      else if (OCL_OPTIMIZE_IMMEDIATE && dag0 != NULL &&
> insn.commutes() && dag0->insn.getOpcode() == OP_LOADI) {
> +      else if (OCL_OPTIMIZE_IMMEDIATE && dag0 != NULL &&
> + insn.commutes() && dag0->insn.getOpcode() == OP_LOADI &&
> + canGetRegisterFromImmediate(dag0->insn)) {
>          const auto &childInsn = cast<LoadImmInstruction>(dag0->insn);
>          src0 = sel.selReg(insn.getSrc(1), type);
>          src1 = getRegisterFromImmediate(childInsn.getImmediate());
> @@ -1261,7 +1279,7 @@ namespace gbe
>          case OP_SHR: sel.SHR(dst, src0, src1); break;
>          case OP_ASR: sel.ASR(dst, src0, src1); break;
>          case OP_MUL:
> -          if (type == TYPE_FLOAT)
> +          if (type == TYPE_FLOAT || type == TYPE_DOUBLE)
>              sel.MUL(dst, src0, src1);
>            else if (type == TYPE_U32 || type == TYPE_S32) {
>              sel.pop();
> @@ -1592,6 +1610,7 @@ namespace gbe
>          case TYPE_S16: sel.MOV(dst, GenRegister::immw(imm.data.s16)); 
> break;
>          case TYPE_U8:  sel.MOV(dst, GenRegister::immuw(imm.data.u8)); 
> break;
>          case TYPE_S8:  sel.MOV(dst, GenRegister::immw(imm.data.s8)); 
> break;
> +        case TYPE_DOUBLE: sel.MOV(dst, 
> + GenRegister::immdf(imm.data.f64)); break;
>          default: NOT_SUPPORTED;
>        }
>        sel.pop();
> @@ -1639,6 +1658,7 @@ namespace gbe
>    INLINE uint32_t getByteScatterGatherSize(ir::Type type) {
>      using namespace ir;
>      switch (type) {
> +      case TYPE_DOUBLE:
>        case TYPE_FLOAT:
>        case TYPE_U32:
>        case TYPE_S32:
> @@ -1665,9 +1685,10 @@ namespace gbe
>        using namespace ir;
>        const uint32_t valueNum = insn.getValueNum();
>        vector<GenRegister> dst(valueNum);
> +      bool doubleio = insn.getValueType() == TYPE_DOUBLE ? true :
> + false;
>        for (uint32_t dstID = 0; dstID < valueNum; ++dstID)
>          dst[dstID] =
GenRegister::retype(sel.selReg(insn.getValue(dstID)),
> GEN_TYPE_F);
> -      sel.UNTYPED_READ(addr, dst.data(), valueNum, bti);
> +      sel.UNTYPED_READ(addr, dst.data(), valueNum, bti, doubleio);
>      }
> 
>      void emitByteGather(Selection::Opaque &sel, @@ -1744,11 +1765,12 
> @@ namespace gbe
>        const uint32_t addrID = ir::StoreInstruction::addressIndex;
>        GenRegister addr;
>        vector<GenRegister> value(valueNum);
> +      bool doubleio = insn.getValueType() == TYPE_DOUBLE ? true :
> + false;
> 
>        addr = GenRegister::retype(sel.selReg(insn.getSrc(addrID)),
> GEN_TYPE_F);;
>        for (uint32_t valueID = 0; valueID < valueNum; ++valueID)
>          value[valueID] =
> GenRegister::retype(sel.selReg(insn.getValue(valueID)), GEN_TYPE_F);
> -      sel.UNTYPED_WRITE(addr, value.data(), valueNum, bti);
> +      sel.UNTYPED_WRITE(addr, value.data(), valueNum, bti, doubleio);
>      }
> 
>      void emitByteScatter(Selection::Opaque &sel, @@ -1828,7 +1850,7 
> @@ namespace gbe
>        SelectionDAG *dag1 = dag.child[1];
> 
>        // Right source can always be an immediate
> -      if (OCL_OPTIMIZE_IMMEDIATE && dag1 != NULL &&
> dag1->insn.getOpcode() == OP_LOADI) {
> +      if (OCL_OPTIMIZE_IMMEDIATE && dag1 != NULL &&
> + dag1->insn.getOpcode() == OP_LOADI &&
> + canGetRegisterFromImmediate(dag1->insn)) {
>          const auto &childInsn = cast<LoadImmInstruction>(dag1->insn);
>          src0 = sel.selReg(insn.getSrc(0), type);
>          src1 = getRegisterFromImmediate(childInsn.getImmediate());
> @@ -1862,7 +1884,7 @@ namespace gbe
>        const GenRegister src = sel.selReg(insn.getSrc(0), srcType);
> 
>        // We need two instructions to make the conversion
> -      if (dstFamily != FAMILY_DWORD && srcFamily == FAMILY_DWORD) {
> +      if (dstFamily != FAMILY_DWORD && dstFamily != FAMILY_QWORD && 
> + srcFamily == FAMILY_DWORD) {
>          GenRegister unpacked;
>          if (dstFamily == FAMILY_WORD) {
>            const uint32_t type = TYPE_U16 ? GEN_TYPE_UW : GEN_TYPE_W; 
> diff --git a/backend/src/backend/gen_insn_selection.hpp
> b/backend/src/backend/gen_insn_selection.hpp
> index 778eb1f..5f7289d 100644
> --- a/backend/src/backend/gen_insn_selection.hpp
> +++ b/backend/src/backend/gen_insn_selection.hpp
> @@ -116,6 +116,8 @@ namespace gbe
>      uint8_t srcNum:5;
>      /*! To store various indices */
>      uint16_t index;
> +    /*! Double size I/O ? */
> +    bool doubleio;
>      /*! Variable sized. Destinations and sources go here */
>      GenRegister regs[0];
>    private:
> @@ -138,6 +140,8 @@ namespace gbe
>      uint16_t regNum;
>      /*! Indicate if this a destination or a source vector */
>      uint16_t isSrc;
> +    /*! "double IO" requires two more register */
> +    bool doubleio;
>    };
> 
>    // Owns the selection block
> diff --git a/backend/src/backend/gen_reg_allocation.cpp
> b/backend/src/backend/gen_reg_allocation.cpp
> index 469be12..6b63c41 100644
> --- a/backend/src/backend/gen_reg_allocation.cpp
> +++ b/backend/src/backend/gen_reg_allocation.cpp
> @@ -454,7 +454,6 @@ namespace gbe
>    }
> 
>    bool GenRegAllocator::Opaque::allocateGRFs(Selection &selection) {
> -
>      // Perform the linear scan allocator
>      const uint32_t regNum = ctx.sel->getRegNum();
>      for (uint32_t startID = 0; startID < regNum; ++startID) { @@ 
> -472,7
> +471,9 @@ namespace gbe
>          const SelectionVector *vector = it->second.first;
>          const uint32_t simdWidth = ctx.getSimdWidth();
>          const uint32_t alignment = simdWidth * sizeof(uint32_t);
> -        const uint32_t size = vector->regNum * alignment;
> +        uint32_t size = vector->regNum * alignment;
> +        if (vector->doubleio)
> +          size += alignment;
>          uint32_t grfOffset;
>          while ((grfOffset = ctx.allocate(size, alignment)) == 0) {
>            const bool success = this->expireGRF(interval); @@ -667,6
> +668,11 @@ namespace gbe
>      // First we try to put all booleans registers into flags
>      this->allocateFlags(selection);
> 
> +    int w = ctx.getSimdWidth();
> +    int offst = ctx.allocate(w * sizeof(int) * 2, w * sizeof(int));
> +    GBE_ASSERT(offst != 0);
> +    RA.insert(std::make_pair(ocl::doubleio, offst));
> +
>      // Allocate all the GRFs now (regular register and boolean that 
> are
not in
>      // flag registers)
>      return this->allocateGRFs(selection); diff --git 
> a/backend/src/backend/gen_register.hpp
> b/backend/src/backend/gen_register.hpp
> index d772b0d..5870b07 100644
> --- a/backend/src/backend/gen_register.hpp
> +++ b/backend/src/backend/gen_register.hpp
> @@ -192,6 +192,7 @@ namespace gbe
> 
>      /*! For immediates or virtual register */
>      union {
> +      double df;
>        float f;
>        int32_t d;
>        uint32_t ud;
> @@ -211,6 +212,31 @@ namespace gbe
>      uint32_t quarter:1;      //!< To choose which part we want (Q1 / Q2)
>      uint32_t address_mode:1; //!< direct or indirect
> 
> +    static INLINE GenRegister offset(GenRegister reg, int nr, int 
> + subnr =
0) {
> +      GenRegister r = reg;
> +      r.nr += nr;
> +      r.subnr += subnr;
> +      return r;
> +    }
> +
> +    INLINE bool isimmdf(void) const {
> +      if (type == GEN_TYPE_DF && file == GEN_IMMEDIATE_VALUE)
> +        return true;
> +      return false;
> +    }
> +
> +    INLINE bool isdf(void) const {
> +      if (type == GEN_TYPE_DF && file == GEN_GENERAL_REGISTER_FILE)
> +        return true;
> +      return false;
> +    }
> +
> +    static INLINE GenRegister h2(GenRegister reg) {
> +      GenRegister r = reg;
> +      r.hstride = GEN_HORIZONTAL_STRIDE_2;
> +      return r;
> +    }
> +
>      static INLINE GenRegister QnVirtual(GenRegister reg, uint32_t
quarter) {
>        GBE_ASSERT(reg.physical == 0);
>        if (reg.hstride == GEN_HORIZONTAL_STRIDE_0) // scalar register 
> @@
> -293,6 +319,18 @@ namespace gbe
>        return reg;
>      }
> 
> +    static INLINE GenRegister df16(uint32_t file, ir::Register reg) {
> +      return retype(vec16(file, reg), GEN_TYPE_DF);
> +    }
> +
> +    static INLINE GenRegister df8(uint32_t file, ir::Register reg) {
> +      return retype(vec8(file, reg), GEN_TYPE_DF);
> +    }
> +
> +    static INLINE GenRegister df1(uint32_t file, ir::Register reg) {
> +      return retype(vec1(file, reg), GEN_TYPE_DF);
> +    }
> +
>      static INLINE GenRegister ud16(uint32_t file, ir::Register reg) {
>        return retype(vec16(file, reg), GEN_TYPE_UD);
>      }
> @@ -371,6 +409,12 @@ namespace gbe
>                           GEN_HORIZONTAL_STRIDE_0);
>      }
> 
> +    static INLINE GenRegister immdf(double df) {
> +      GenRegister immediate = imm(GEN_TYPE_DF);
> +      immediate.value.df = df;
> +      return immediate;
> +    }
> +
>      static INLINE GenRegister immf(float f) {
>        GenRegister immediate = imm(GEN_TYPE_F);
>        immediate.value.f = f;
> @@ -448,6 +492,18 @@ namespace gbe
>        return vec16(GEN_GENERAL_REGISTER_FILE, reg);
>      }
> 
> +    static INLINE GenRegister df1grf(ir::Register reg) {
> +      return df1(GEN_GENERAL_REGISTER_FILE, reg);
> +    }
> +
> +    static INLINE GenRegister df8grf(ir::Register reg) {
> +      return df8(GEN_GENERAL_REGISTER_FILE, reg);
> +    }
> +
> +    static INLINE GenRegister df16grf(ir::Register reg) {
> +      return df16(GEN_GENERAL_REGISTER_FILE, reg);
> +    }
> +
>      static INLINE GenRegister ud16grf(ir::Register reg) {
>        return ud16(GEN_GENERAL_REGISTER_FILE, reg);
>      }
> @@ -613,6 +669,18 @@ namespace gbe
>        return reg;
>      }
> 
> +    static INLINE GenRegister df16(uint32_t file, uint32_t nr, 
> + uint32_t
subnr)
> {
> +      return retype(vec16(file, nr, subnr), GEN_TYPE_DF);
> +    }
> +
> +    static INLINE GenRegister df8(uint32_t file, uint32_t nr, 
> + uint32_t
subnr) {
> +      return retype(vec8(file, nr, subnr), GEN_TYPE_DF);
> +    }
> +
> +    static INLINE GenRegister df1(uint32_t file, uint32_t nr, 
> + uint32_t
subnr) {
> +      return retype(vec1(file, nr, subnr), GEN_TYPE_DF);
> +    }
> +
>      static INLINE GenRegister ud16(uint32_t file, uint32_t nr, 
> uint32_t
subnr)
> {
>        return retype(vec16(file, nr, subnr), GEN_TYPE_UD);
>      }
> @@ -685,6 +753,18 @@ namespace gbe
>        return vec16(GEN_GENERAL_REGISTER_FILE, nr, subnr);
>      }
> 
> +    static INLINE GenRegister df16grf(uint32_t nr, uint32_t subnr) {
> +      return df16(GEN_GENERAL_REGISTER_FILE, nr, subnr);
> +    }
> +
> +    static INLINE GenRegister df8grf(uint32_t nr, uint32_t subnr) {
> +      return df8(GEN_GENERAL_REGISTER_FILE, nr, subnr);
> +    }
> +
> +    static INLINE GenRegister df1grf(uint32_t nr, uint32_t subnr) {
> +      return df1(GEN_GENERAL_REGISTER_FILE, nr, subnr);
> +    }
> +
>      static INLINE GenRegister ud16grf(uint32_t nr, uint32_t subnr) {
>        return ud16(GEN_GENERAL_REGISTER_FILE, nr, subnr);
>      }
> diff --git a/backend/src/ir/profile.cpp b/backend/src/ir/profile.cpp 
> index 99cd06c..9a17f1d 100644
> --- a/backend/src/ir/profile.cpp
> +++ b/backend/src/ir/profile.cpp
> @@ -41,6 +41,7 @@ namespace ir {
>          "block_ip",
>          "barrier_id", "thread_number",
>          "const_curbe_offset",
> +        "double_io",
>      };
> 
>  #if GBE_DEBUG
> @@ -77,6 +78,7 @@ namespace ir {
>        DECL_NEW_REG(FAMILY_DWORD, threadn);
>        DECL_NEW_REG(FAMILY_DWORD, constoffst);
>        DECL_NEW_REG(FAMILY_DWORD, workdim);
> +      DECL_NEW_REG(FAMILY_DWORD, doubleio);
>      }
>  #undef DECL_NEW_REG
> 
> diff --git a/backend/src/ir/profile.hpp b/backend/src/ir/profile.hpp 
> index 4b0ef5e..029b47c 100644
> --- a/backend/src/ir/profile.hpp
> +++ b/backend/src/ir/profile.hpp
> @@ -65,7 +65,8 @@ namespace ir {
>      static const Register threadn = Register(21);  // number of threads
>      static const Register constoffst = Register(22); // offset of 
> global constant array's curbe
>      static const Register workdim = Register(23);  // work dimention.
> -    static const uint32_t regNum = 24;             // number of special
> registers
> +    static const Register doubleio = Register(24);  // work dimention.
> +    static const uint32_t regNum = 25;             // number of special
> registers
>      extern const char *specialRegMean[];           // special register
> name.
>    } /* namespace ocl */
> 
> --
> 1.8.1.2
> 
> _______________________________________________
> Beignet mailing list
> Beignet at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/beignet