[Beignet] [PATCH] GBE: refactor double support.

Mon Aug 5 01:19:30 PDT 2013

Yes. I will do the rebase work.

-----Original Message-----
From: Zhigang Gong [mailto:zhigang.gong at gmail.com] 
Sent: Monday, August 5, 2013 4:09 PM
To: Xing, Homer
Cc: beignet at lists.freedesktop.org
Subject: Re: [Beignet] [PATCH] GBE: refactor double support.

Homer,

Did you aware of that this patch conflicts your 64 bit vector allocation patch and all your 64 bit int patchset?

I will ignore your 64bit vector allocation patch, as it was already handled in this patch. And all your other patchset need to be rebased to this patch?

Are you ok with that?

On Mon, Aug 05, 2013 at 07:30:52AM +0000, Xing, Homer wrote:
> You turned off predicate and mask in 64-bit reading/writing. And use SIMD16 data port reading for SIMD8 read. Good idea.
> 
> This patch looks good to me.
> 
> Homer
> 
> -----Original Message-----
> From: beignet-bounces+homer.xing=intel.com at lists.freedesktop.org 
> [mailto:beignet-bounces+homer.xing=intel.com at lists.freedesktop.org] On 
> Behalf Of Zhigang Gong
> Sent: Monday, August 5, 2013 11:08 AM
> To: beignet at lists.freedesktop.org
> Cc: Zhigang Gong
> Subject: [Beignet] [PATCH] GBE: refactor double support.
> 
> There are two major issues in double support:
> 1. Doesn't work at SIMD16 mode.
> 2. The incorrect usage of vectors. We only need to allocate those temporary register to contiguous registers.
> 
> If you look at the previous implementation of
> READ_FLOAT64/WRITE_FLOAT64 in gen_encoder.cpp. You can easily find it contains many duplicate code and considering the SIMD16 code path never work correctly, it's so difficult to work based on that code. So I choose to refactor those two major functions.
> And refine other parts in the instruction selection stage to fix the above two major problem with a cleaner code.
> 
> Now, it works well on both SIMD16/SIMD8 mode.
> Another minor improvement is for the READ_FLOAT64 on SIMD8 mode, this 
> patch saves one time of send instruction to read all the
> 8 double data into registers.
> 
> Signed-off-by: Zhigang Gong <zhigang.gong at linux.intel.com>
> ---
>  backend/src/backend/gen_context.cpp        |   21 ++-
>  backend/src/backend/gen_encoder.cpp        |  231 +++++++++++++---------------
>  backend/src/backend/gen_encoder.hpp        |    7 +-
>  backend/src/backend/gen_insn_selection.cpp |   64 +++++---
>  backend/src/backend/gen_reg_allocation.cpp |   10 +-
>  backend/src/backend/gen_register.hpp       |   25 +++
>  backend/src/llvm/llvm_gen_backend.cpp      |    4 +-
>  7 files changed, 200 insertions(+), 162 deletions(-)
> 
> diff --git a/backend/src/backend/gen_context.cpp 
> b/backend/src/backend/gen_context.cpp
> index e33d8da..655b1d7 100644
> --- a/backend/src/backend/gen_context.cpp
> +++ b/backend/src/backend/gen_context.cpp
> @@ -354,12 +354,18 @@ namespace gbe
>      p->pop();
>    }
>  
> +  //  For SIMD8, we allocate 2*elemNum temporary registers from 
> + dst(0), and  //  then follow the real destination registers.
> +  //  For SIMD16, we allocate elemNum temporary registers from dst(0).
>    void GenContext::emitReadFloat64Instruction(const SelectionInstruction &insn) {
> -    const GenRegister dst = ra->genReg(insn.dst(0));
> +    const uint32_t elemNum = insn.extra.elem;
> +    const uint32_t tmpRegSize = (p->curr.execWidth == 8) ? elemNum * 2 : elemNum;
> +    const GenRegister dst = ra->genReg(insn.dst(tmpRegSize));
> +    const GenRegister tmp = ra->genReg(insn.dst(0));
>      const GenRegister src = ra->genReg(insn.src(0));
> +    const GenRegister tempAddr = ra->genReg(insn.src(1));
>      const uint32_t bti = insn.extra.function;
> -    const uint32_t elemNum = insn.extra.elem;
> -    p->READ_FLOAT64(dst, src, bti, elemNum);
> +    p->READ_FLOAT64(dst, tmp, tempAddr, src, bti, elemNum);
>    }
>  
>    void GenContext::emitUntypedReadInstruction(const SelectionInstruction &insn) { @@ -370,11 +376,16 @@ namespace gbe
>      p->UNTYPED_READ(dst, src, bti, elemNum);
>    }
>  
> +  //  For SIMD8, we allocate 2*elemNum temporary registers from 
> + dst(0), and  //  then follow the real destination registers.
> +  //  For SIMD16, we allocate elemNum temporary registers from dst(0).
>    void GenContext::emitWriteFloat64Instruction(const SelectionInstruction &insn) {
>      const GenRegister src = ra->genReg(insn.src(0));
> -    const uint32_t bti = insn.extra.function;
>      const uint32_t elemNum = insn.extra.elem;
> -    p->WRITE_FLOAT64(src, bti, elemNum);
> +    const uint32_t tmpRegSize = (p->curr.execWidth == 8) ? elemNum * 2 : elemNum;
> +    const GenRegister data = ra->genReg(insn.src(tmpRegSize + 1));
> +    const uint32_t bti = insn.extra.function;
> +    p->WRITE_FLOAT64(src, data, bti, elemNum);
>    }
>  
>    void GenContext::emitUntypedWriteInstruction(const 
> SelectionInstruction &insn) { diff --git 
> a/backend/src/backend/gen_encoder.cpp 
> b/backend/src/backend/gen_encoder.cpp
> index f84c6dd..5930926 100644
> --- a/backend/src/backend/gen_encoder.cpp
> +++ b/backend/src/backend/gen_encoder.cpp
> @@ -356,103 +356,69 @@ namespace gbe
>      0
>    };
>  
> -  static int dst_type(int exec_width) {
> -    if (exec_width == 8)
> -      return GEN_TYPE_UD;
> -    if (exec_width == 16)
> -      return GEN_TYPE_UW;
> -    NOT_IMPLEMENTED;
> -    return 0;
> -  }
> -
> -  void GenEncoder::READ_FLOAT64(GenRegister dst, GenRegister src, uint32_t bti, uint32_t elemNum) {
> -    int w = curr.execWidth;
> -    dst = GenRegister::h2(dst);
> -    dst.type = GEN_TYPE_UD;
> -    src.type = GEN_TYPE_UD;
> -    GenRegister r = GenRegister::retype(GenRegister::suboffset(src, w*2), GEN_TYPE_UD);
> -    GenRegister imm4 = GenRegister::immud(4);
> -    GenInstruction *insn;
> -    insn = next(GEN_OPCODE_SEND);
> -    setHeader(insn);
> -    setDst(insn, GenRegister::uw16grf(r.nr, 0));
> -    setSrc0(insn, GenRegister::ud8grf(src.nr, 0));
> -    setSrc1(insn, GenRegister::immud(0));
> -    setDPUntypedRW(this, insn, bti, untypedRWMask[1], GEN_UNTYPED_READ, curr.execWidth / 8, curr.execWidth / 8);
> -    push();
> -    curr.quarterControl = 0;
> -    curr.nibControl = 0;
> -    MOV(dst, r);
> -    if (w == 8)
> -      curr.nibControl = 1;
> -    else
> -      curr.quarterControl = 1;
> -    MOV(GenRegister::suboffset(dst, w), GenRegister::suboffset(r, w / 2));
> -    pop();
> -    ADD(src, src, imm4);
> -    insn = next(GEN_OPCODE_SEND);
> -    setHeader(insn);
> -    setDst(insn, GenRegister::uw16grf(r.nr, 0));
> -    setSrc0(insn, GenRegister::ud8grf(src.nr, 0));
> -    setSrc1(insn, GenRegister::immud(0));
> -    setDPUntypedRW(this, insn, bti, untypedRWMask[1], GEN_UNTYPED_READ, curr.execWidth / 8, curr.execWidth / 8);
> +  void GenEncoder::READ_FLOAT64(GenRegister dst, GenRegister tmp, GenRegister addr, GenRegister src, uint32_t bti, uint32_t elemNum) {
> +    GenRegister dst32 = GenRegister::retype(dst, GEN_TYPE_UD);
> +    src = GenRegister::retype(src, GEN_TYPE_UD);
> +    addr = GenRegister::retype(addr, GEN_TYPE_UD);
> +    tmp = GenRegister::retype(tmp, GEN_TYPE_UD);
> +    uint32_t originSimdWidth = curr.execWidth;
> +    uint32_t originPredicate = curr.predicate;
> +    uint32_t originMask = curr.noMask;
>      push();
> -    curr.quarterControl = 0;
> -    curr.nibControl = 0;
> -    MOV(GenRegister::suboffset(dst, 1), r);
> -    if (w == 8)
> -      curr.nibControl = 1;
> -    else
> -      curr.quarterControl = 1;
> -    MOV(GenRegister::suboffset(dst, w + 1), GenRegister::suboffset(r, w / 2));
> +    for ( uint32_t channels = 0, currQuarter = GEN_COMPRESSION_Q1;
> +          channels < originSimdWidth; channels += 8, currQuarter++) {
> +      curr.predicate = GEN_PREDICATE_NONE;
> +      curr.noMask = GEN_MASK_DISABLE;
> +      curr.execWidth = 8;
> +      /* XXX The following instruction is illegal, but it works as SIMD 1*4 mode
> +         which is what we want here. */
> +      MOV(GenRegister::h2(addr), GenRegister::suboffset(src, channels));
> +      ADD(GenRegister::h2(GenRegister::suboffset(addr, 1)), GenRegister::suboffset(src, channels), GenRegister::immd(4));
> +      MOV(GenRegister::h2(GenRegister::suboffset(addr, 8)), GenRegister::suboffset(src, channels + 4));
> +      ADD(GenRegister::h2(GenRegister::suboffset(addr, 9)), GenRegister::suboffset(src, channels + 4), GenRegister::immd(4));
> +      // get the first 8 doubles.
> +      curr.execWidth = 16;
> +      this->UNTYPED_READ(tmp, addr, bti, elemNum);
> +      if (originSimdWidth == 16)
> +        curr.quarterControl = currQuarter;
> +      curr.predicate = originPredicate;
> +      curr.noMask = originMask;
> +      // Back to simd8 for correct predication flag.
> +      curr.execWidth = 8;
> +      MOV(GenRegister::retype(GenRegister::suboffset(dst32, channels * 2), GEN_TYPE_DF), GenRegister::retype(tmp, GEN_TYPE_DF));
> +    }
>      pop();
>    }
>  
> -  void GenEncoder::WRITE_FLOAT64(GenRegister msg, uint32_t bti, uint32_t elemNum) {
> -    int w = curr.execWidth;
> -    GenRegister r = GenRegister::retype(GenRegister::suboffset(msg, w*3), GEN_TYPE_UD);
> -    r.type = GEN_TYPE_UD;
> -    GenRegister hdr = GenRegister::h2(r);
> -    GenRegister src = GenRegister::ud16grf(msg.nr + w / 8, 0);
> -    src.hstride = GEN_HORIZONTAL_STRIDE_2;
> -    GenRegister data = GenRegister::offset(r, w / 8);
> -    GenRegister imm4 = GenRegister::immud(4);
> -    MOV(r, GenRegister::ud8grf(msg.nr, 0));
> +  void GenEncoder::WRITE_FLOAT64(GenRegister msg, GenRegister data, uint32_t bti, uint32_t elemNum) {
> +    GenRegister data32 = GenRegister::retype(data, GEN_TYPE_UD);
> +    msg = GenRegister::retype(msg, GEN_TYPE_UD);
> +    int originSimdWidth = curr.execWidth;
> +    int originPredicate = curr.predicate;
> +    int originMask = curr.noMask;
>      push();
> -    curr.quarterControl = 0;
> -    curr.nibControl = 0;
> -    MOV(data, src);
> -    if (w == 8)
> -      curr.nibControl = 1;
> -    else
> -      curr.quarterControl = 1;
> -    MOV(GenRegister::suboffset(data, w / 2), GenRegister::suboffset(src, w));
> -    pop();
> -    GenInstruction *insn;
> -    insn = next(GEN_OPCODE_SEND);
> -    setHeader(insn);
> -    setDst(insn, GenRegister::retype(GenRegister::null(), dst_type(curr.execWidth)));
> -    setSrc0(insn, GenRegister::ud8grf(hdr.nr, 0));
> -    setSrc1(insn, GenRegister::immud(0));
> -    setDPUntypedRW(this, insn, bti, untypedRWMask[1], GEN_UNTYPED_WRITE, curr.execWidth / 4, 0);
> -
> -    ADD(r, GenRegister::ud8grf(msg.nr, 0), imm4);
> -    push();
> -    curr.quarterControl = 0;
> -    curr.nibControl = 0;
> -    MOV(data, GenRegister::suboffset(src, 1));
> -    if (w == 8)
> -      curr.nibControl = 1;
> -    else
> -      curr.quarterControl = 1;
> -    MOV(GenRegister::suboffset(data, w / 2), GenRegister::suboffset(src, w + 1));
> +    for (uint32_t half = 0; half < 2; half++) {
> +      curr.predicate = GEN_PREDICATE_NONE;
> +      curr.noMask = GEN_MASK_DISABLE;
> +      curr.execWidth = 8;
> +      MOV(GenRegister::suboffset(msg, originSimdWidth), GenRegister::unpacked_ud(data32.nr, data32.subnr + half));
> +      if (originSimdWidth == 16) {
> +        MOV(GenRegister::suboffset(msg, originSimdWidth + 8), GenRegister::unpacked_ud(data32.nr + 2, data32.subnr + half));
> +        curr.execWidth = 16;
> +      }
> +      if (half == 1)
> +        ADD(GenRegister::retype(msg, GEN_TYPE_UD), GenRegister::retype(msg, GEN_TYPE_UD), GenRegister::immd(4));
> +      curr.predicate = originPredicate;
> +      curr.noMask = originMask;
> +      this->UNTYPED_WRITE(msg, bti, elemNum);
> +    }
> +    /* Let's restore the original message(addr) register. */
> +    /* XXX could be optimized if we don't allocate the address to the header
> +       position of the message. */
> +    curr.predicate = GEN_PREDICATE_NONE;
> +    curr.noMask = GEN_MASK_DISABLE;
> +    ADD(msg, GenRegister::retype(msg, GEN_TYPE_UD), 
> + GenRegister::immd(-4));
>      pop();
> -    insn = next(GEN_OPCODE_SEND);
> -    setHeader(insn);
> -    setDst(insn, GenRegister::retype(GenRegister::null(), dst_type(curr.execWidth)));
> -    setSrc0(insn, GenRegister::ud8grf(hdr.nr, 0));
> -    setSrc1(insn, GenRegister::immud(0));
> -    setDPUntypedRW(this, insn, bti, untypedRWMask[1], GEN_UNTYPED_WRITE, curr.execWidth / 4, 0);
>    }
>  
>    void GenEncoder::UNTYPED_READ(GenRegister dst, GenRegister src, uint32_t bti, uint32_t elemNum) { @@ -470,7 +436,7 @@ namespace gbe
>        NOT_IMPLEMENTED;
>  
>      this->setHeader(insn);
> -    this->setDst(insn, GenRegister::uw16grf(dst.nr, 0));
> +    this->setDst(insn,  GenRegister::uw16grf(dst.nr, 0));
>      this->setSrc0(insn, GenRegister::ud8grf(src.nr, 0));
>      this->setSrc1(insn, GenRegister::immud(0));
>      setDPUntypedRW(this,
> @@ -601,25 +567,53 @@ namespace gbe
>       return &this->store.back();
>    }
>  
> -  INLINE void alu1(GenEncoder *p, uint32_t opcode, GenRegister dst, GenRegister src) {
> -     if (dst.isdf() && src.isdf()) {
> +  INLINE void _handleDouble(GenEncoder *p, uint32_t opcode, GenRegister dst,
> +                            GenRegister src0, GenRegister src1 =
> + GenRegister::null()) {
>         int w = p->curr.execWidth;
>         p->push();
> -       p->curr.quarterControl = 0;
>         p->curr.nibControl = 0;
>         GenInstruction *insn = p->next(opcode);
>         p->setHeader(insn);
>         p->setDst(insn, dst);
> -       p->setSrc0(insn, src);
> +       p->setSrc0(insn, src0);
> +       if (!GenRegister::isNull(src1))
> +         p->setSrc1(insn, src1);
>         if (w == 8)
>           p->curr.nibControl = 1; // second 1/8 mask
> -       else // w == 16
> -         p->curr.quarterControl = 1; // second 1/4 mask
>         insn = p->next(opcode);
>         p->setHeader(insn);
>         p->setDst(insn, GenRegister::suboffset(dst, w / 2));
> -       p->setSrc0(insn, GenRegister::suboffset(src, w / 2));
> +       p->setSrc0(insn, GenRegister::suboffset(src0, w / 2));
> +       if (!GenRegister::isNull(src1))
> +         p->setSrc1(insn, GenRegister::suboffset(src1, w / 2));
>         p->pop();
> +  }
> +
> +  // Double register accessing is a little special,  // Per Gen spec, 
> + then only supported mode is SIMD8 and, it only  // handles four 
> + doubles each time.
> +  // We need to lower down SIMD16 to two SIMD8 and lower down SIMD8  
> + // to two SIMD1x4.
> +  INLINE void handleDouble(GenEncoder *p, uint32_t opcode, GenRegister dst,
> +                           GenRegister src0, GenRegister src1 = GenRegister::null()) {
> +      if (p->curr.execWidth == 8)
> +        _handleDouble(p, opcode, dst, src0, src1);
> +      else if (p->curr.execWidth == 16) {
> +        p->push();
> +        p->curr.execWidth = 8;
> +        p->curr.quarterControl = GEN_COMPRESSION_Q1;
> +        _handleDouble(p, opcode, dst, src0, src1);
> +        p->curr.quarterControl = GEN_COMPRESSION_Q2;
> +        if (!GenRegister::isNull(src1))
> +          src1 = GenRegister::offset(src1, 2);
> +        _handleDouble(p, opcode, GenRegister::offset(dst, 2), GenRegister::offset(src0, 2), src1);
> +        p->pop();
> +      }
> +  }
> +
> +  INLINE void alu1(GenEncoder *p, uint32_t opcode, GenRegister dst, GenRegister src) {
> +     if (dst.isdf() && src.isdf()) {
> +       handleDouble(p, opcode, dst, src);
>       } else if (needToSplitAlu1(p, dst, src) == false) {
>         GenInstruction *insn = p->next(opcode);
>         p->setHeader(insn);
> @@ -653,25 +647,7 @@ namespace gbe
>                     GenRegister src1)
>    {
>      if (dst.isdf() && src0.isdf() && src1.isdf()) {
> -       int w = p->curr.execWidth;
> -       p->push();
> -       p->curr.quarterControl = 0;
> -       p->curr.nibControl = 0;
> -       GenInstruction *insn = p->next(opcode);
> -       p->setHeader(insn);
> -       p->setDst(insn, dst);
> -       p->setSrc0(insn, src0);
> -       p->setSrc1(insn, src1);
> -       if (w == 8)
> -         p->curr.nibControl = 1; // second 1/8 mask
> -       else // w == 16
> -         p->curr.quarterControl = 1; // second 1/4 mask
> -       insn = p->next(opcode);
> -       p->setHeader(insn);
> -       p->setDst(insn, GenRegister::suboffset(dst, w / 2));
> -       p->setSrc0(insn, GenRegister::suboffset(src0, w / 2));
> -       p->setSrc1(insn, GenRegister::suboffset(src1, w / 2));
> -       p->pop();
> +       handleDouble(p, opcode, dst, src0, src1);
>      } else if (needToSplitAlu2(p, dst, src0, src1) == false) {
>         GenInstruction *insn = p->next(opcode);
>         p->setHeader(insn);
> @@ -808,7 +784,16 @@ namespace gbe
>      r.width = GEN_WIDTH_1;
>      r.hstride = GEN_HORIZONTAL_STRIDE_0;
>      push();
> +    uint32_t width = curr.execWidth;
> +    curr.execWidth = 8;
> +    curr.predicate = GEN_PREDICATE_NONE;
> +    curr.noMask = 1;
> +    curr.quarterControl = GEN_COMPRESSION_Q1;
>      MOV(dest, r);
> +    if (width == 16) {
> +      curr.quarterControl = GEN_COMPRESSION_Q2;
> +      MOV(GenRegister::offset(dest, 2), r);
> +    }
>      pop();
>    }
>  
> @@ -839,14 +824,8 @@ namespace gbe
>    void GenEncoder::MOV_DF(GenRegister dest, GenRegister src0, GenRegister r) {
>      int w = curr.execWidth;
>      if (src0.isdf()) {
> -      push();
> -      curr.execWidth = 16;
> -      MOV(dest, src0);
> -      if (w == 16) {
> -        curr.quarterControl = 1;
> -        MOV(GenRegister::QnPhysical(dest, w / 4), GenRegister::QnPhysical(src0, w / 4));
> -      }
> -      pop();
> +      GBE_ASSERT(0); // MOV DF is called from convert instruction,
> +                     // We should never convert a df to a df.
>      } else {
>        GenRegister r0 = GenRegister::h2(r);
>        push();
> diff --git a/backend/src/backend/gen_encoder.hpp 
> b/backend/src/backend/gen_encoder.hpp
> index d3a7165..86e1a71 100644
> --- a/backend/src/backend/gen_encoder.hpp
> +++ b/backend/src/backend/gen_encoder.hpp
> @@ -118,10 +118,11 @@ namespace gbe
>      ALU2(LINE)
>      ALU2(PLN)
>      ALU3(MAD)
> -    ALU2(MOV_DF);
> +    //ALU2(MOV_DF);
>  #undef ALU1
>  #undef ALU2
>  #undef ALU3
> +    void MOV_DF(GenRegister dest, GenRegister src0, GenRegister tmp = 
> + GenRegister::null());
>      void LOAD_DF_IMM(GenRegister dest, GenRegister tmp, double value);
>      /*! Barrier message (to synchronize threads of a workgroup) */
>      void BARRIER(GenRegister src);
> @@ -142,9 +143,9 @@ namespace gbe
>      /*! Atomic instructions */
>      void ATOMIC(GenRegister dst, uint32_t function, GenRegister src, uint32_t bti, uint32_t srcNum);
>      /*! Read 64-bits float arrays */
> -    void READ_FLOAT64(GenRegister dst, GenRegister src, uint32_t bti, uint32_t elemNum);
> +    void READ_FLOAT64(GenRegister dst, GenRegister tmp, GenRegister 
> + addr, GenRegister src, uint32_t bti, uint32_t elemNum);
>      /*! Write 64-bits float arrays */
> -    void WRITE_FLOAT64(GenRegister src, uint32_t bti, uint32_t elemNum);
> +    void WRITE_FLOAT64(GenRegister src, GenRegister data, uint32_t 
> + bti, uint32_t elemNum);
>      /*! Untyped read (upto 4 channels) */
>      void UNTYPED_READ(GenRegister dst, GenRegister src, uint32_t bti, uint32_t elemNum);
>      /*! Untyped write (upto 4 channels) */ diff --git 
> a/backend/src/backend/gen_insn_selection.cpp 
> b/backend/src/backend/gen_insn_selection.cpp
> index d4be8bf..727d07d 100644
> --- a/backend/src/backend/gen_insn_selection.cpp
> +++ b/backend/src/backend/gen_insn_selection.cpp
> @@ -466,9 +466,9 @@ namespace gbe
>      /*! Atomic instruction */
>      void ATOMIC(Reg dst, uint32_t function, uint32_t srcNum, Reg src0, Reg src1, Reg src2, uint32_t bti);
>      /*! Read 64 bits float array */
> -    void READ_FLOAT64(Reg addr, const GenRegister *dst, uint32_t elemNum, uint32_t bti);
> +    void READ_FLOAT64(Reg addr, Reg tempAddr, const GenRegister *dst, 
> + uint32_t elemNum, uint32_t valueNum, uint32_t bti);
>      /*! Write 64 bits float array */
> -    void WRITE_FLOAT64(Reg addr, const GenRegister *src, uint32_t elemNum, uint32_t bti);
> +    void WRITE_FLOAT64(Reg addr, const GenRegister *src, uint32_t 
> + elemNum, uint32_t valueNum, uint32_t bti);
>      /*! Untyped read (up to 4 elements) */
>      void UNTYPED_READ(Reg addr, const GenRegister *dst, uint32_t elemNum, uint32_t bti);
>      /*! Untyped write (up to 4 elements) */ @@ -760,12 +760,16 @@ namespace gbe
>    void Selection::Opaque::NOP(void) { this->appendInsn(SEL_OP_NOP, 0, 0); }
>    void Selection::Opaque::WAIT(void) { this->appendInsn(SEL_OP_WAIT, 
> 0, 0); }
>  
> +  /* elemNum contains all the temporary register and the
> +     real destination registers.*/
>    void Selection::Opaque::READ_FLOAT64(Reg addr,
> +                                       Reg tempAddr,
>                                         const GenRegister *dst,
>                                         uint32_t elemNum,
> +                                       uint32_t valueNum,
>                                         uint32_t bti)
>    {
> -    SelectionInstruction *insn = this->appendInsn(SEL_OP_READ_FLOAT64, elemNum, 1);
> +    SelectionInstruction *insn = 
> + this->appendInsn(SEL_OP_READ_FLOAT64,
> + elemNum, 2);
>      SelectionVector *srcVector = this->appendVector();
>      SelectionVector *dstVector = this->appendVector();
>  
> @@ -773,11 +777,12 @@ namespace gbe
>      for (uint32_t elemID = 0; elemID < elemNum; ++elemID)
>        insn->dst(elemID) = dst[elemID];
>      insn->src(0) = addr;
> +    insn->src(1) = tempAddr;
>      insn->extra.function = bti;
> -    insn->extra.elem = elemNum;
> +    insn->extra.elem = valueNum;
>  
> -    // Sends require contiguous allocation
> -    dstVector->regNum = elemNum;
> +    // Only the temporary registers need contiguous allocation
> +    dstVector->regNum = elemNum - valueNum;
>      dstVector->isSrc = 0;
>      dstVector->reg = &insn->dst(0);
>  
> @@ -814,9 +819,12 @@ namespace gbe
>      srcVector->reg = &insn->src(0);
>    }
>  
> +  /* elemNum contains all the temporary register and the
> +     real data registers.*/
>    void Selection::Opaque::WRITE_FLOAT64(Reg addr,
>                                          const GenRegister *src,
>                                          uint32_t elemNum,
> +                                        uint32_t valueNum,
>                                          uint32_t bti)
>    {
>      SelectionInstruction *insn = this->appendInsn(SEL_OP_WRITE_FLOAT64, 0, elemNum+1); @@ -827,10 +835,10 @@ namespace gbe
>      for (uint32_t elemID = 0; elemID < elemNum; ++elemID)
>        insn->src(elemID+1) = src[elemID];
>      insn->extra.function = bti;
> -    insn->extra.elem = elemNum;
> +    insn->extra.elem = valueNum;
>  
> -    // Sends require contiguous allocation for the sources
> -    vector->regNum = elemNum+1;
> +    // Only the addr + temporary registers need to be contiguous.
> +    vector->regNum = (elemNum - valueNum) + 1;
>      vector->reg = &insn->src(0);
>      vector->isSrc = 1;
>    }
> @@ -1871,13 +1879,17 @@ namespace gbe
>      {
>        using namespace ir;
>        const uint32_t valueNum = insn.getValueNum();
> -      vector<GenRegister> dst(valueNum);
> -      for (uint32_t dstID = 0; dstID < valueNum; ++dstID)
> -        dst[dstID] = GenRegister::retype(sel.selReg(insn.getValue(dstID)), GEN_TYPE_F);
> -      dst.push_back(sel.selReg(sel.reg(FAMILY_QWORD)));
> -      if (sel.ctx.getSimdWidth() == 16)
> -        dst.push_back(sel.selReg(sel.reg(FAMILY_QWORD)));
> -      sel.READ_FLOAT64(addr, dst.data(), dst.size(), bti);
> +      GenRegister dst[valueNum * 3];
> +      uint32_t dstID;
> +      /* XXX support scalar only right now. */
> +      GBE_ASSERT(valueNum == 1);
> +      uint32_t tmpRegNum = (sel.ctx.getSimdWidth() == 8) ? valueNum * 2 : valueNum;
> +      // The first 16 DWORD register space is for temporary usage at encode stage.
> +      for (dstID = 0; dstID < tmpRegNum ; ++dstID)
> +        dst[dstID] = sel.selReg(sel.reg(FAMILY_DWORD));
> +      for ( uint32_t valueID = 0; valueID < valueNum; ++dstID, ++valueID)
> +        dst[dstID] = sel.selReg(insn.getValue(valueID));
> +      sel.READ_FLOAT64(addr, sel.selReg(sel.reg(FAMILY_QWORD)), dst, 
> + valueNum + tmpRegNum, valueNum, bti);
>      }
>  
>      void emitByteGather(Selection::Opaque &sel, @@ -1971,15 +1983,19 @@ namespace gbe
>        const uint32_t valueNum = insn.getValueNum();
>        const uint32_t addrID = ir::StoreInstruction::addressIndex;
>        GenRegister addr;
> -      vector<GenRegister> value(valueNum);
> -
> +      GenRegister src[valueNum * 2];
> +      uint32_t srcID;
> +      /* XXX support scalar only right now. */
> +      GBE_ASSERT(valueNum == 1);
>        addr = GenRegister::retype(sel.selReg(insn.getSrc(addrID)), GEN_TYPE_F);
> -      for (uint32_t valueID = 0; valueID < valueNum; ++valueID)
> -        value[valueID] = GenRegister::retype(sel.selReg(insn.getValue(valueID)), GEN_TYPE_F);
> -      value.push_back(sel.selReg(sel.reg(FAMILY_QWORD)));
> -      if (sel.ctx.getSimdWidth() == 16)
> -        value.push_back(sel.selReg(sel.reg(FAMILY_QWORD)));
> -      sel.WRITE_FLOAT64(addr, value.data(), value.size(), bti);
> +      uint32_t tmpRegNum = (sel.ctx.getSimdWidth() == 8) ? valueNum * 2 : valueNum;
> +      // The first 16 DWORD register space is for temporary usage at encode stage.
> +      for (srcID = 0; srcID < tmpRegNum; ++srcID)
> +        src[srcID] = sel.selReg(sel.reg(FAMILY_DWORD));
> +
> +      for (uint32_t valueID = 0; valueID < valueNum; ++srcID, ++valueID)
> +        src[srcID] = sel.selReg(insn.getValue(valueID));
> +      sel.WRITE_FLOAT64(addr, src, valueNum + tmpRegNum, valueNum, 
> + bti);
>      }
>  
>      void emitByteScatter(Selection::Opaque &sel, diff --git 
> a/backend/src/backend/gen_reg_allocation.cpp 
> b/backend/src/backend/gen_reg_allocation.cpp
> index e7c96ac..4ba03ea 100644
> --- a/backend/src/backend/gen_reg_allocation.cpp
> +++ b/backend/src/backend/gen_reg_allocation.cpp
> @@ -474,7 +474,12 @@ namespace gbe
>        if (it != vectorMap.end()) {
>          const SelectionVector *vector = it->second.first;
>          const uint32_t simdWidth = ctx.getSimdWidth();
> -        const uint32_t alignment = simdWidth * sizeof(uint32_t);
> +
> +        const ir::RegisterData regData = ctx.sel->getRegisterData(reg);
> +        const ir::RegisterFamily family = regData.family;
> +        const uint32_t typeSize = familyVectorSize[family];
> +        const uint32_t alignment = simdWidth*typeSize;
> +
>          const uint32_t size = vector->regNum * alignment;
>          uint32_t grfOffset;
>          while ((grfOffset = ctx.allocate(size, alignment)) == 0) { @@ -483,7 +488,8 @@ namespace gbe
>          }
>          for (uint32_t regID = 0; regID < vector->regNum; ++regID, grfOffset += alignment) {
>            const ir::Register reg = vector->reg[regID].reg();
> -          GBE_ASSERT(RA.contains(reg) == false);
> +          GBE_ASSERT(RA.contains(reg) == false
> +                     && ctx.sel->getRegisterData(reg).family == 
> + family);
>            RA.insert(std::make_pair(reg, grfOffset));
>          }
>        }
> diff --git a/backend/src/backend/gen_register.hpp 
> b/backend/src/backend/gen_register.hpp
> index fedb743..7e48837 100644
> --- a/backend/src/backend/gen_register.hpp
> +++ b/backend/src/backend/gen_register.hpp
> @@ -553,6 +553,11 @@ namespace gbe
>                           GEN_HORIZONTAL_STRIDE_1);
>      }
>  
> +    static INLINE bool isNull(GenRegister reg) {
> +      return (reg.file == GEN_ARCHITECTURE_REGISTER_FILE
> +              && reg.nr == GEN_ARF_NULL);
> +    }
> +
>      static INLINE GenRegister acc(void) {
>        return GenRegister(GEN_ARCHITECTURE_REGISTER_FILE,
>                           GEN_ARF_ACCUMULATOR, @@ -832,6 +837,26 @@ namespace gbe
>                           GEN_HORIZONTAL_STRIDE_2);
>      }
>  
> +    static INLINE GenRegister packed_ud(uint32_t nr, uint32_t subnr) {
> +      return GenRegister(GEN_GENERAL_REGISTER_FILE,
> +                         nr,
> +                         subnr,
> +                         GEN_TYPE_UD,
> +                         GEN_VERTICAL_STRIDE_8,
> +                         GEN_WIDTH_4,
> +                         GEN_HORIZONTAL_STRIDE_1);
> +    }
> +
> +    static INLINE GenRegister unpacked_ud(uint32_t nr, uint32_t subnr) {
> +      return GenRegister(GEN_GENERAL_REGISTER_FILE,
> +                         nr,
> +                         subnr,
> +                         GEN_TYPE_UD,
> +                         GEN_VERTICAL_STRIDE_8,
> +                         GEN_WIDTH_4,
> +                         GEN_HORIZONTAL_STRIDE_2);
> +    }
> +
>      static INLINE GenRegister mask(uint32_t subnr) {
>        return uw1(GEN_ARCHITECTURE_REGISTER_FILE, GEN_ARF_MASK, subnr);
>      }
> diff --git a/backend/src/llvm/llvm_gen_backend.cpp 
> b/backend/src/llvm/llvm_gen_backend.cpp
> index c8c5484..b5963ad 100644
> --- a/backend/src/llvm/llvm_gen_backend.cpp
> +++ b/backend/src/llvm/llvm_gen_backend.cpp
> @@ -2371,8 +2371,8 @@ namespace gbe
>      // Scalar is easy. We neednot build register tuples
>      if (isScalarType(llvmType) == true) {
>        const ir::Type type = getType(ctx, llvmType);
> -      if(type == ir::TYPE_DOUBLE) // 64bit-float load(store) don't support SIMD16
> -        OCL_SIMD_WIDTH = 8;
> +      //if(type == ir::TYPE_DOUBLE) // 64bit-float load(store) don't support SIMD16
> +      //  OCL_SIMD_WIDTH = 8;
>        const ir::Register values = this->getRegister(llvmValues);
>        if (isLoad)
>          ctx.LOAD(type, ptr, addrSpace, dwAligned, values);
> --
> 1.7.9.5
> 
> _______________________________________________
> Beignet mailing list
> Beignet at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/beignet
> _______________________________________________
> Beignet mailing list
> Beignet at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/beignet