[Beignet] [PATCH 1/3] GBE: Change 64bit integer storage in register

Mon May 26 20:50:59 PDT 2014

Could you rebase this patchset onto the latest git master?
Thanks.

On Thu, May 22, 2014 at 03:06:52PM +0800, Ruiling Song wrote:
> Previously, we stored the low and high halves of each 64-bit value together,
> which required several 32-bit instructions to perform one 64-bit operation.
> Now we simply change the storage layout in the register: the low 32 bits of
> all lanes are stored together, followed by the high 32 bits of all lanes.
> This makes long support cleaner and requires fewer 32-bit instructions.
> 
> Signed-off-by: Ruiling Song <ruiling.song at intel.com>
> ---
>  backend/src/backend/gen_context.cpp        |  226 +++++-----------------------
>  backend/src/backend/gen_encoder.cpp        |   96 +-----------
>  backend/src/backend/gen_encoder.hpp        |    6 +-
>  backend/src/backend/gen_insn_selection.cpp |   83 +++++-----
>  backend/src/backend/gen_reg_allocation.cpp |   33 ++--
>  backend/src/backend/gen_register.hpp       |   25 +--
>  backend/src/llvm/llvm_gen_backend.cpp      |    5 +
>  7 files changed, 124 insertions(+), 350 deletions(-)
> 
> diff --git a/backend/src/backend/gen_context.cpp b/backend/src/backend/gen_context.cpp
> index f4c80e3..25f690a 100644
> --- a/backend/src/backend/gen_context.cpp
> +++ b/backend/src/backend/gen_context.cpp
> @@ -213,17 +213,7 @@ namespace gbe
>        case SEL_OP_LOAD_INT64_IMM: p->LOAD_INT64_IMM(dst, src.value.i64); break;
>        case SEL_OP_CONVI64_TO_I:
>         {
> -        int execWidth = p->curr.execWidth;
> -        GenRegister xsrc = src.bottom_half(), xdst = dst;
> -        p->push();
> -        p->curr.execWidth = 8;
> -        for(int i = 0; i < execWidth/4; i ++) {
> -          p->curr.chooseNib(i);
> -          p->MOV(xdst, xsrc);
> -          xdst = GenRegister::suboffset(xdst, 4);
> -          xsrc = GenRegister::suboffset(xsrc, 4);
> -        }
> -        p->pop();
> +        p->MOV(dst, src.bottom_half());
>          break;
>         }
>        case SEL_OP_BRC:
> @@ -268,28 +258,18 @@ namespace gbe
>          p->MOV_DF(dst, src, tmp);
>          break;
>        case SEL_OP_CONVI_TO_I64: {
> -        GenRegister middle;
> -        if (src.type == GEN_TYPE_B || src.type == GEN_TYPE_D) {
> +        GenRegister middle = src;
> +        if(src.type == GEN_TYPE_B || src.type == GEN_TYPE_W) {
>            middle = tmp;
> -          middle.type = src.is_signed_int() ? GEN_TYPE_D : GEN_TYPE_UD;
> +          middle.type = GEN_TYPE_D;
>            p->MOV(middle, src);
> -        } else {
> -          middle = src;
>          }
> -        int execWidth = p->curr.execWidth;
> -        p->push();
> -        p->curr.execWidth = 8;
> -        for (int nib = 0; nib < execWidth / 4; nib ++) {
> -          p->curr.chooseNib(nib);
> -          p->MOV(dst.bottom_half(), middle);
> -          if(middle.is_signed_int())
> -            p->ASR(dst.top_half(), middle, GenRegister::immud(31));
> -          else
> -            p->MOV(dst.top_half(), GenRegister::immd(0));
> -          dst = GenRegister::suboffset(dst, 4);
> -          middle = GenRegister::suboffset(middle, 4);
> -        }
> -        p->pop();
> +
> +        p->MOV(dst.bottom_half(), middle);
> +        if(src.is_signed_int())
> +          p->ASR(dst.top_half(this->simdWidth), middle, GenRegister::immud(31));
> +        else
> +          p->MOV(dst.top_half(this->simdWidth), GenRegister::immud(0));
>          break;
>        }
>        default:
> @@ -304,8 +284,10 @@ namespace gbe
>      GenRegister tmp = ra->genReg(insn.dst(1));
>      switch (insn.opcode) {
>        case SEL_OP_I64ADD: {
> -        GenRegister x = GenRegister::retype(tmp, GEN_TYPE_UD),
> -                    y = GenRegister::suboffset(x, p->curr.execWidth);
> +        tmp = GenRegister::retype(tmp, GEN_TYPE_UL);
> +        GenRegister x = tmp.bottom_half();
> +        GenRegister y = tmp.top_half(this->simdWidth);
> +
>          loadBottomHalf(x, src0);
>          loadBottomHalf(y, src1);
>          addWithCarry(x, x, y);
> @@ -318,8 +300,10 @@ namespace gbe
>          break;
>        }
>        case SEL_OP_I64SUB: {
> -        GenRegister x = GenRegister::retype(tmp, GEN_TYPE_UD),
> -                    y = GenRegister::suboffset(x, p->curr.execWidth);
> +        tmp = GenRegister::retype(tmp, GEN_TYPE_UL);
> +        GenRegister x = tmp.bottom_half();
> +        GenRegister y = tmp.top_half(this->simdWidth);
> +
>          loadBottomHalf(x, src0);
>          loadBottomHalf(y, src1);
>          subWithBorrow(x, x, y);
> @@ -400,21 +384,8 @@ namespace gbe
>        case SEL_OP_SEL:  p->SEL(dst, src0, src1); break;
>        case SEL_OP_SEL_INT64:
>          {
> -          GenRegister xdst = GenRegister::retype(dst, GEN_TYPE_UL),
> -                      xsrc0 = GenRegister::retype(src0, GEN_TYPE_UL),
> -                      xsrc1 = GenRegister::retype(src1, GEN_TYPE_UL);
> -          int execWidth = p->curr.execWidth;
> -          p->push();
> -          p->curr.execWidth = 8;
> -          for (int nib = 0; nib < execWidth / 4; nib ++) {
> -            p->curr.chooseNib(nib);
> -            p->SEL(xdst.bottom_half(), xsrc0.bottom_half(), xsrc1.bottom_half());
> -            p->SEL(xdst.top_half(), xsrc0.top_half(), xsrc1.top_half());
> -            xdst = GenRegister::suboffset(xdst, 4);
> -            xsrc0 = GenRegister::suboffset(xsrc0, 4);
> -            xsrc1 = GenRegister::suboffset(xsrc1, 4);
> -          }
> -          p->pop();
> +          p->SEL(dst.bottom_half(), src0.bottom_half(), src1.bottom_half());
> +          p->SEL(dst.top_half(this->simdWidth), src0.top_half(this->simdWidth), src1.top_half(this->simdWidth));
>          }
>          break;
>        case SEL_OP_AND:  p->AND(dst, src0, src1, insn.extra.function); break;
> @@ -422,59 +393,20 @@ namespace gbe
>        case SEL_OP_XOR:  p->XOR(dst, src0, src1, insn.extra.function); break;
>        case SEL_OP_I64AND:
>          {
> -          GenRegister xdst = GenRegister::retype(dst, GEN_TYPE_UL),
> -                      xsrc0 = GenRegister::retype(src0, GEN_TYPE_UL),
> -                      xsrc1 = GenRegister::retype(src1, GEN_TYPE_UL);
> -          int execWidth = p->curr.execWidth;
> -          p->push();
> -          p->curr.execWidth = 8;
> -          for (int nib = 0; nib < execWidth / 4; nib ++) {
> -            p->curr.chooseNib(nib);
> -            p->AND(xdst.bottom_half(), xsrc0.bottom_half(), xsrc1.bottom_half());
> -            p->AND(xdst.top_half(), xsrc0.top_half(), xsrc1.top_half());
> -            xdst = GenRegister::suboffset(xdst, 4),
> -            xsrc0 = GenRegister::suboffset(xsrc0, 4),
> -            xsrc1 = GenRegister::suboffset(xsrc1, 4);
> -          }
> -          p->pop();
> +          p->AND(dst.bottom_half(), src0.bottom_half(), src1.bottom_half());
> +          p->AND(dst.top_half(this->simdWidth), src0.top_half(this->simdWidth), src1.top_half(this->simdWidth));
>          }
>          break;
>        case SEL_OP_I64OR:
>          {
> -          GenRegister xdst = GenRegister::retype(dst, GEN_TYPE_UL),
> -                      xsrc0 = GenRegister::retype(src0, GEN_TYPE_UL),
> -                      xsrc1 = GenRegister::retype(src1, GEN_TYPE_UL);
> -          int execWidth = p->curr.execWidth;
> -          p->push();
> -          p->curr.execWidth = 8;
> -          for (int nib = 0; nib < execWidth / 4; nib ++) {
> -            p->curr.chooseNib(nib);
> -            p->OR(xdst.bottom_half(), xsrc0.bottom_half(), xsrc1.bottom_half());
> -            p->OR(xdst.top_half(), xsrc0.top_half(), xsrc1.top_half());
> -            xdst = GenRegister::suboffset(xdst, 4),
> -            xsrc0 = GenRegister::suboffset(xsrc0, 4),
> -            xsrc1 = GenRegister::suboffset(xsrc1, 4);
> -          }
> -          p->pop();
> +          p->OR(dst.bottom_half(), src0.bottom_half(), src1.bottom_half());
> +          p->OR(dst.top_half(this->simdWidth), src0.top_half(this->simdWidth), src1.top_half(this->simdWidth));
>          }
>          break;
>        case SEL_OP_I64XOR:
>          {
> -          GenRegister xdst = GenRegister::retype(dst, GEN_TYPE_UL),
> -                      xsrc0 = GenRegister::retype(src0, GEN_TYPE_UL),
> -                      xsrc1 = GenRegister::retype(src1, GEN_TYPE_UL);
> -          int execWidth = p->curr.execWidth;
> -          p->push();
> -          p->curr.execWidth = 8;
> -          for (int nib = 0; nib < execWidth / 4; nib ++) {
> -            p->curr.chooseNib(nib);
> -            p->XOR(xdst.bottom_half(), xsrc0.bottom_half(), xsrc1.bottom_half());
> -            p->XOR(xdst.top_half(), xsrc0.top_half(), xsrc1.top_half());
> -            xdst = GenRegister::suboffset(xdst, 4),
> -            xsrc0 = GenRegister::suboffset(xsrc0, 4),
> -            xsrc1 = GenRegister::suboffset(xsrc1, 4);
> -          }
> -          p->pop();
> +          p->XOR(dst.bottom_half(), src0.bottom_half(), src1.bottom_half());
> +          p->XOR(dst.top_half(this->simdWidth), src0.top_half(this->simdWidth), src1.top_half(this->simdWidth));
>          }
>          break;
>        case SEL_OP_SHR:  p->SHR(dst, src0, src1); break;
> @@ -492,18 +424,8 @@ namespace gbe
>            GenRegister xdst = GenRegister::retype(dst, GEN_TYPE_UL),
>                        xsrc0 = GenRegister::retype(src0, GEN_TYPE_UL),
>                        xsrc1 = GenRegister::retype(src1, GEN_TYPE_UL);
> -          int execWidth = p->curr.execWidth;
> -          p->push();
> -          p->curr.execWidth = 8;
> -          for (int nib = 0; nib < execWidth / 4; nib ++) {
> -            p->curr.chooseNib(nib);
> -            p->MOV(xdst.top_half(), xsrc0.bottom_half());
> -            p->MOV(xdst.bottom_half(), xsrc1.bottom_half());
> -            xdst = GenRegister::suboffset(xdst, 4);
> -            xsrc0 = GenRegister::suboffset(xsrc0, 4);
> -            xsrc1 = GenRegister::suboffset(xsrc1, 4);
> -          }
> -          p->pop();
> +          p->MOV(xdst.top_half(this->simdWidth), xsrc0.bottom_half());
> +          p->MOV(xdst.bottom_half(), xsrc1.bottom_half());
>          }
>          break;
>        default: NOT_IMPLEMENTED;
> @@ -511,16 +433,10 @@ namespace gbe
>    }
>  
>    void GenContext::collectShifter(GenRegister dest, GenRegister src) {
> -    int execWidth = p->curr.execWidth;
>      p->push();
> -    p->curr.predicate = GEN_PREDICATE_NONE;
> -    p->curr.noMask = 1;
> -    p->curr.execWidth = 8;
> -    for (int nib = 0; nib < execWidth / 4; nib ++) {
> -      p->AND(dest, src.bottom_half(), GenRegister::immud(63));
> -      dest = GenRegister::suboffset(dest, 4);
> -      src = GenRegister::suboffset(src, 4);
> -    }
> +      p->curr.predicate = GEN_PREDICATE_NONE;
> +      p->curr.noMask = 1;
> +    p->AND(dest, src.bottom_half(), GenRegister::immud(63));
>      p->pop();
>    }
>  
> @@ -1267,73 +1183,19 @@ namespace gbe
>    }
>  
>    void GenContext::loadTopHalf(GenRegister dest, GenRegister src) {
> -    int execWidth = p->curr.execWidth;
> -    src = src.top_half();
> -    p->push();
> -    p->curr.predicate = GEN_PREDICATE_NONE;
> -    p->curr.noMask = 1;
> -    p->curr.execWidth = 8;
> -    p->MOV(dest, src);
> -    p->MOV(GenRegister::suboffset(dest, 4), GenRegister::suboffset(src, 4));
> -    if (execWidth == 16) {
> -      p->MOV(GenRegister::suboffset(dest, 8), GenRegister::suboffset(src, 8));
> -      p->MOV(GenRegister::suboffset(dest, 12), GenRegister::suboffset(src, 12));
> -    }
> -    p->pop();
> +    p->MOV(dest, src.top_half(this->simdWidth));
>    }
>  
>    void GenContext::storeTopHalf(GenRegister dest, GenRegister src) {
> -    int execWidth = p->curr.execWidth;
> -    dest = dest.top_half();
> -    p->push();
> -    p->curr.noMask = 0;
> -    p->curr.execWidth = 8;
> -    p->MOV(dest, src);
> -    p->curr.nibControl = 1;
> -    p->MOV(GenRegister::suboffset(dest, 4), GenRegister::suboffset(src, 4));
> -    if (execWidth == 16) {
> -      p->curr.quarterControl = 1;
> -      p->curr.nibControl = 0;
> -      p->MOV(GenRegister::suboffset(dest, 8), GenRegister::suboffset(src, 8));
> -      p->curr.nibControl = 1;
> -      p->MOV(GenRegister::suboffset(dest, 12), GenRegister::suboffset(src, 12));
> -    }
> -    p->pop();
> +    p->MOV(dest.top_half(this->simdWidth), src);
>    }
>  
>    void GenContext::loadBottomHalf(GenRegister dest, GenRegister src) {
> -    int execWidth = p->curr.execWidth;
> -    src = src.bottom_half();
> -    p->push();
> -    p->curr.predicate = GEN_PREDICATE_NONE;
> -    p->curr.noMask = 1;
> -    p->curr.execWidth = 8;
> -    p->MOV(dest, src);
> -    p->MOV(GenRegister::suboffset(dest, 4), GenRegister::suboffset(src, 4));
> -    if (execWidth == 16) {
> -      p->MOV(GenRegister::suboffset(dest, 8), GenRegister::suboffset(src, 8));
> -      p->MOV(GenRegister::suboffset(dest, 12), GenRegister::suboffset(src, 12));
> -    }
> -    p->pop();
> +    p->MOV(dest, src.bottom_half());
>    }
>  
>    void GenContext::storeBottomHalf(GenRegister dest, GenRegister src) {
> -    int execWidth = p->curr.execWidth;
> -    dest = dest.bottom_half();
> -    p->push();
> -    p->curr.execWidth = 8;
> -    p->curr.noMask = 0;
> -    p->MOV(dest, src);
> -    p->curr.nibControl = 1;
> -    p->MOV(GenRegister::suboffset(dest, 4), GenRegister::suboffset(src, 4));
> -    if (execWidth == 16) {
> -      p->curr.quarterControl = 1;
> -      p->curr.nibControl = 0;
> -      p->MOV(GenRegister::suboffset(dest, 8), GenRegister::suboffset(src, 8));
> -      p->curr.nibControl = 1;
> -      p->MOV(GenRegister::suboffset(dest, 12), GenRegister::suboffset(src, 12));
> -    }
> -    p->pop();
> +    p->MOV(dest.bottom_half(), src);
>    }
>  
>    void GenContext::addWithCarry(GenRegister dest, GenRegister src0, GenRegister src1) {
> @@ -1770,18 +1632,12 @@ namespace gbe
>      p->pop();
>    }
>  
> -  //  For SIMD8, we allocate 2*elemNum temporary registers from dst(0), and
> -  //  then follow the real destination registers.
> -  //  For SIMD16, we allocate elemNum temporary registers from dst(0).
>    void GenContext::emitRead64Instruction(const SelectionInstruction &insn) {
>      const uint32_t elemNum = insn.extra.elem;
> -    const uint32_t tmpRegSize = (p->curr.execWidth == 8) ? elemNum * 2 : elemNum;
> -    const GenRegister tempAddr = ra->genReg(insn.dst(tmpRegSize + 1));
> -    const GenRegister dst = ra->genReg(insn.dst(tmpRegSize));
> -    const GenRegister tmp = ra->genReg(insn.dst(0));
> +    const GenRegister dst = ra->genReg(insn.dst(0));
>      const GenRegister src = ra->genReg(insn.src(0));
>      const uint32_t bti = insn.extra.function;
> -    p->READ64(dst, tmp, tempAddr, src, bti, elemNum);
> +    p->UNTYPED_READ(dst, src, bti, elemNum*2);
>    }
>  
>    void GenContext::emitUntypedReadInstruction(const SelectionInstruction &insn) {
> @@ -1792,17 +1648,11 @@ namespace gbe
>      p->UNTYPED_READ(dst, src, bti, elemNum);
>    }
>  
> -  //  For SIMD8, we allocate 2*elemNum temporary registers from dst(0), and
> -  //  then follow the real destination registers.
> -  //  For SIMD16, we allocate elemNum temporary registers from dst(0).
>    void GenContext::emitWrite64Instruction(const SelectionInstruction &insn) {
>      const GenRegister src = ra->genReg(insn.dst(0));
>      const uint32_t elemNum = insn.extra.elem;
> -    const GenRegister addr = ra->genReg(insn.src(0)); //tmpRegSize + 1));
> -    const GenRegister data = ra->genReg(insn.src(1));
>      const uint32_t bti = insn.extra.function;
> -    p->MOV(src, addr);
> -    p->WRITE64(src, data, bti, elemNum, sel->isScalarReg(data.reg()));
> +    p->UNTYPED_WRITE(src, bti, elemNum*2);
>    }
>  
>    void GenContext::emitUntypedWriteInstruction(const SelectionInstruction &insn) {
> diff --git a/backend/src/backend/gen_encoder.cpp b/backend/src/backend/gen_encoder.cpp
> index 7078dcb..7ecb4c4 100644
> --- a/backend/src/backend/gen_encoder.cpp
> +++ b/backend/src/backend/gen_encoder.cpp
> @@ -216,6 +216,7 @@ namespace gbe
>    GenEncoder::GenEncoder(uint32_t simdWidth, uint32_t gen, uint32_t deviceID, int jump_width) :
>      stateNum(0), gen(gen), deviceID(deviceID), jump_width(jump_width)
>    {
> +    this->simdWidth = simdWidth;
>      this->curr.execWidth = simdWidth;
>      this->curr.quarterControl = GEN_COMPRESSION_Q1;
>      this->curr.noMask = 0;
> @@ -370,76 +371,6 @@ namespace gbe
>      0
>    };
>  
> -  void GenEncoder::READ64(GenRegister dst, GenRegister tmp, GenRegister addr, GenRegister src, uint32_t bti, uint32_t elemNum) {
> -    GenRegister dst32 = GenRegister::retype(dst, GEN_TYPE_UD);
> -    src = GenRegister::retype(src, GEN_TYPE_UD);
> -    addr = GenRegister::retype(addr, GEN_TYPE_UD);
> -    tmp = GenRegister::retype(tmp, GEN_TYPE_UD);
> -    uint32_t originSimdWidth = curr.execWidth;
> -    uint32_t originPredicate = curr.predicate;
> -    uint32_t originMask = curr.noMask;
> -    push();
> -    for ( uint32_t channels = 0, currQuarter = GEN_COMPRESSION_Q1;
> -          channels < originSimdWidth; channels += 8, currQuarter++) {
> -      curr.predicate = GEN_PREDICATE_NONE;
> -      curr.noMask = GEN_MASK_DISABLE;
> -      curr.execWidth = 8;
> -      /* XXX The following instruction is illegal, but it works as SIMD 1*4 mode
> -         which is what we want here. */
> -      MOV(GenRegister::h2(addr), GenRegister::suboffset(src, channels));
> -      ADD(GenRegister::h2(GenRegister::suboffset(addr, 1)), GenRegister::suboffset(src, channels), GenRegister::immd(4));
> -      MOV(GenRegister::h2(GenRegister::suboffset(addr, 8)), GenRegister::suboffset(src, channels + 4));
> -      ADD(GenRegister::h2(GenRegister::suboffset(addr, 9)), GenRegister::suboffset(src, channels + 4), GenRegister::immd(4));
> -      // Let's use SIMD16 to read all bytes for 8 doubles data at one time.
> -      curr.execWidth = 16;
> -      this->UNTYPED_READ(tmp, addr, bti, elemNum);
> -      if (originSimdWidth == 16)
> -        curr.quarterControl = currQuarter;
> -      curr.predicate = originPredicate;
> -      curr.noMask = originMask;
> -      // Back to simd8 for correct predication flag.
> -      curr.execWidth = 8;
> -      MOV(GenRegister::retype(GenRegister::suboffset(dst32, channels * 2), GEN_TYPE_DF), GenRegister::retype(tmp, GEN_TYPE_DF));
> -    }
> -    pop();
> -  }
> -
> -  void GenEncoder::WRITE64(GenRegister msg, GenRegister data, uint32_t bti, uint32_t elemNum, bool is_scalar) {
> -    GenRegister data32 = GenRegister::retype(data, GEN_TYPE_UD);
> -    GenRegister unpacked;
> -    msg = GenRegister::retype(msg, GEN_TYPE_UD);
> -    int originSimdWidth = curr.execWidth;
> -    int originPredicate = curr.predicate;
> -    int originMask = curr.noMask;
> -    push();
> -    for (uint32_t half = 0; half < 2; half++) {
> -      curr.predicate = GEN_PREDICATE_NONE;
> -      curr.noMask = GEN_MASK_DISABLE;
> -      curr.execWidth = 8;
> -      if (is_scalar) {
> -        unpacked = data32;
> -        unpacked.subnr += half * 4;
> -      } else
> -        unpacked = GenRegister::unpacked_ud(data32.nr, data32.subnr + half);
> -      MOV(GenRegister::suboffset(msg, originSimdWidth), unpacked);
> -      if (originSimdWidth == 16) {
> -        if (is_scalar) {
> -          unpacked = data32;
> -          unpacked.subnr += half * 4;
> -        } else
> -          unpacked = GenRegister::unpacked_ud(data32.nr + 2, data32.subnr + half);
> -        MOV(GenRegister::suboffset(msg, originSimdWidth + 8), unpacked);
> -        curr.execWidth = 16;
> -      }
> -      if (half == 1)
> -        ADD(GenRegister::retype(msg, GEN_TYPE_UD), GenRegister::retype(msg, GEN_TYPE_UD), GenRegister::immd(4));
> -      curr.predicate = originPredicate;
> -      curr.noMask = originMask;
> -      this->UNTYPED_WRITE(msg, bti, elemNum);
> -    }
> -    pop();
> -  }
> -
>    void GenEncoder::UNTYPED_READ(GenRegister dst, GenRegister src, uint32_t bti, uint32_t elemNum) {
>      GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
>      assert(elemNum >= 1 || elemNum <= 4);
> @@ -672,17 +603,8 @@ namespace gbe
>       if (dst.isdf() && src.isdf()) {
>         handleDouble(p, opcode, dst, src);
>       } else if (dst.isint64() && src.isint64()) { // handle int64
> -       int execWidth = p->curr.execWidth;
> -       p->push();
> -       p->curr.execWidth = 8;
> -       for (int nib = 0; nib < execWidth / 4; nib ++) {
> -         p->curr.chooseNib(nib);
> -         p->MOV(dst.bottom_half(), src.bottom_half());
> -         p->MOV(dst.top_half(), src.top_half());
> -         dst = GenRegister::suboffset(dst, 4);
> -         src = GenRegister::suboffset(src, 4);
> -       }
> -       p->pop();
> +       p->MOV(dst.bottom_half(), src.bottom_half());
> +       p->MOV(dst.top_half(p->simdWidth), src.top_half(p->simdWidth));
>       } else if (needToSplitAlu1(p, dst, src) == false) {
>        if(compactAlu1(p, opcode, dst, src, condition, false))
>          return;
> @@ -915,16 +837,8 @@ namespace gbe
>  
>    void GenEncoder::LOAD_INT64_IMM(GenRegister dest, int64_t value) {
>      GenRegister u0 = GenRegister::immd((int)value), u1 = GenRegister::immd(value >> 32);
> -    int execWidth = curr.execWidth;
> -    push();
> -    curr.execWidth = 8;
> -    for(int nib = 0; nib < execWidth/4; nib ++) {
> -      curr.chooseNib(nib);
> -      MOV(dest.top_half(), u1);
> -      MOV(dest.bottom_half(), u0);
> -      dest = GenRegister::suboffset(dest, 4);
> -    }
> -    pop();
> +    MOV(dest.bottom_half(), u0);
> +    MOV(dest.top_half(this->simdWidth), u1);
>    }
>  
>    void GenEncoder::MOV_DF(GenRegister dest, GenRegister src0, GenRegister r) {
> diff --git a/backend/src/backend/gen_encoder.hpp b/backend/src/backend/gen_encoder.hpp
> index e0bb4cc..627a311 100644
> --- a/backend/src/backend/gen_encoder.hpp
> +++ b/backend/src/backend/gen_encoder.hpp
> @@ -90,6 +90,8 @@ namespace gbe
>      uint32_t deviceID;
>      /*! The constant for jump. */
>      const int jump_width;
> +    /*! simd width for this codegen */
> +    uint32_t simdWidth;
>      ////////////////////////////////////////////////////////////////////////
>      // Encoding functions
>      ////////////////////////////////////////////////////////////////////////
> @@ -168,10 +170,6 @@ namespace gbe
>      void WAIT(void);
>      /*! Atomic instructions */
>      virtual void ATOMIC(GenRegister dst, uint32_t function, GenRegister src, uint32_t bti, uint32_t srcNum);
> -    /*! Read 64-bits float/int arrays */
> -    void READ64(GenRegister dst, GenRegister tmp, GenRegister addr, GenRegister src, uint32_t bti, uint32_t elemNum);
> -    /*! Write 64-bits float/int arrays */
> -    void WRITE64(GenRegister src, GenRegister data, uint32_t bti, uint32_t elemNum, bool is_scalar);
>      /*! Untyped read (upto 4 channels) */
>      virtual void UNTYPED_READ(GenRegister dst, GenRegister src, uint32_t bti, uint32_t elemNum);
>      /*! Untyped write (upto 4 channels) */
> diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp
> index 2ab3aae..3f7154f 100644
> --- a/backend/src/backend/gen_insn_selection.cpp
> +++ b/backend/src/backend/gen_insn_selection.cpp
> @@ -536,9 +536,9 @@ namespace gbe
>      /*! Atomic instruction */
>      void ATOMIC(Reg dst, uint32_t function, uint32_t srcNum, Reg src0, Reg src1, Reg src2, uint32_t bti);
>      /*! Read 64 bits float/int array */
> -    void READ64(Reg addr, Reg tempAddr, const GenRegister *dst, uint32_t elemNum, uint32_t valueNum, uint32_t bti);
> +    void READ64(Reg addr, const GenRegister *dst, uint32_t elemNum, uint32_t bti);
>      /*! Write 64 bits float/int array */
> -    void WRITE64(Reg addr, const GenRegister *src, uint32_t srcNum, const GenRegister *dst, uint32_t dstNum, uint32_t bti);
> +    void WRITE64(Reg addr, const GenRegister *src, uint32_t srcNum, uint32_t bti);
>      /*! Untyped read (up to 4 elements) */
>      void UNTYPED_READ(Reg addr, const GenRegister *dst, uint32_t elemNum, uint32_t bti);
>      /*! Untyped write (up to 4 elements) */
> @@ -1033,34 +1033,27 @@ namespace gbe
>    void Selection::Opaque::NOP(void) { this->appendInsn(SEL_OP_NOP, 0, 0); }
>    void Selection::Opaque::WAIT(void) { this->appendInsn(SEL_OP_WAIT, 0, 0); }
>  
> -  /* elemNum contains all the temporary register and the
> -     real destination registers.*/
>    void Selection::Opaque::READ64(Reg addr,
> -                                 Reg tempAddr,
>                                   const GenRegister *dst,
>                                   uint32_t elemNum,
> -                                 uint32_t valueNum,
>                                   uint32_t bti)
>    {
> -    SelectionInstruction *insn = this->appendInsn(SEL_OP_READ64, elemNum + 1, 1);
> +    SelectionInstruction *insn = this->appendInsn(SEL_OP_READ64, elemNum, 1);
>      SelectionVector *srcVector = this->appendVector();
>      SelectionVector *dstVector = this->appendVector();
>  
>      // Regular instruction to encode
>      for (uint32_t elemID = 0; elemID < elemNum; ++elemID)
>        insn->dst(elemID) = dst[elemID];
> -    /* temporary addr register is to be modified, set it to dst registers.*/
> -    insn->dst(elemNum) = tempAddr;
> +
>      insn->src(0) = addr;
>      insn->extra.function = bti;
> -    insn->extra.elem = valueNum;
> +    insn->extra.elem = elemNum;
>  
> -    // Only the temporary registers need contiguous allocation
> -    dstVector->regNum = elemNum - valueNum;
> +    dstVector->regNum = elemNum;
>      dstVector->isSrc = 0;
>      dstVector->reg = &insn->dst(0);
>  
> -    // Source cannot be scalar (yet)
>      srcVector->regNum = 1;
>      srcVector->isSrc = 1;
>      srcVector->reg = &insn->src(0);
> @@ -1087,36 +1080,30 @@ namespace gbe
>      dstVector->regNum = elemNum;
>      dstVector->isSrc = 0;
>      dstVector->reg = &insn->dst(0);
> -    // Source cannot be scalar (yet)
> +
>      srcVector->regNum = 1;
>      srcVector->isSrc = 1;
>      srcVector->reg = &insn->src(0);
>    }
>  
> -  /* elemNum contains all the temporary register and the
> -     real data registers.*/
>    void Selection::Opaque::WRITE64(Reg addr,
>                                    const GenRegister *src,
>                                    uint32_t srcNum,
> -                                  const GenRegister *dst,
> -                                  uint32_t dstNum,
>                                    uint32_t bti)
>    {
> -    SelectionInstruction *insn = this->appendInsn(SEL_OP_WRITE64, dstNum, srcNum + 1);
> +    SelectionInstruction *insn = this->appendInsn(SEL_OP_WRITE64, 0, srcNum + 1);
>      SelectionVector *vector = this->appendVector();
>  
>      // Regular instruction to encode
>      insn->src(0) = addr;
>      for (uint32_t elemID = 0; elemID < srcNum; ++elemID)
>        insn->src(elemID + 1) = src[elemID];
> -    for (uint32_t elemID = 0; elemID < dstNum; ++elemID)
> -      insn->dst(elemID) = dst[elemID];
> +
>      insn->extra.function = bti;
>      insn->extra.elem = srcNum;
>  
> -    // Only the addr + temporary registers need to be contiguous.
> -    vector->regNum = dstNum;
> -    vector->reg = &insn->dst(0);
> +    vector->regNum = srcNum + 1;
> +    vector->reg = &insn->src(0);
>      vector->isSrc = 1;
>    }
>  
> @@ -2643,18 +2630,13 @@ namespace gbe
>      {
>        using namespace ir;
>        const uint32_t valueNum = insn.getValueNum();
> -      uint32_t dstID;
>        /* XXX support scalar only right now. */
>        GBE_ASSERT(valueNum == 1);
>  
> -      // The first 16 DWORD register space is for temporary usage at encode stage.
> -      uint32_t tmpRegNum = (sel.ctx.getSimdWidth() == 8) ? valueNum * 2 : valueNum;
> -      GenRegister dst[valueNum + tmpRegNum];
> -      for (dstID = 0; dstID < tmpRegNum ; ++dstID)
> -        dst[dstID] = sel.selReg(sel.reg(FAMILY_DWORD));
> -      for ( uint32_t valueID = 0; valueID < valueNum; ++dstID, ++valueID)
> -        dst[dstID] = sel.selReg(insn.getValue(valueID), ir::TYPE_U64);
> -      sel.READ64(addr, sel.selReg(sel.reg(FAMILY_QWORD), ir::TYPE_U64), dst, valueNum + tmpRegNum, valueNum, bti);
> +      GenRegister dst[valueNum];
> +      for ( uint32_t dstID = 0; dstID < valueNum; ++dstID)
> +        dst[dstID] = sel.selReg(insn.getValue(dstID), ir::TYPE_U64);
> +      sel.READ64(addr, dst, valueNum, bti);
>      }
>  
>      void emitByteGather(Selection::Opaque &sel,
> @@ -2803,22 +2785,14 @@ namespace gbe
>      {
>        using namespace ir;
>        const uint32_t valueNum = insn.getValueNum();
> -      uint32_t srcID;
>        /* XXX support scalar only right now. */
>        GBE_ASSERT(valueNum == 1);
> -      addr = GenRegister::retype(addr, GEN_TYPE_F);
> -      // The first 16 DWORD register space is for temporary usage at encode stage.
> -      uint32_t tmpRegNum = (sel.ctx.getSimdWidth() == 8) ? valueNum * 2 : valueNum;
> +      addr = GenRegister::retype(addr, GEN_TYPE_UD);
>        GenRegister src[valueNum];
> -      GenRegister dst[tmpRegNum + 1];
> -      /* dst 0 is for the temporary address register. */
> -      dst[0] = sel.selReg(sel.reg(FAMILY_DWORD));
> -      for (srcID = 0; srcID < tmpRegNum; ++srcID)
> -        dst[srcID + 1] = sel.selReg(sel.reg(FAMILY_DWORD));
>  
>        for (uint32_t valueID = 0; valueID < valueNum; ++valueID)
>          src[valueID] = sel.selReg(insn.getValue(valueID), ir::TYPE_U64);
> -      sel.WRITE64(addr, src, valueNum, dst, tmpRegNum + 1, bti);
> +      sel.WRITE64(addr, src, valueNum, bti);
>      }
>  
>      void emitByteScatter(Selection::Opaque &sel,
> @@ -3009,6 +2983,11 @@ namespace gbe
>          narrowNum = srcNum;
>          narrowDst = 0;
>        }
> +      // As we store long/ulong low/high part separately,
> +      // we need to deal with it separately, we need to change it back again
> +      // when hardware support native long type.
> +      const bool isInt64 = (srcType == TYPE_S64 || srcType == TYPE_U64 || dstType == TYPE_S64 || dstType == TYPE_U64);
> +      const int simdWidth = sel.curr.execWidth;
>  
>        for(int i = 0; i < narrowNum; i++, index++) {
>          GenRegister narrowReg, wideReg;
> @@ -3030,16 +3009,26 @@ namespace gbe
>              GBE_ASSERT(multiple == 8);
>            }
>          }
> -        if(index % multiple) {
> +
> +        if(!isInt64 && index % multiple) {
>            wideReg = GenRegister::offset(wideReg, 0, (index % multiple) * typeSize(wideReg.type));
>            wideReg.subphysical = 1;
>          }
> +        if(isInt64) {
> +          // offset to next half
> +          wideReg.subphysical = 1;
> +          if(i >= multiple/2)
> +            wideReg = GenRegister::offset(wideReg, 0, sel.isScalarReg(wideReg.reg()) ? 4 : simdWidth*4);
> +          if(index % (multiple/2))
> +            wideReg = GenRegister::offset(wideReg, 0, (index % (multiple/2)) * typeSize(wideReg.type));
> +        }
> +
>          GenRegister xdst = narrowDst ? narrowReg : wideReg;
>          GenRegister xsrc = narrowDst ? wideReg : narrowReg;
>  
> -        if((srcType == TYPE_S64 || srcType == TYPE_U64 || srcType == TYPE_DOUBLE) ||
> -           (dstType == TYPE_S64 || dstType == TYPE_U64 || dstType == TYPE_DOUBLE)) {
> -          const int simdWidth = sel.curr.execWidth;
> +        if(isInt64) {
> +          sel.MOV(xdst, xsrc);
> +        } else if(srcType == TYPE_DOUBLE || dstType == TYPE_DOUBLE) {
>            sel.push();
>              sel.curr.execWidth = 8;
>              xdst.subphysical = 1;
> diff --git a/backend/src/backend/gen_reg_allocation.cpp b/backend/src/backend/gen_reg_allocation.cpp
> index 880a267..bab65e5 100644
> --- a/backend/src/backend/gen_reg_allocation.cpp
> +++ b/backend/src/backend/gen_reg_allocation.cpp
> @@ -661,25 +661,38 @@ namespace gbe
>             != spilledRegs.end())
>            continue;
>  
> -        uint32_t alignment;
> -        ir::RegisterFamily family;
> -        getRegAttrib(reg, alignment, &family);
> -        const uint32_t size = vector->regNum * alignment;
> -        const uint32_t grfOffset = allocateReg(interval, size, alignment);
> +        uint32_t alignment, maxAlignment = 0;
> +        uint32_t size = 0;
> +        for (uint32_t regID = 0; regID < vector->regNum; ++regID) {
> +          getRegAttrib(reg, alignment, NULL);
> +          if(alignment > maxAlignment)
> +            maxAlignment = alignment;
> +          size += alignment;
> +        }
> +
> +        const uint32_t grfOffset = allocateReg(interval, size, maxAlignment);
>          if(grfOffset == 0) {
> -          GBE_ASSERT(!(reservedReg && family != ir::FAMILY_DWORD));
> +          ir::RegisterFamily family;
>            for(int i = vector->regNum-1; i >= 0; i--) {
> +            family = ctx.sel->getRegisterFamily(vector->reg[i].reg());
> +            // we currently only support DWORD/QWORD spill
> +            if(family != ir::FAMILY_DWORD && family != ir::FAMILY_QWORD)
> +              return false;
>              if (!spillReg(vector->reg[i].reg()))
>                return false;
>            }
>            continue;
>          }
> +        uint32_t subOffset = 0;
>          for (uint32_t regID = 0; regID < vector->regNum; ++regID) {
>            const ir::Register reg = vector->reg[regID].reg();
> -          GBE_ASSERT(RA.contains(reg) == false
> -                     && ctx.sel->getRegisterData(reg).family == family);
> -          insertNewReg(reg, grfOffset + alignment * regID, true);
> -          ctx.splitBlock(grfOffset, alignment * regID);  //splitBlock will not split if regID == 0
> +          GBE_ASSERT(RA.contains(reg) == false);
> +          getRegAttrib(reg, alignment, NULL);
> +          // check that all sub-registers are aligned correctly
> +          GBE_ASSERT((grfOffset + subOffset) % alignment == 0 || (grfOffset + subOffset) % GEN_REG_SIZE == 0);
> +          insertNewReg(reg, grfOffset + subOffset, true);
> +          ctx.splitBlock(grfOffset, subOffset);  //splitBlock will not split if regID == 0
> +          subOffset += alignment;
>          }
>        }
>        // Case 2: This is a regular scalar register, allocate it alone
> diff --git a/backend/src/backend/gen_register.hpp b/backend/src/backend/gen_register.hpp
> index 50a6dcd..3967e6e 100644
> --- a/backend/src/backend/gen_register.hpp
> +++ b/backend/src/backend/gen_register.hpp
> @@ -301,20 +301,25 @@ namespace gbe
>        return false;
>      }
>  
> -    INLINE GenRegister top_half(void) const {
> -      GenRegister r = bottom_half();
> -      r.subnr += 4;
> -      r.nr += r.subnr / 32;
> -      r.subnr %= 32;
> -      return r;
> +    INLINE GenRegister top_half(int simdWidth) const {
> +      GBE_ASSERT(isint64());
> +      GenRegister reg = retype(*this, type == GEN_TYPE_UL ? GEN_TYPE_UD : GEN_TYPE_D);
> +
> +      if (reg.hstride != GEN_HORIZONTAL_STRIDE_0) {
> +        reg.subnr += simdWidth * typeSize(reg.type) * hstride_size(reg);
> +        reg.nr += reg.subnr / 32;
> +        reg.subnr %= 32;
> +      } else {
> +        reg.subnr += typeSize(reg.type);
> +        reg.nr += reg.subnr/32;
> +        reg.subnr %= 32;
> +      }
> +      return reg;
>      }
>  
>      INLINE GenRegister bottom_half(void) const {
>        GBE_ASSERT(isint64());
> -      GenRegister r = h2(*this);
> -      r.type = type == GEN_TYPE_UL ? GEN_TYPE_UD : GEN_TYPE_D;
> -      if(r.vstride != GEN_VERTICAL_STRIDE_0)
> -       r.vstride = GEN_VERTICAL_STRIDE_16;
> +      GenRegister r = retype(*this, type == GEN_TYPE_UL ? GEN_TYPE_UD : GEN_TYPE_D);
>        return r;
>      }
>  
> diff --git a/backend/src/llvm/llvm_gen_backend.cpp b/backend/src/llvm/llvm_gen_backend.cpp
> index 82429d0..bd111b0 100644
> --- a/backend/src/llvm/llvm_gen_backend.cpp
> +++ b/backend/src/llvm/llvm_gen_backend.cpp
> @@ -1868,6 +1868,11 @@ namespace gbe
>          uint32_t srcElemNum = 0, dstElemNum = 0 ;
>          ir::Type srcType = getVectorInfo(ctx, srcValue->getType(), srcValue, srcElemNum);
>          ir::Type dstType = getVectorInfo(ctx, dstValue->getType(), dstValue, dstElemNum);
> +        // Long and double are not compatible in register storage, and
> +        // double is not supported yet, so simply put an assert here.
> +        GBE_ASSERT(!(srcType == ir::TYPE_S64 && dstType == ir::TYPE_DOUBLE));
> +        GBE_ASSERT(!(dstType == ir::TYPE_S64 && srcType == ir::TYPE_DOUBLE));
> +
>          if(srcElemNum > 1 || dstElemNum > 1) {
>            // Build the tuple data in the vector
>            vector<ir::Register> srcTupleData;
> -- 
> 1.7.10.4
> 
> _______________________________________________
> Beignet mailing list
> Beignet at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/beignet