[Beignet] [PATCH 1/2] padding enough temporary registers in 64-bit IO

Zhigang Gong zhigang.gong at linux.intel.com
Mon Jul 29 20:43:01 PDT 2013


On Wed, Jul 24, 2013 at 10:25:39AM +0800, Homer Hsing wrote:
> Temporary registers are required in 64-bit data type IO.
> 
> Previously padded QWORD register was allocated as DWORD.
This indicates a bug at the register allocation functions.
And I took a look at the code, found the vector allocation
function assumes that all the element are DWORD. We should
fix it there.

> Here split QWORD into DWORD regs.
> 
> In 64-bit reading, temporary registers is next to the dest reg.
> 
> In 64-bit writing, temporary registers is 2*GEN_REG_SIZE
> away from the message reg.
> ---
>  backend/src/backend/gen_encoder.cpp        |  6 +++---
>  backend/src/backend/gen_insn_selection.cpp | 18 ++++++++++++------
>  2 files changed, 15 insertions(+), 9 deletions(-)
> 
> diff --git a/backend/src/backend/gen_encoder.cpp b/backend/src/backend/gen_encoder.cpp
> index f84c6dd..ee58a7d 100644
> --- a/backend/src/backend/gen_encoder.cpp
> +++ b/backend/src/backend/gen_encoder.cpp
> @@ -367,10 +367,10 @@ namespace gbe
>  
>    void GenEncoder::READ_FLOAT64(GenRegister dst, GenRegister src, uint32_t bti, uint32_t elemNum) {
>      int w = curr.execWidth;
> -    dst = GenRegister::h2(dst);
>      dst.type = GEN_TYPE_UD;
>      src.type = GEN_TYPE_UD;
> -    GenRegister r = GenRegister::retype(GenRegister::suboffset(src, w*2), GEN_TYPE_UD);
> +    GenRegister r = GenRegister::retype(GenRegister::suboffset(dst, w*2), GEN_TYPE_UD);
> +    dst = GenRegister::h2(dst);
>      GenRegister imm4 = GenRegister::immud(4);
>      GenInstruction *insn;
>      insn = next(GEN_OPCODE_SEND);
> @@ -410,7 +410,7 @@ namespace gbe
>  
>    void GenEncoder::WRITE_FLOAT64(GenRegister msg, uint32_t bti, uint32_t elemNum) {
>      int w = curr.execWidth;
> -    GenRegister r = GenRegister::retype(GenRegister::suboffset(msg, w*3), GEN_TYPE_UD);
> +    GenRegister r = GenRegister::retype(GenRegister::suboffset(msg, w*4), GEN_TYPE_UD);
>      r.type = GEN_TYPE_UD;
>      GenRegister hdr = GenRegister::h2(r);
>      GenRegister src = GenRegister::ud16grf(msg.nr + w / 8, 0);
> diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp
> index d4be8bf..e3470ea 100644
> --- a/backend/src/backend/gen_insn_selection.cpp
> +++ b/backend/src/backend/gen_insn_selection.cpp
> @@ -1874,9 +1874,12 @@ namespace gbe
>        vector<GenRegister> dst(valueNum);
>        for (uint32_t dstID = 0; dstID < valueNum; ++dstID)
>          dst[dstID] = GenRegister::retype(sel.selReg(insn.getValue(dstID)), GEN_TYPE_F);
Why do you set the value register's type as GEN_TYPE_F? Should we use GEN_TYPE_DF here?
otherwise, at register allocation stage, it will not allocate enough register space for
the value.

> -      dst.push_back(sel.selReg(sel.reg(FAMILY_QWORD)));
> -      if (sel.ctx.getSimdWidth() == 16)
> -        dst.push_back(sel.selReg(sel.reg(FAMILY_QWORD)));
> +      dst.push_back(sel.selReg(sel.reg(FAMILY_DWORD)));
> +      dst.push_back(sel.selReg(sel.reg(FAMILY_DWORD)));
> +      if (sel.ctx.getSimdWidth() == 16) {
> +        dst.push_back(sel.selReg(sel.reg(FAMILY_DWORD)));
> +        dst.push_back(sel.selReg(sel.reg(FAMILY_DWORD)));
> +      }
>        sel.READ_FLOAT64(addr, dst.data(), dst.size(), bti);
>      }
>  
> @@ -1976,9 +1979,12 @@ namespace gbe
>        addr = GenRegister::retype(sel.selReg(insn.getSrc(addrID)), GEN_TYPE_F);
>        for (uint32_t valueID = 0; valueID < valueNum; ++valueID)
>          value[valueID] = GenRegister::retype(sel.selReg(insn.getValue(valueID)), GEN_TYPE_F);
The same comment as above, should we use GEN_TYPE_DF?
> -      value.push_back(sel.selReg(sel.reg(FAMILY_QWORD)));
> -      if (sel.ctx.getSimdWidth() == 16)
> -        value.push_back(sel.selReg(sel.reg(FAMILY_QWORD)));
> +      value.push_back(sel.selReg(sel.reg(FAMILY_DWORD)));
> +      value.push_back(sel.selReg(sel.reg(FAMILY_DWORD)));
> +      if (sel.ctx.getSimdWidth() == 16) {
> +        value.push_back(sel.selReg(sel.reg(FAMILY_DWORD)));
> +        value.push_back(sel.selReg(sel.reg(FAMILY_DWORD)));
> +      }

I have another major comment as below, take the READ_FLOAT64
as an example:

  void Selection::Opaque::READ_FLOAT64(Reg addr,
                                       const GenRegister *dst,
                                       uint32_t elemNum,
                                       uint32_t bti)
  {
    SelectionInstruction *insn = this->appendInsn(SEL_OP_READ_FLOAT64, elemNum, 1);
    SelectionVector *srcVector = this->appendVector();
    SelectionVector *dstVector = this->appendVector();

    // Regular instruction to encode
    for (uint32_t elemID = 0; elemID < elemNum; ++elemID)
      insn->dst(elemID) = dst[elemID];
    insn->src(0) = addr;
    insn->extra.function = bti;
    insn->extra.elem = elemNum;

    // Sends require contiguous allocation
    dstVector->regNum = elemNum;
    dstVector->isSrc = 0;
    dstVector->reg = &insn->dst(0);

    // Source cannot be scalar (yet)
    srcVector->regNum = 1;
    srcVector->isSrc = 1;
    srcVector->reg = &insn->src(0);
  }

The dst register array here contains one value register, and the other is one or two
temporary registers (QWORD). Here, IMO, they don't need to be in contiguous together.
Another side effect is that if you put all of them into one vector then the temporary
registeres will get the same lifetime intervals as the value register thus we will waste
some registers during the value's lifetime.

And we also never need to create a vector for only one register. So the above srcVector
is also could be removed.


More information about the Beignet mailing list