[Beignet] [PATCH 1/3] GBE: Change 64bit integer storage in register
Zhigang Gong
zhigang.gong at linux.intel.com
Mon May 26 20:50:59 PDT 2014
Could you rebase this patchset onto the latest git master version?
Thanks.
On Thu, May 22, 2014 at 03:06:52PM +0800, Ruiling Song wrote:
> Previously, we stored the low/high halves of each 64bit value together, which
> needed several 32bit instructions to implement one 64bit operation. Now we
> simply change its storage in registers: the low 32bit parts of all lanes are
> stored together, followed by the high 32bit parts of all lanes. This makes
> long support cleaner and requires fewer 32bit instructions.
>
> Signed-off-by: Ruiling Song <ruiling.song at intel.com>
> ---
> backend/src/backend/gen_context.cpp | 226 +++++-----------------------
> backend/src/backend/gen_encoder.cpp | 96 +-----------
> backend/src/backend/gen_encoder.hpp | 6 +-
> backend/src/backend/gen_insn_selection.cpp | 83 +++++-----
> backend/src/backend/gen_reg_allocation.cpp | 33 ++--
> backend/src/backend/gen_register.hpp | 25 +--
> backend/src/llvm/llvm_gen_backend.cpp | 5 +
> 7 files changed, 124 insertions(+), 350 deletions(-)
>
> diff --git a/backend/src/backend/gen_context.cpp b/backend/src/backend/gen_context.cpp
> index f4c80e3..25f690a 100644
> --- a/backend/src/backend/gen_context.cpp
> +++ b/backend/src/backend/gen_context.cpp
> @@ -213,17 +213,7 @@ namespace gbe
> case SEL_OP_LOAD_INT64_IMM: p->LOAD_INT64_IMM(dst, src.value.i64); break;
> case SEL_OP_CONVI64_TO_I:
> {
> - int execWidth = p->curr.execWidth;
> - GenRegister xsrc = src.bottom_half(), xdst = dst;
> - p->push();
> - p->curr.execWidth = 8;
> - for(int i = 0; i < execWidth/4; i ++) {
> - p->curr.chooseNib(i);
> - p->MOV(xdst, xsrc);
> - xdst = GenRegister::suboffset(xdst, 4);
> - xsrc = GenRegister::suboffset(xsrc, 4);
> - }
> - p->pop();
> + p->MOV(dst, src.bottom_half());
> break;
> }
> case SEL_OP_BRC:
> @@ -268,28 +258,18 @@ namespace gbe
> p->MOV_DF(dst, src, tmp);
> break;
> case SEL_OP_CONVI_TO_I64: {
> - GenRegister middle;
> - if (src.type == GEN_TYPE_B || src.type == GEN_TYPE_D) {
> + GenRegister middle = src;
> + if(src.type == GEN_TYPE_B || src.type == GEN_TYPE_W) {
> middle = tmp;
> - middle.type = src.is_signed_int() ? GEN_TYPE_D : GEN_TYPE_UD;
> + middle.type = GEN_TYPE_D;
> p->MOV(middle, src);
> - } else {
> - middle = src;
> }
> - int execWidth = p->curr.execWidth;
> - p->push();
> - p->curr.execWidth = 8;
> - for (int nib = 0; nib < execWidth / 4; nib ++) {
> - p->curr.chooseNib(nib);
> - p->MOV(dst.bottom_half(), middle);
> - if(middle.is_signed_int())
> - p->ASR(dst.top_half(), middle, GenRegister::immud(31));
> - else
> - p->MOV(dst.top_half(), GenRegister::immd(0));
> - dst = GenRegister::suboffset(dst, 4);
> - middle = GenRegister::suboffset(middle, 4);
> - }
> - p->pop();
> +
> + p->MOV(dst.bottom_half(), middle);
> + if(src.is_signed_int())
> + p->ASR(dst.top_half(this->simdWidth), middle, GenRegister::immud(31));
> + else
> + p->MOV(dst.top_half(this->simdWidth), GenRegister::immud(0));
> break;
> }
> default:
> @@ -304,8 +284,10 @@ namespace gbe
> GenRegister tmp = ra->genReg(insn.dst(1));
> switch (insn.opcode) {
> case SEL_OP_I64ADD: {
> - GenRegister x = GenRegister::retype(tmp, GEN_TYPE_UD),
> - y = GenRegister::suboffset(x, p->curr.execWidth);
> + tmp = GenRegister::retype(tmp, GEN_TYPE_UL);
> + GenRegister x = tmp.bottom_half();
> + GenRegister y = tmp.top_half(this->simdWidth);
> +
> loadBottomHalf(x, src0);
> loadBottomHalf(y, src1);
> addWithCarry(x, x, y);
> @@ -318,8 +300,10 @@ namespace gbe
> break;
> }
> case SEL_OP_I64SUB: {
> - GenRegister x = GenRegister::retype(tmp, GEN_TYPE_UD),
> - y = GenRegister::suboffset(x, p->curr.execWidth);
> + tmp = GenRegister::retype(tmp, GEN_TYPE_UL);
> + GenRegister x = tmp.bottom_half();
> + GenRegister y = tmp.top_half(this->simdWidth);
> +
> loadBottomHalf(x, src0);
> loadBottomHalf(y, src1);
> subWithBorrow(x, x, y);
> @@ -400,21 +384,8 @@ namespace gbe
> case SEL_OP_SEL: p->SEL(dst, src0, src1); break;
> case SEL_OP_SEL_INT64:
> {
> - GenRegister xdst = GenRegister::retype(dst, GEN_TYPE_UL),
> - xsrc0 = GenRegister::retype(src0, GEN_TYPE_UL),
> - xsrc1 = GenRegister::retype(src1, GEN_TYPE_UL);
> - int execWidth = p->curr.execWidth;
> - p->push();
> - p->curr.execWidth = 8;
> - for (int nib = 0; nib < execWidth / 4; nib ++) {
> - p->curr.chooseNib(nib);
> - p->SEL(xdst.bottom_half(), xsrc0.bottom_half(), xsrc1.bottom_half());
> - p->SEL(xdst.top_half(), xsrc0.top_half(), xsrc1.top_half());
> - xdst = GenRegister::suboffset(xdst, 4);
> - xsrc0 = GenRegister::suboffset(xsrc0, 4);
> - xsrc1 = GenRegister::suboffset(xsrc1, 4);
> - }
> - p->pop();
> + p->SEL(dst.bottom_half(), src0.bottom_half(), src1.bottom_half());
> + p->SEL(dst.top_half(this->simdWidth), src0.top_half(this->simdWidth), src1.top_half(this->simdWidth));
> }
> break;
> case SEL_OP_AND: p->AND(dst, src0, src1, insn.extra.function); break;
> @@ -422,59 +393,20 @@ namespace gbe
> case SEL_OP_XOR: p->XOR(dst, src0, src1, insn.extra.function); break;
> case SEL_OP_I64AND:
> {
> - GenRegister xdst = GenRegister::retype(dst, GEN_TYPE_UL),
> - xsrc0 = GenRegister::retype(src0, GEN_TYPE_UL),
> - xsrc1 = GenRegister::retype(src1, GEN_TYPE_UL);
> - int execWidth = p->curr.execWidth;
> - p->push();
> - p->curr.execWidth = 8;
> - for (int nib = 0; nib < execWidth / 4; nib ++) {
> - p->curr.chooseNib(nib);
> - p->AND(xdst.bottom_half(), xsrc0.bottom_half(), xsrc1.bottom_half());
> - p->AND(xdst.top_half(), xsrc0.top_half(), xsrc1.top_half());
> - xdst = GenRegister::suboffset(xdst, 4),
> - xsrc0 = GenRegister::suboffset(xsrc0, 4),
> - xsrc1 = GenRegister::suboffset(xsrc1, 4);
> - }
> - p->pop();
> + p->AND(dst.bottom_half(), src0.bottom_half(), src1.bottom_half());
> + p->AND(dst.top_half(this->simdWidth), src0.top_half(this->simdWidth), src1.top_half(this->simdWidth));
> }
> break;
> case SEL_OP_I64OR:
> {
> - GenRegister xdst = GenRegister::retype(dst, GEN_TYPE_UL),
> - xsrc0 = GenRegister::retype(src0, GEN_TYPE_UL),
> - xsrc1 = GenRegister::retype(src1, GEN_TYPE_UL);
> - int execWidth = p->curr.execWidth;
> - p->push();
> - p->curr.execWidth = 8;
> - for (int nib = 0; nib < execWidth / 4; nib ++) {
> - p->curr.chooseNib(nib);
> - p->OR(xdst.bottom_half(), xsrc0.bottom_half(), xsrc1.bottom_half());
> - p->OR(xdst.top_half(), xsrc0.top_half(), xsrc1.top_half());
> - xdst = GenRegister::suboffset(xdst, 4),
> - xsrc0 = GenRegister::suboffset(xsrc0, 4),
> - xsrc1 = GenRegister::suboffset(xsrc1, 4);
> - }
> - p->pop();
> + p->OR(dst.bottom_half(), src0.bottom_half(), src1.bottom_half());
> + p->OR(dst.top_half(this->simdWidth), src0.top_half(this->simdWidth), src1.top_half(this->simdWidth));
> }
> break;
> case SEL_OP_I64XOR:
> {
> - GenRegister xdst = GenRegister::retype(dst, GEN_TYPE_UL),
> - xsrc0 = GenRegister::retype(src0, GEN_TYPE_UL),
> - xsrc1 = GenRegister::retype(src1, GEN_TYPE_UL);
> - int execWidth = p->curr.execWidth;
> - p->push();
> - p->curr.execWidth = 8;
> - for (int nib = 0; nib < execWidth / 4; nib ++) {
> - p->curr.chooseNib(nib);
> - p->XOR(xdst.bottom_half(), xsrc0.bottom_half(), xsrc1.bottom_half());
> - p->XOR(xdst.top_half(), xsrc0.top_half(), xsrc1.top_half());
> - xdst = GenRegister::suboffset(xdst, 4),
> - xsrc0 = GenRegister::suboffset(xsrc0, 4),
> - xsrc1 = GenRegister::suboffset(xsrc1, 4);
> - }
> - p->pop();
> + p->XOR(dst.bottom_half(), src0.bottom_half(), src1.bottom_half());
> + p->XOR(dst.top_half(this->simdWidth), src0.top_half(this->simdWidth), src1.top_half(this->simdWidth));
> }
> break;
> case SEL_OP_SHR: p->SHR(dst, src0, src1); break;
> @@ -492,18 +424,8 @@ namespace gbe
> GenRegister xdst = GenRegister::retype(dst, GEN_TYPE_UL),
> xsrc0 = GenRegister::retype(src0, GEN_TYPE_UL),
> xsrc1 = GenRegister::retype(src1, GEN_TYPE_UL);
> - int execWidth = p->curr.execWidth;
> - p->push();
> - p->curr.execWidth = 8;
> - for (int nib = 0; nib < execWidth / 4; nib ++) {
> - p->curr.chooseNib(nib);
> - p->MOV(xdst.top_half(), xsrc0.bottom_half());
> - p->MOV(xdst.bottom_half(), xsrc1.bottom_half());
> - xdst = GenRegister::suboffset(xdst, 4);
> - xsrc0 = GenRegister::suboffset(xsrc0, 4);
> - xsrc1 = GenRegister::suboffset(xsrc1, 4);
> - }
> - p->pop();
> + p->MOV(xdst.top_half(this->simdWidth), xsrc0.bottom_half());
> + p->MOV(xdst.bottom_half(), xsrc1.bottom_half());
> }
> break;
> default: NOT_IMPLEMENTED;
> @@ -511,16 +433,10 @@ namespace gbe
> }
>
> void GenContext::collectShifter(GenRegister dest, GenRegister src) {
> - int execWidth = p->curr.execWidth;
> p->push();
> - p->curr.predicate = GEN_PREDICATE_NONE;
> - p->curr.noMask = 1;
> - p->curr.execWidth = 8;
> - for (int nib = 0; nib < execWidth / 4; nib ++) {
> - p->AND(dest, src.bottom_half(), GenRegister::immud(63));
> - dest = GenRegister::suboffset(dest, 4);
> - src = GenRegister::suboffset(src, 4);
> - }
> + p->curr.predicate = GEN_PREDICATE_NONE;
> + p->curr.noMask = 1;
> + p->AND(dest, src.bottom_half(), GenRegister::immud(63));
> p->pop();
> }
>
> @@ -1267,73 +1183,19 @@ namespace gbe
> }
>
> void GenContext::loadTopHalf(GenRegister dest, GenRegister src) {
> - int execWidth = p->curr.execWidth;
> - src = src.top_half();
> - p->push();
> - p->curr.predicate = GEN_PREDICATE_NONE;
> - p->curr.noMask = 1;
> - p->curr.execWidth = 8;
> - p->MOV(dest, src);
> - p->MOV(GenRegister::suboffset(dest, 4), GenRegister::suboffset(src, 4));
> - if (execWidth == 16) {
> - p->MOV(GenRegister::suboffset(dest, 8), GenRegister::suboffset(src, 8));
> - p->MOV(GenRegister::suboffset(dest, 12), GenRegister::suboffset(src, 12));
> - }
> - p->pop();
> + p->MOV(dest, src.top_half(this->simdWidth));
> }
>
> void GenContext::storeTopHalf(GenRegister dest, GenRegister src) {
> - int execWidth = p->curr.execWidth;
> - dest = dest.top_half();
> - p->push();
> - p->curr.noMask = 0;
> - p->curr.execWidth = 8;
> - p->MOV(dest, src);
> - p->curr.nibControl = 1;
> - p->MOV(GenRegister::suboffset(dest, 4), GenRegister::suboffset(src, 4));
> - if (execWidth == 16) {
> - p->curr.quarterControl = 1;
> - p->curr.nibControl = 0;
> - p->MOV(GenRegister::suboffset(dest, 8), GenRegister::suboffset(src, 8));
> - p->curr.nibControl = 1;
> - p->MOV(GenRegister::suboffset(dest, 12), GenRegister::suboffset(src, 12));
> - }
> - p->pop();
> + p->MOV(dest.top_half(this->simdWidth), src);
> }
>
> void GenContext::loadBottomHalf(GenRegister dest, GenRegister src) {
> - int execWidth = p->curr.execWidth;
> - src = src.bottom_half();
> - p->push();
> - p->curr.predicate = GEN_PREDICATE_NONE;
> - p->curr.noMask = 1;
> - p->curr.execWidth = 8;
> - p->MOV(dest, src);
> - p->MOV(GenRegister::suboffset(dest, 4), GenRegister::suboffset(src, 4));
> - if (execWidth == 16) {
> - p->MOV(GenRegister::suboffset(dest, 8), GenRegister::suboffset(src, 8));
> - p->MOV(GenRegister::suboffset(dest, 12), GenRegister::suboffset(src, 12));
> - }
> - p->pop();
> + p->MOV(dest, src.bottom_half());
> }
>
> void GenContext::storeBottomHalf(GenRegister dest, GenRegister src) {
> - int execWidth = p->curr.execWidth;
> - dest = dest.bottom_half();
> - p->push();
> - p->curr.execWidth = 8;
> - p->curr.noMask = 0;
> - p->MOV(dest, src);
> - p->curr.nibControl = 1;
> - p->MOV(GenRegister::suboffset(dest, 4), GenRegister::suboffset(src, 4));
> - if (execWidth == 16) {
> - p->curr.quarterControl = 1;
> - p->curr.nibControl = 0;
> - p->MOV(GenRegister::suboffset(dest, 8), GenRegister::suboffset(src, 8));
> - p->curr.nibControl = 1;
> - p->MOV(GenRegister::suboffset(dest, 12), GenRegister::suboffset(src, 12));
> - }
> - p->pop();
> + p->MOV(dest.bottom_half(), src);
> }
>
> void GenContext::addWithCarry(GenRegister dest, GenRegister src0, GenRegister src1) {
> @@ -1770,18 +1632,12 @@ namespace gbe
> p->pop();
> }
>
> - // For SIMD8, we allocate 2*elemNum temporary registers from dst(0), and
> - // then follow the real destination registers.
> - // For SIMD16, we allocate elemNum temporary registers from dst(0).
> void GenContext::emitRead64Instruction(const SelectionInstruction &insn) {
> const uint32_t elemNum = insn.extra.elem;
> - const uint32_t tmpRegSize = (p->curr.execWidth == 8) ? elemNum * 2 : elemNum;
> - const GenRegister tempAddr = ra->genReg(insn.dst(tmpRegSize + 1));
> - const GenRegister dst = ra->genReg(insn.dst(tmpRegSize));
> - const GenRegister tmp = ra->genReg(insn.dst(0));
> + const GenRegister dst = ra->genReg(insn.dst(0));
> const GenRegister src = ra->genReg(insn.src(0));
> const uint32_t bti = insn.extra.function;
> - p->READ64(dst, tmp, tempAddr, src, bti, elemNum);
> + p->UNTYPED_READ(dst, src, bti, elemNum*2);
> }
>
> void GenContext::emitUntypedReadInstruction(const SelectionInstruction &insn) {
> @@ -1792,17 +1648,11 @@ namespace gbe
> p->UNTYPED_READ(dst, src, bti, elemNum);
> }
>
> - // For SIMD8, we allocate 2*elemNum temporary registers from dst(0), and
> - // then follow the real destination registers.
> - // For SIMD16, we allocate elemNum temporary registers from dst(0).
> void GenContext::emitWrite64Instruction(const SelectionInstruction &insn) {
> const GenRegister src = ra->genReg(insn.dst(0));
> const uint32_t elemNum = insn.extra.elem;
> - const GenRegister addr = ra->genReg(insn.src(0)); //tmpRegSize + 1));
> - const GenRegister data = ra->genReg(insn.src(1));
> const uint32_t bti = insn.extra.function;
> - p->MOV(src, addr);
> - p->WRITE64(src, data, bti, elemNum, sel->isScalarReg(data.reg()));
> + p->UNTYPED_WRITE(src, bti, elemNum*2);
> }
>
> void GenContext::emitUntypedWriteInstruction(const SelectionInstruction &insn) {
> diff --git a/backend/src/backend/gen_encoder.cpp b/backend/src/backend/gen_encoder.cpp
> index 7078dcb..7ecb4c4 100644
> --- a/backend/src/backend/gen_encoder.cpp
> +++ b/backend/src/backend/gen_encoder.cpp
> @@ -216,6 +216,7 @@ namespace gbe
> GenEncoder::GenEncoder(uint32_t simdWidth, uint32_t gen, uint32_t deviceID, int jump_width) :
> stateNum(0), gen(gen), deviceID(deviceID), jump_width(jump_width)
> {
> + this->simdWidth = simdWidth;
> this->curr.execWidth = simdWidth;
> this->curr.quarterControl = GEN_COMPRESSION_Q1;
> this->curr.noMask = 0;
> @@ -370,76 +371,6 @@ namespace gbe
> 0
> };
>
> - void GenEncoder::READ64(GenRegister dst, GenRegister tmp, GenRegister addr, GenRegister src, uint32_t bti, uint32_t elemNum) {
> - GenRegister dst32 = GenRegister::retype(dst, GEN_TYPE_UD);
> - src = GenRegister::retype(src, GEN_TYPE_UD);
> - addr = GenRegister::retype(addr, GEN_TYPE_UD);
> - tmp = GenRegister::retype(tmp, GEN_TYPE_UD);
> - uint32_t originSimdWidth = curr.execWidth;
> - uint32_t originPredicate = curr.predicate;
> - uint32_t originMask = curr.noMask;
> - push();
> - for ( uint32_t channels = 0, currQuarter = GEN_COMPRESSION_Q1;
> - channels < originSimdWidth; channels += 8, currQuarter++) {
> - curr.predicate = GEN_PREDICATE_NONE;
> - curr.noMask = GEN_MASK_DISABLE;
> - curr.execWidth = 8;
> - /* XXX The following instruction is illegal, but it works as SIMD 1*4 mode
> - which is what we want here. */
> - MOV(GenRegister::h2(addr), GenRegister::suboffset(src, channels));
> - ADD(GenRegister::h2(GenRegister::suboffset(addr, 1)), GenRegister::suboffset(src, channels), GenRegister::immd(4));
> - MOV(GenRegister::h2(GenRegister::suboffset(addr, 8)), GenRegister::suboffset(src, channels + 4));
> - ADD(GenRegister::h2(GenRegister::suboffset(addr, 9)), GenRegister::suboffset(src, channels + 4), GenRegister::immd(4));
> - // Let's use SIMD16 to read all bytes for 8 doubles data at one time.
> - curr.execWidth = 16;
> - this->UNTYPED_READ(tmp, addr, bti, elemNum);
> - if (originSimdWidth == 16)
> - curr.quarterControl = currQuarter;
> - curr.predicate = originPredicate;
> - curr.noMask = originMask;
> - // Back to simd8 for correct predication flag.
> - curr.execWidth = 8;
> - MOV(GenRegister::retype(GenRegister::suboffset(dst32, channels * 2), GEN_TYPE_DF), GenRegister::retype(tmp, GEN_TYPE_DF));
> - }
> - pop();
> - }
> -
> - void GenEncoder::WRITE64(GenRegister msg, GenRegister data, uint32_t bti, uint32_t elemNum, bool is_scalar) {
> - GenRegister data32 = GenRegister::retype(data, GEN_TYPE_UD);
> - GenRegister unpacked;
> - msg = GenRegister::retype(msg, GEN_TYPE_UD);
> - int originSimdWidth = curr.execWidth;
> - int originPredicate = curr.predicate;
> - int originMask = curr.noMask;
> - push();
> - for (uint32_t half = 0; half < 2; half++) {
> - curr.predicate = GEN_PREDICATE_NONE;
> - curr.noMask = GEN_MASK_DISABLE;
> - curr.execWidth = 8;
> - if (is_scalar) {
> - unpacked = data32;
> - unpacked.subnr += half * 4;
> - } else
> - unpacked = GenRegister::unpacked_ud(data32.nr, data32.subnr + half);
> - MOV(GenRegister::suboffset(msg, originSimdWidth), unpacked);
> - if (originSimdWidth == 16) {
> - if (is_scalar) {
> - unpacked = data32;
> - unpacked.subnr += half * 4;
> - } else
> - unpacked = GenRegister::unpacked_ud(data32.nr + 2, data32.subnr + half);
> - MOV(GenRegister::suboffset(msg, originSimdWidth + 8), unpacked);
> - curr.execWidth = 16;
> - }
> - if (half == 1)
> - ADD(GenRegister::retype(msg, GEN_TYPE_UD), GenRegister::retype(msg, GEN_TYPE_UD), GenRegister::immd(4));
> - curr.predicate = originPredicate;
> - curr.noMask = originMask;
> - this->UNTYPED_WRITE(msg, bti, elemNum);
> - }
> - pop();
> - }
> -
> void GenEncoder::UNTYPED_READ(GenRegister dst, GenRegister src, uint32_t bti, uint32_t elemNum) {
> GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
> assert(elemNum >= 1 || elemNum <= 4);
> @@ -672,17 +603,8 @@ namespace gbe
> if (dst.isdf() && src.isdf()) {
> handleDouble(p, opcode, dst, src);
> } else if (dst.isint64() && src.isint64()) { // handle int64
> - int execWidth = p->curr.execWidth;
> - p->push();
> - p->curr.execWidth = 8;
> - for (int nib = 0; nib < execWidth / 4; nib ++) {
> - p->curr.chooseNib(nib);
> - p->MOV(dst.bottom_half(), src.bottom_half());
> - p->MOV(dst.top_half(), src.top_half());
> - dst = GenRegister::suboffset(dst, 4);
> - src = GenRegister::suboffset(src, 4);
> - }
> - p->pop();
> + p->MOV(dst.bottom_half(), src.bottom_half());
> + p->MOV(dst.top_half(p->simdWidth), src.top_half(p->simdWidth));
> } else if (needToSplitAlu1(p, dst, src) == false) {
> if(compactAlu1(p, opcode, dst, src, condition, false))
> return;
> @@ -915,16 +837,8 @@ namespace gbe
>
> void GenEncoder::LOAD_INT64_IMM(GenRegister dest, int64_t value) {
> GenRegister u0 = GenRegister::immd((int)value), u1 = GenRegister::immd(value >> 32);
> - int execWidth = curr.execWidth;
> - push();
> - curr.execWidth = 8;
> - for(int nib = 0; nib < execWidth/4; nib ++) {
> - curr.chooseNib(nib);
> - MOV(dest.top_half(), u1);
> - MOV(dest.bottom_half(), u0);
> - dest = GenRegister::suboffset(dest, 4);
> - }
> - pop();
> + MOV(dest.bottom_half(), u0);
> + MOV(dest.top_half(this->simdWidth), u1);
> }
>
> void GenEncoder::MOV_DF(GenRegister dest, GenRegister src0, GenRegister r) {
> diff --git a/backend/src/backend/gen_encoder.hpp b/backend/src/backend/gen_encoder.hpp
> index e0bb4cc..627a311 100644
> --- a/backend/src/backend/gen_encoder.hpp
> +++ b/backend/src/backend/gen_encoder.hpp
> @@ -90,6 +90,8 @@ namespace gbe
> uint32_t deviceID;
> /*! The constant for jump. */
> const int jump_width;
> + /*! simd width for this codegen */
> + uint32_t simdWidth;
> ////////////////////////////////////////////////////////////////////////
> // Encoding functions
> ////////////////////////////////////////////////////////////////////////
> @@ -168,10 +170,6 @@ namespace gbe
> void WAIT(void);
> /*! Atomic instructions */
> virtual void ATOMIC(GenRegister dst, uint32_t function, GenRegister src, uint32_t bti, uint32_t srcNum);
> - /*! Read 64-bits float/int arrays */
> - void READ64(GenRegister dst, GenRegister tmp, GenRegister addr, GenRegister src, uint32_t bti, uint32_t elemNum);
> - /*! Write 64-bits float/int arrays */
> - void WRITE64(GenRegister src, GenRegister data, uint32_t bti, uint32_t elemNum, bool is_scalar);
> /*! Untyped read (upto 4 channels) */
> virtual void UNTYPED_READ(GenRegister dst, GenRegister src, uint32_t bti, uint32_t elemNum);
> /*! Untyped write (upto 4 channels) */
> diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp
> index 2ab3aae..3f7154f 100644
> --- a/backend/src/backend/gen_insn_selection.cpp
> +++ b/backend/src/backend/gen_insn_selection.cpp
> @@ -536,9 +536,9 @@ namespace gbe
> /*! Atomic instruction */
> void ATOMIC(Reg dst, uint32_t function, uint32_t srcNum, Reg src0, Reg src1, Reg src2, uint32_t bti);
> /*! Read 64 bits float/int array */
> - void READ64(Reg addr, Reg tempAddr, const GenRegister *dst, uint32_t elemNum, uint32_t valueNum, uint32_t bti);
> + void READ64(Reg addr, const GenRegister *dst, uint32_t elemNum, uint32_t bti);
> /*! Write 64 bits float/int array */
> - void WRITE64(Reg addr, const GenRegister *src, uint32_t srcNum, const GenRegister *dst, uint32_t dstNum, uint32_t bti);
> + void WRITE64(Reg addr, const GenRegister *src, uint32_t srcNum, uint32_t bti);
> /*! Untyped read (up to 4 elements) */
> void UNTYPED_READ(Reg addr, const GenRegister *dst, uint32_t elemNum, uint32_t bti);
> /*! Untyped write (up to 4 elements) */
> @@ -1033,34 +1033,27 @@ namespace gbe
> void Selection::Opaque::NOP(void) { this->appendInsn(SEL_OP_NOP, 0, 0); }
> void Selection::Opaque::WAIT(void) { this->appendInsn(SEL_OP_WAIT, 0, 0); }
>
> - /* elemNum contains all the temporary register and the
> - real destination registers.*/
> void Selection::Opaque::READ64(Reg addr,
> - Reg tempAddr,
> const GenRegister *dst,
> uint32_t elemNum,
> - uint32_t valueNum,
> uint32_t bti)
> {
> - SelectionInstruction *insn = this->appendInsn(SEL_OP_READ64, elemNum + 1, 1);
> + SelectionInstruction *insn = this->appendInsn(SEL_OP_READ64, elemNum, 1);
> SelectionVector *srcVector = this->appendVector();
> SelectionVector *dstVector = this->appendVector();
>
> // Regular instruction to encode
> for (uint32_t elemID = 0; elemID < elemNum; ++elemID)
> insn->dst(elemID) = dst[elemID];
> - /* temporary addr register is to be modified, set it to dst registers.*/
> - insn->dst(elemNum) = tempAddr;
> +
> insn->src(0) = addr;
> insn->extra.function = bti;
> - insn->extra.elem = valueNum;
> + insn->extra.elem = elemNum;
>
> - // Only the temporary registers need contiguous allocation
> - dstVector->regNum = elemNum - valueNum;
> + dstVector->regNum = elemNum;
> dstVector->isSrc = 0;
> dstVector->reg = &insn->dst(0);
>
> - // Source cannot be scalar (yet)
> srcVector->regNum = 1;
> srcVector->isSrc = 1;
> srcVector->reg = &insn->src(0);
> @@ -1087,36 +1080,30 @@ namespace gbe
> dstVector->regNum = elemNum;
> dstVector->isSrc = 0;
> dstVector->reg = &insn->dst(0);
> - // Source cannot be scalar (yet)
> +
> srcVector->regNum = 1;
> srcVector->isSrc = 1;
> srcVector->reg = &insn->src(0);
> }
>
> - /* elemNum contains all the temporary register and the
> - real data registers.*/
> void Selection::Opaque::WRITE64(Reg addr,
> const GenRegister *src,
> uint32_t srcNum,
> - const GenRegister *dst,
> - uint32_t dstNum,
> uint32_t bti)
> {
> - SelectionInstruction *insn = this->appendInsn(SEL_OP_WRITE64, dstNum, srcNum + 1);
> + SelectionInstruction *insn = this->appendInsn(SEL_OP_WRITE64, 0, srcNum + 1);
> SelectionVector *vector = this->appendVector();
>
> // Regular instruction to encode
> insn->src(0) = addr;
> for (uint32_t elemID = 0; elemID < srcNum; ++elemID)
> insn->src(elemID + 1) = src[elemID];
> - for (uint32_t elemID = 0; elemID < dstNum; ++elemID)
> - insn->dst(elemID) = dst[elemID];
> +
> insn->extra.function = bti;
> insn->extra.elem = srcNum;
>
> - // Only the addr + temporary registers need to be contiguous.
> - vector->regNum = dstNum;
> - vector->reg = &insn->dst(0);
> + vector->regNum = srcNum + 1;
> + vector->reg = &insn->src(0);
> vector->isSrc = 1;
> }
>
> @@ -2643,18 +2630,13 @@ namespace gbe
> {
> using namespace ir;
> const uint32_t valueNum = insn.getValueNum();
> - uint32_t dstID;
> /* XXX support scalar only right now. */
> GBE_ASSERT(valueNum == 1);
>
> - // The first 16 DWORD register space is for temporary usage at encode stage.
> - uint32_t tmpRegNum = (sel.ctx.getSimdWidth() == 8) ? valueNum * 2 : valueNum;
> - GenRegister dst[valueNum + tmpRegNum];
> - for (dstID = 0; dstID < tmpRegNum ; ++dstID)
> - dst[dstID] = sel.selReg(sel.reg(FAMILY_DWORD));
> - for ( uint32_t valueID = 0; valueID < valueNum; ++dstID, ++valueID)
> - dst[dstID] = sel.selReg(insn.getValue(valueID), ir::TYPE_U64);
> - sel.READ64(addr, sel.selReg(sel.reg(FAMILY_QWORD), ir::TYPE_U64), dst, valueNum + tmpRegNum, valueNum, bti);
> + GenRegister dst[valueNum];
> + for ( uint32_t dstID = 0; dstID < valueNum; ++dstID)
> + dst[dstID] = sel.selReg(insn.getValue(dstID), ir::TYPE_U64);
> + sel.READ64(addr, dst, valueNum, bti);
> }
>
> void emitByteGather(Selection::Opaque &sel,
> @@ -2803,22 +2785,14 @@ namespace gbe
> {
> using namespace ir;
> const uint32_t valueNum = insn.getValueNum();
> - uint32_t srcID;
> /* XXX support scalar only right now. */
> GBE_ASSERT(valueNum == 1);
> - addr = GenRegister::retype(addr, GEN_TYPE_F);
> - // The first 16 DWORD register space is for temporary usage at encode stage.
> - uint32_t tmpRegNum = (sel.ctx.getSimdWidth() == 8) ? valueNum * 2 : valueNum;
> + addr = GenRegister::retype(addr, GEN_TYPE_UD);
> GenRegister src[valueNum];
> - GenRegister dst[tmpRegNum + 1];
> - /* dst 0 is for the temporary address register. */
> - dst[0] = sel.selReg(sel.reg(FAMILY_DWORD));
> - for (srcID = 0; srcID < tmpRegNum; ++srcID)
> - dst[srcID + 1] = sel.selReg(sel.reg(FAMILY_DWORD));
>
> for (uint32_t valueID = 0; valueID < valueNum; ++valueID)
> src[valueID] = sel.selReg(insn.getValue(valueID), ir::TYPE_U64);
> - sel.WRITE64(addr, src, valueNum, dst, tmpRegNum + 1, bti);
> + sel.WRITE64(addr, src, valueNum, bti);
> }
>
> void emitByteScatter(Selection::Opaque &sel,
> @@ -3009,6 +2983,11 @@ namespace gbe
> narrowNum = srcNum;
> narrowDst = 0;
> }
> + // As we store long/ulong low/high part separately,
> + // we need to deal with it separately, we need to change it back again
> + // when hardware support native long type.
> + const bool isInt64 = (srcType == TYPE_S64 || srcType == TYPE_U64 || dstType == TYPE_S64 || dstType == TYPE_U64);
> + const int simdWidth = sel.curr.execWidth;
>
> for(int i = 0; i < narrowNum; i++, index++) {
> GenRegister narrowReg, wideReg;
> @@ -3030,16 +3009,26 @@ namespace gbe
> GBE_ASSERT(multiple == 8);
> }
> }
> - if(index % multiple) {
> +
> + if(!isInt64 && index % multiple) {
> wideReg = GenRegister::offset(wideReg, 0, (index % multiple) * typeSize(wideReg.type));
> wideReg.subphysical = 1;
> }
> + if(isInt64) {
> + // offset to next half
> + wideReg.subphysical = 1;
> + if(i >= multiple/2)
> + wideReg = GenRegister::offset(wideReg, 0, sel.isScalarReg(wideReg.reg()) ? 4 : simdWidth*4);
> + if(index % (multiple/2))
> + wideReg = GenRegister::offset(wideReg, 0, (index % (multiple/2)) * typeSize(wideReg.type));
> + }
> +
> GenRegister xdst = narrowDst ? narrowReg : wideReg;
> GenRegister xsrc = narrowDst ? wideReg : narrowReg;
>
> - if((srcType == TYPE_S64 || srcType == TYPE_U64 || srcType == TYPE_DOUBLE) ||
> - (dstType == TYPE_S64 || dstType == TYPE_U64 || dstType == TYPE_DOUBLE)) {
> - const int simdWidth = sel.curr.execWidth;
> + if(isInt64) {
> + sel.MOV(xdst, xsrc);
> + } else if(srcType == TYPE_DOUBLE || dstType == TYPE_DOUBLE) {
> sel.push();
> sel.curr.execWidth = 8;
> xdst.subphysical = 1;
> diff --git a/backend/src/backend/gen_reg_allocation.cpp b/backend/src/backend/gen_reg_allocation.cpp
> index 880a267..bab65e5 100644
> --- a/backend/src/backend/gen_reg_allocation.cpp
> +++ b/backend/src/backend/gen_reg_allocation.cpp
> @@ -661,25 +661,38 @@ namespace gbe
> != spilledRegs.end())
> continue;
>
> - uint32_t alignment;
> - ir::RegisterFamily family;
> - getRegAttrib(reg, alignment, &family);
> - const uint32_t size = vector->regNum * alignment;
> - const uint32_t grfOffset = allocateReg(interval, size, alignment);
> + uint32_t alignment, maxAlignment = 0;
> + uint32_t size = 0;
> + for (uint32_t regID = 0; regID < vector->regNum; ++regID) {
> + getRegAttrib(reg, alignment, NULL);
> + if(alignment > maxAlignment)
> + maxAlignment = alignment;
> + size += alignment;
> + }
> +
> + const uint32_t grfOffset = allocateReg(interval, size, maxAlignment);
> if(grfOffset == 0) {
> - GBE_ASSERT(!(reservedReg && family != ir::FAMILY_DWORD));
> + ir::RegisterFamily family;
> for(int i = vector->regNum-1; i >= 0; i--) {
> + family = ctx.sel->getRegisterFamily(vector->reg[i].reg());
> + // we currently only support DWORD/QWORD spill
> + if(family != ir::FAMILY_DWORD && family != ir::FAMILY_QWORD)
> + return false;
> if (!spillReg(vector->reg[i].reg()))
> return false;
> }
> continue;
> }
> + uint32_t subOffset = 0;
> for (uint32_t regID = 0; regID < vector->regNum; ++regID) {
> const ir::Register reg = vector->reg[regID].reg();
> - GBE_ASSERT(RA.contains(reg) == false
> - && ctx.sel->getRegisterData(reg).family == family);
> - insertNewReg(reg, grfOffset + alignment * regID, true);
> - ctx.splitBlock(grfOffset, alignment * regID); //splitBlock will not split if regID == 0
> + GBE_ASSERT(RA.contains(reg) == false);
> + getRegAttrib(reg, alignment, NULL);
> + // check all sub registers aligned correctly
> + GBE_ASSERT((grfOffset + subOffset) % alignment == 0 || (grfOffset + subOffset) % GEN_REG_SIZE == 0);
> + insertNewReg(reg, grfOffset + subOffset, true);
> + ctx.splitBlock(grfOffset, subOffset); //splitBlock will not split if regID == 0
> + subOffset += alignment;
> }
> }
> // Case 2: This is a regular scalar register, allocate it alone
> diff --git a/backend/src/backend/gen_register.hpp b/backend/src/backend/gen_register.hpp
> index 50a6dcd..3967e6e 100644
> --- a/backend/src/backend/gen_register.hpp
> +++ b/backend/src/backend/gen_register.hpp
> @@ -301,20 +301,25 @@ namespace gbe
> return false;
> }
>
> - INLINE GenRegister top_half(void) const {
> - GenRegister r = bottom_half();
> - r.subnr += 4;
> - r.nr += r.subnr / 32;
> - r.subnr %= 32;
> - return r;
> + INLINE GenRegister top_half(int simdWidth) const {
> + GBE_ASSERT(isint64());
> + GenRegister reg = retype(*this, type == GEN_TYPE_UL ? GEN_TYPE_UD : GEN_TYPE_D);
> +
> + if (reg.hstride != GEN_HORIZONTAL_STRIDE_0) {
> + reg.subnr += simdWidth * typeSize(reg.type) * hstride_size(reg);
> + reg.nr += reg.subnr / 32;
> + reg.subnr %= 32;
> + } else {
> + reg.subnr += typeSize(reg.type);
> + reg.nr += reg.subnr/32;
> + reg.subnr %= 32;
> + }
> + return reg;
> }
>
> INLINE GenRegister bottom_half(void) const {
> GBE_ASSERT(isint64());
> - GenRegister r = h2(*this);
> - r.type = type == GEN_TYPE_UL ? GEN_TYPE_UD : GEN_TYPE_D;
> - if(r.vstride != GEN_VERTICAL_STRIDE_0)
> - r.vstride = GEN_VERTICAL_STRIDE_16;
> + GenRegister r = retype(*this, type == GEN_TYPE_UL ? GEN_TYPE_UD : GEN_TYPE_D);
> return r;
> }
>
> diff --git a/backend/src/llvm/llvm_gen_backend.cpp b/backend/src/llvm/llvm_gen_backend.cpp
> index 82429d0..bd111b0 100644
> --- a/backend/src/llvm/llvm_gen_backend.cpp
> +++ b/backend/src/llvm/llvm_gen_backend.cpp
> @@ -1868,6 +1868,11 @@ namespace gbe
> uint32_t srcElemNum = 0, dstElemNum = 0 ;
> ir::Type srcType = getVectorInfo(ctx, srcValue->getType(), srcValue, srcElemNum);
> ir::Type dstType = getVectorInfo(ctx, dstValue->getType(), dstValue, dstElemNum);
> + // As long and double are not compatible in register storage
> + // and we do not support double yet, simply put an assert here
> + GBE_ASSERT(!(srcType == ir::TYPE_S64 && dstType == ir::TYPE_DOUBLE));
> + GBE_ASSERT(!(dstType == ir::TYPE_S64 && srcType == ir::TYPE_DOUBLE));
> +
> if(srcElemNum > 1 || dstElemNum > 1) {
> // Build the tuple data in the vector
> vector<ir::Register> srcTupleData;
> --
> 1.7.10.4
>
> _______________________________________________
> Beignet mailing list
> Beignet at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/beignet
More information about the Beignet
mailing list