[Beignet] [PATCH 1/2] Support 64-bit float
Xing, Homer
homer.xing at intel.com
Thu Jun 6 01:07:56 PDT 2013
I will refine the commit log, and check the problems.
If you have better idea about how to implement 64-bit float function please tell me. Thanks.
-----Original Message-----
From: Zhigang Gong [mailto:zhigang.gong at linux.intel.com]
Sent: Thursday, June 06, 2013 3:59 PM
To: Xing, Homer; beignet at lists.freedesktop.org
Subject: RE: [Beignet] [PATCH 1/2] Support 64-bit float
The commit log only says it support 64-bit float immediate? Is it the only goal of this commit? I'm afraid not, could you refine the log to introduce this commit more accurate?
And I still haven't review the whole commit. I just tried to run the test case on my machine (32 bit with llvm 3.3), And I found the following
problems:
By default, it ran into one failure:
compiler_menger_sponge_no_shadow:
compiler_menger_sponge_no_shadow() [FAILED]
Error: image mismatch
at file /home/gong/git/fdo/beignet/utests/compiler_shader_toy.cpp,
function run_kernel, line 63
By set the OCL_SIMD_WIDTH=8, it got another failure:
compiler_double:
utest_run:
/home/gong/git/fdo/beignet/backend/src/backend/gen_encoder.cpp:250: void gbe::GenEncoder::setDst(GenInstruction*, gbe::GenRegister): Assertion `dest.nr < 128' failed.
Aborted (core dumped)
But it seems the first failure get passed with SIMD8 mode.
Could you look at the above problems?
> -----Original Message-----
> From:
> beignet-bounces+zhigang.gong=linux.intel.com at lists.freedesktop.org
>
[mailto:beignet-bounces+zhigang.gong=linux.intel.com at lists.freedesktop.org]
> On Behalf Of Homer Hsing
> Sent: Thursday, June 06, 2013 2:16 PM
> To: beignet at lists.freedesktop.org
> Cc: Homer Hsing
> Subject: [Beignet] [PATCH 1/2] Support 64-bit float
>
> support 64-bit float immediate
>
> example:
>
> kernel void f(global double *src, global double *dst) {
> int i = get_global_id(0);
> double d = 1.234567890123456789;
> dst[i] = d * (src[i] + d);
> }
>
> Signed-off-by: Homer Hsing <homer.xing at intel.com>
> ---
> backend/src/backend/gen_context.cpp | 16 ++++-
> backend/src/backend/gen_defs.hpp | 1 +
> backend/src/backend/gen_encoder.cpp | 107
> +++++++++++++++++++++++++++--
> backend/src/backend/gen_encoder.hpp | 6 +-
> backend/src/backend/gen_insn_selection.cpp | 46 +++++++++----
> backend/src/backend/gen_insn_selection.hpp | 4 ++
> backend/src/backend/gen_reg_allocation.cpp | 10 ++-
> backend/src/backend/gen_register.hpp | 80
> +++++++++++++++++++++
> backend/src/ir/profile.cpp | 2 +
> backend/src/ir/profile.hpp | 3 +-
> 10 files changed, 249 insertions(+), 26 deletions(-)
>
> diff --git a/backend/src/backend/gen_context.cpp
> b/backend/src/backend/gen_context.cpp
> index 055c8fc..e6080b2 100644
> --- a/backend/src/backend/gen_context.cpp
> +++ b/backend/src/backend/gen_context.cpp
> @@ -138,7 +138,15 @@ namespace gbe
> const GenRegister dst = ra->genReg(insn.dst(0));
> const GenRegister src = ra->genReg(insn.src(0));
> switch (insn.opcode) {
> - case SEL_OP_MOV: p->MOV(dst, src); break;
> + case SEL_OP_MOV:
> + if (dst.isdf() && !src.isdf()) {
> + bool doubleio = true;
> + const GenRegister r =
> ra->genReg(GenRegister::f16grf(ir::ocl::doubleio));
> + p->MOV(dst, src, doubleio, r);
> + } else {
> + p->MOV(dst, src);
> + }
> + break;
> case SEL_OP_NOT: p->NOT(dst, src); break;
> case SEL_OP_RNDD: p->RNDD(dst, src); break;
> case SEL_OP_RNDU: p->RNDU(dst, src); break; @@ -263,14 +271,16
> @@ namespace gbe
> const GenRegister src = ra->genReg(insn.src(0));
> const uint32_t bti = insn.extra.function;
> const uint32_t elemNum = insn.extra.elem;
> - p->UNTYPED_READ(dst, src, bti, elemNum);
> + const GenRegister r =
> ra->genReg(GenRegister::ud16grf(ir::ocl::doubleio));
> + p->UNTYPED_READ(dst, src, bti, elemNum, insn.doubleio, r);
> }
>
> void GenContext::emitUntypedWriteInstruction(const
> SelectionInstruction
> &insn) {
> const GenRegister src = ra->genReg(insn.src(0));
> const uint32_t bti = insn.extra.function;
> const uint32_t elemNum = insn.extra.elem;
> - p->UNTYPED_WRITE(src, bti, elemNum);
> + const GenRegister r =
> ra->genReg(GenRegister::ud16grf(ir::ocl::doubleio));
> + p->UNTYPED_WRITE(src, bti, elemNum, insn.doubleio, r);
> }
>
> void GenContext::emitByteGatherInstruction(const
> SelectionInstruction
> &insn) { diff --git a/backend/src/backend/gen_defs.hpp
> b/backend/src/backend/gen_defs.hpp
> index c7a1581..63f98f5 100644
> --- a/backend/src/backend/gen_defs.hpp
> +++ b/backend/src/backend/gen_defs.hpp
> @@ -215,6 +215,7 @@ enum GenMessageTarget { #define GEN_TYPE_VF
> 5 /* packed float vector, immediates only? */ #define GEN_TYPE_HF 6
> #define GEN_TYPE_V 6 /* packed int vector, immediates only, uword dest
> only */
> +#define GEN_TYPE_DF 6
> #define GEN_TYPE_F 7
>
> #define GEN_ARF_NULL 0x00
> diff --git a/backend/src/backend/gen_encoder.cpp
> b/backend/src/backend/gen_encoder.cpp
> index b65cc94..db08e90 100644
> --- a/backend/src/backend/gen_encoder.cpp
> +++ b/backend/src/backend/gen_encoder.cpp
> @@ -355,7 +355,29 @@ namespace gbe
> 0
> };
>
> - void GenEncoder::UNTYPED_READ(GenRegister dst, GenRegister src,
> uint32_t bti, uint32_t elemNum) {
> + void GenEncoder::UNTYPED_READ(GenRegister dst, GenRegister src,
> uint32_t bti, uint32_t elemNum, bool doubleio, GenRegister r) {
> + if (doubleio) {
> + GenRegister hdr = GenRegister::h2(r);
> + GenRegister imm4 = GenRegister::immud(4);
> + push();
> + curr.execWidth = 8;
> + MOV(hdr,
> GenRegister::ud8grf(src.nr, 0));
> + ADD(GenRegister::offset(hdr, 0, 4), hdr, imm4);
> + MOV(GenRegister::offset(hdr, 1), GenRegister::ud8grf(src.nr,
> 4));
> + ADD(GenRegister::offset(hdr, 1, 4), GenRegister::offset(hdr,
> + 1),
> imm4);
> + pop();
> + UNTYPED_READ(dst, hdr, bti, elemNum, false, r);
> +
> + push();
> + curr.execWidth = 8;
> + MOV(hdr,
> GenRegister::ud8grf(src.nr + 1, 0));
> + ADD(GenRegister::offset(hdr, 0, 4), hdr, imm4);
> + MOV(GenRegister::offset(hdr, 1), GenRegister::ud8grf(src.nr +
> 1, 4));
> + ADD(GenRegister::offset(hdr, 1, 4), GenRegister::offset(hdr,
> + 1),
> imm4);
> + pop();
> + UNTYPED_READ(GenRegister::offset(dst, 2), hdr, bti, elemNum,
> + false,
> r);
> + return;
> + }
> GenInstruction *insn = this->next(GEN_OPCODE_SEND);
> assert(elemNum >= 1 || elemNum <= 4);
> uint32_t msg_length = 0;
> @@ -382,7 +404,32 @@ namespace gbe
> response_length);
> }
>
> - void GenEncoder::UNTYPED_WRITE(GenRegister msg, uint32_t bti,
> uint32_t elemNum) {
> + void GenEncoder::UNTYPED_WRITE(GenRegister msg, uint32_t bti,
> uint32_t elemNum, bool doubleio, GenRegister r) {
> + if (doubleio) {
> + GenRegister hdr = GenRegister::h2(r);
> + GenRegister data = GenRegister::offset(r, 2);
> + GenRegister imm4 = GenRegister::immud(4);
> + push();
> + curr.execWidth = 8;
> + MOV(hdr,
> GenRegister::ud8grf(msg.nr, 0));
> + ADD(GenRegister::offset(hdr, 0, 4), hdr, imm4);
> + MOV(GenRegister::offset(hdr, 1), GenRegister::ud8grf(msg.nr,
> 4));
> + ADD(GenRegister::offset(hdr, 1, 4), GenRegister::offset(hdr,
> + 1),
> imm4);
> + pop();
> + MOV(data, GenRegister::ud16grf(msg.nr+2, 0));
> + UNTYPED_WRITE(hdr, bti, elemNum, false, r);
> +
> + push();
> + curr.execWidth = 8;
> + MOV(hdr,
> GenRegister::ud8grf(msg.nr+1, 0));
> + ADD(GenRegister::offset(hdr, 0, 4), hdr, imm4);
> + MOV(GenRegister::offset(hdr, 1),
> GenRegister::ud8grf(msg.nr+1, 4));
> + ADD(GenRegister::offset(hdr, 1, 4), GenRegister::offset(hdr,
> + 1),
> imm4);
> + pop();
> + MOV(data, GenRegister::ud16grf(msg.nr+4, 0));
> + UNTYPED_WRITE(hdr, bti, elemNum, false, r);
> + return;
> + }
> GenInstruction *insn = this->next(GEN_OPCODE_SEND);
> assert(elemNum >= 1 || elemNum <= 4);
> uint32_t msg_length = 0;
> @@ -467,7 +514,17 @@ namespace gbe
> }
>
> INLINE void alu1(GenEncoder *p, uint32_t opcode, GenRegister dst,
> GenRegister src) {
> - if (needToSplitAlu1(p, dst, src) == false) {
> + if (opcode != GEN_OPCODE_MOV && dst.isdf() && src.isdf()) {
> + GenInstruction *insn = p->next(opcode);
> + p->setHeader(insn);
> + p->setDst(insn, dst);
> + p->setSrc0(insn, src);
> + int w = p->curr.execWidth / 4;
> + insn = p->next(opcode);
> + p->setHeader(insn);
> + p->setDst(insn, GenRegister::QnPhysical(dst, w));
> + p->setSrc0(insn, GenRegister::QnPhysical(src, w));
> + } else if (needToSplitAlu1(p, dst, src) == false) {
> GenInstruction *insn = p->next(opcode);
> p->setHeader(insn);
> p->setDst(insn, dst);
> @@ -499,7 +556,19 @@ namespace gbe
> GenRegister src0,
> GenRegister src1)
> {
> - if (needToSplitAlu2(p, dst, src0, src1) == false) {
> + if (dst.isdf() && src0.isdf() && src1.isdf()) {
> + GenInstruction *insn = p->next(opcode);
> + p->setHeader(insn);
> + p->setDst(insn, dst);
> + p->setSrc0(insn, src0);
> + p->setSrc1(insn, src1);
> + int w = p->curr.execWidth / 4;
> + insn = p->next(opcode);
> + p->setHeader(insn);
> + p->setDst(insn, GenRegister::QnPhysical(dst, w));
> + p->setSrc0(insn, GenRegister::QnPhysical(src0, w));
> + p->setSrc1(insn, GenRegister::QnPhysical(src1, w));
> + } else if (needToSplitAlu2(p, dst, src0, src1) == false) {
> GenInstruction *insn = p->next(opcode);
> p->setHeader(insn);
> p->setDst(insn, dst);
> @@ -620,7 +689,35 @@ namespace gbe
> alu3(this, GEN_OPCODE_##OP, dest, src0, src1, src2); \
> }
>
> - ALU1(MOV)
> + void GenEncoder::MOV(GenRegister dest, GenRegister src0, bool
> + doubleio,
> GenRegister r) {
> + if (doubleio) {
> + GenRegister r0 = GenRegister::h2(r);
> + push();
> + curr.execWidth = 8;
> + if(src0.isimmdf()) {
> + union { double d; unsigned u[2]; } u;
> + u.d = src0.value.df;
> + r0 = GenRegister::retype(r0, GEN_TYPE_UD);
> + GenRegister imm0 = GenRegister::immud(u.u[0]);
> + GenRegister imm1 = GenRegister::immud(u.u[1]);
> + MOV(r0, imm0);
> + MOV(GenRegister::suboffset(r0, 1), imm1);
> + MOV(GenRegister::offset(r0, 1), imm0);
> + MOV(GenRegister::suboffset(GenRegister::offset(r0, 1), 1),
> imm1);
> + } else {
> + MOV(r0, src0);
> + MOV(GenRegister::offset(r0, 1), GenRegister::offset(src0,
> + 0,
> 16));
> + }
> + pop();
> + if(src0.isimmdf())
> + r = GenRegister::retype(r, GEN_TYPE_DF);
> + MOV(dest, r);
> + MOV(GenRegister::offset(dest, 2), r);
> + } else {
> + alu1(this, GEN_OPCODE_MOV, dest, src0);
> + }
> + }
> +
> ALU1(RNDZ)
> ALU1(RNDE)
> ALU1(RNDD)
> diff --git a/backend/src/backend/gen_encoder.hpp
> b/backend/src/backend/gen_encoder.hpp
> index 83d83d2..3dc55b6 100644
> --- a/backend/src/backend/gen_encoder.hpp
> +++ b/backend/src/backend/gen_encoder.hpp
> @@ -89,7 +89,7 @@ namespace gbe
> #define ALU1(OP) void OP(GenRegister dest, GenRegister src0);
> #define
> ALU2(OP) void OP(GenRegister dest, GenRegister src0, GenRegister
> src1); #define ALU3(OP) void OP(GenRegister dest, GenRegister src0,
> GenRegister src1, GenRegister src2);
> - ALU1(MOV)
> + void MOV(GenRegister dest, GenRegister src0, bool doubleio =
> + false, GenRegister r = GenRegister());
> ALU1(RNDZ)
> ALU1(RNDE)
> ALU1(RNDD)
> @@ -131,9 +131,9 @@ namespace gbe
> /*! Wait instruction (used for the barrier) */
> void WAIT(void);
> /*! Untyped read (upto 4 channels) */
> - void UNTYPED_READ(GenRegister dst, GenRegister src, uint32_t bti,
> uint32_t elemNum);
> + void UNTYPED_READ(GenRegister dst, GenRegister src, uint32_t bti,
> + uint32_t elemNum, bool doubleio, GenRegister r);
> /*! Untyped write (upto 4 channels) */
> - void UNTYPED_WRITE(GenRegister src, uint32_t bti, uint32_t elemNum);
> + void UNTYPED_WRITE(GenRegister src, uint32_t bti, uint32_t
> + elemNum, bool doubleio, GenRegister r);
> /*! Byte gather (for unaligned bytes, shorts and ints) */
> void BYTE_GATHER(GenRegister dst, GenRegister src, uint32_t bti,
> uint32_t elemSize);
> /*! Byte scatter (for unaligned bytes, shorts and ints) */ diff
> --git a/backend/src/backend/gen_insn_selection.cpp
> b/backend/src/backend/gen_insn_selection.cpp
> index 88f9e94..6a845cf 100644
> --- a/backend/src/backend/gen_insn_selection.cpp
> +++ b/backend/src/backend/gen_insn_selection.cpp
> @@ -129,6 +129,7 @@ namespace gbe
> case TYPE_S32: return GEN_TYPE_D;
> case TYPE_U32: return GEN_TYPE_UD;
> case TYPE_FLOAT: return GEN_TYPE_F;
> + case TYPE_DOUBLE: return GEN_TYPE_DF;
> default: NOT_SUPPORTED; return GEN_TYPE_F;
> }
> }
> @@ -151,7 +152,7 @@ namespace gbe
>
///////////////////////////////////////////////////////////////////////////
>
> SelectionInstruction::SelectionInstruction(SelectionOpcode op,
> uint32_t
dst,
> uint32_t src) :
> - parent(NULL), opcode(op), dstNum(dst), srcNum(src)
> + parent(NULL), opcode(op), dstNum(dst), srcNum(src),
> + doubleio(false)
> {}
>
> void SelectionInstruction::prepend(SelectionInstruction &other) {
> @@
> -448,9 +449,9 @@ namespace gbe
> /*! Wait instruction (used for the barrier) */
> void WAIT(void);
> /*! Untyped read (up to 4 elements) */
> - void UNTYPED_READ(Reg addr, const GenRegister *dst, uint32_t
> elemNum, uint32_t bti);
> + void UNTYPED_READ(Reg addr, const GenRegister *dst, uint32_t
> + elemNum, uint32_t bti, bool doubleio);
> /*! Untyped write (up to 4 elements) */
> - void UNTYPED_WRITE(Reg addr, const GenRegister *src, uint32_t
> elemNum, uint32_t bti);
> + void UNTYPED_WRITE(Reg addr, const GenRegister *src, uint32_t
> + elemNum, uint32_t bti, bool doubleio);
> /*! Byte gather (for unaligned bytes, shorts and ints) */
> void BYTE_GATHER(Reg dst, Reg addr, uint32_t elemSize, uint32_t bti);
> /*! Byte scatter (for unaligned bytes, shorts and ints) */ @@
> -655,6
> +656,7 @@ namespace gbe
> case FAMILY_WORD: SEL_REG(uw16grf, uw8grf, uw1grf); break;
> case FAMILY_BYTE: SEL_REG(ub16grf, ub8grf, ub1grf); break;
> case FAMILY_DWORD: SEL_REG(f16grf, f8grf, f1grf); break;
> + case FAMILY_QWORD: SEL_REG(df16grf, df8grf, df1grf); break;
> default: NOT_SUPPORTED;
> }
> GBE_ASSERT(false);
> @@ -715,7 +717,8 @@ namespace gbe
> void Selection::Opaque::UNTYPED_READ(Reg addr,
> const GenRegister *dst,
> uint32_t elemNum,
> - uint32_t bti)
> + uint32_t bti,
> + bool doubleio)
> {
> SelectionInstruction *insn =
> this->appendInsn(SEL_OP_UNTYPED_READ,
> elemNum, 1);
> SelectionVector *srcVector = this->appendVector(); @@ -727,11
> +730,13 @@ namespace gbe
> insn->src(0) = addr;
> insn->extra.function = bti;
> insn->extra.elem = elemNum;
> + insn->doubleio = doubleio;
>
> // Sends require contiguous allocation
> dstVector->regNum = elemNum;
> dstVector->isSrc = 0;
> dstVector->reg = &insn->dst(0);
> + dstVector->doubleio = doubleio;
>
> // Source cannot be scalar (yet)
> srcVector->regNum = 1;
> @@ -742,7 +747,8 @@ namespace gbe
> void Selection::Opaque::UNTYPED_WRITE(Reg addr,
> const GenRegister *src,
> uint32_t elemNum,
> - uint32_t bti)
> + uint32_t bti,
> + bool doubleio)
> {
> SelectionInstruction *insn =
> this->appendInsn(SEL_OP_UNTYPED_WRITE,
> 0, elemNum+1);
> SelectionVector *vector = this->appendVector(); @@ -753,11
> +759,13 @@ namespace gbe
> insn->src(elemID+1) = src[elemID];
> insn->extra.function = bti;
> insn->extra.elem = elemNum;
> + insn->doubleio = doubleio;
>
> // Sends require contiguous allocation for the sources
> vector->regNum = elemNum+1;
> vector->reg = &insn->src(0);
> vector->isSrc = 1;
> + vector->doubleio = doubleio;
> }
>
> void Selection::Opaque::BYTE_GATHER(Reg dst, Reg addr, uint32_t
> elemSize, uint32_t bti) { @@ -1085,6 +1093,15 @@ namespace gbe
> // Implementation of all patterns
>
///////////////////////////////////////////////////////////////////////////
>
> + bool canGetRegisterFromImmediate(const ir::Instruction &insn) {
> + using namespace ir;
> + const auto &childInsn = cast<LoadImmInstruction>(insn);
> + const auto &imm = childInsn.getImmediate();
> + if(imm.type != TYPE_DOUBLE)
> + return true;
> + return false;
> + }
> +
> GenRegister getRegisterFromImmediate(ir::Immediate imm)
> {
> using namespace ir;
> @@ -1096,6 +1113,7 @@ namespace gbe
> case TYPE_S16: return GenRegister::immw(imm.data.s16);
> case TYPE_U8: return GenRegister::immuw(imm.data.u8);
> case TYPE_S8: return GenRegister::immw(imm.data.s8);
> + case TYPE_DOUBLE: return GenRegister::immdf(imm.data.f64);
> default: NOT_SUPPORTED; return GenRegister::immuw(0);
> }
> }
> @@ -1218,14 +1236,14 @@ namespace gbe
> SelectionDAG *dag1 = dag.child[1];
>
> // Right source can always be an immediate
> - if (OCL_OPTIMIZE_IMMEDIATE && dag1 != NULL &&
> dag1->insn.getOpcode() == OP_LOADI) {
> + if (OCL_OPTIMIZE_IMMEDIATE && dag1 != NULL &&
> + dag1->insn.getOpcode() == OP_LOADI &&
> + canGetRegisterFromImmediate(dag1->insn)) {
> const auto &childInsn = cast<LoadImmInstruction>(dag1->insn);
> src0 = sel.selReg(insn.getSrc(0), type);
> src1 = getRegisterFromImmediate(childInsn.getImmediate());
> if (dag0) dag0->isRoot = 1;
> }
> // Left source cannot be immediate but it is OK if we can commute
> - else if (OCL_OPTIMIZE_IMMEDIATE && dag0 != NULL &&
> insn.commutes() && dag0->insn.getOpcode() == OP_LOADI) {
> + else if (OCL_OPTIMIZE_IMMEDIATE && dag0 != NULL &&
> + insn.commutes() && dag0->insn.getOpcode() == OP_LOADI &&
> + canGetRegisterFromImmediate(dag0->insn)) {
> const auto &childInsn = cast<LoadImmInstruction>(dag0->insn);
> src0 = sel.selReg(insn.getSrc(1), type);
> src1 = getRegisterFromImmediate(childInsn.getImmediate());
> @@ -1261,7 +1279,7 @@ namespace gbe
> case OP_SHR: sel.SHR(dst, src0, src1); break;
> case OP_ASR: sel.ASR(dst, src0, src1); break;
> case OP_MUL:
> - if (type == TYPE_FLOAT)
> + if (type == TYPE_FLOAT || type == TYPE_DOUBLE)
> sel.MUL(dst, src0, src1);
> else if (type == TYPE_U32 || type == TYPE_S32) {
> sel.pop();
> @@ -1592,6 +1610,7 @@ namespace gbe
> case TYPE_S16: sel.MOV(dst, GenRegister::immw(imm.data.s16));
> break;
> case TYPE_U8: sel.MOV(dst, GenRegister::immuw(imm.data.u8));
> break;
> case TYPE_S8: sel.MOV(dst, GenRegister::immw(imm.data.s8));
> break;
> + case TYPE_DOUBLE: sel.MOV(dst,
> + GenRegister::immdf(imm.data.f64)); break;
> default: NOT_SUPPORTED;
> }
> sel.pop();
> @@ -1639,6 +1658,7 @@ namespace gbe
> INLINE uint32_t getByteScatterGatherSize(ir::Type type) {
> using namespace ir;
> switch (type) {
> + case TYPE_DOUBLE:
> case TYPE_FLOAT:
> case TYPE_U32:
> case TYPE_S32:
> @@ -1665,9 +1685,10 @@ namespace gbe
> using namespace ir;
> const uint32_t valueNum = insn.getValueNum();
> vector<GenRegister> dst(valueNum);
> + bool doubleio = insn.getValueType() == TYPE_DOUBLE ? true :
> + false;
> for (uint32_t dstID = 0; dstID < valueNum; ++dstID)
> dst[dstID] =
GenRegister::retype(sel.selReg(insn.getValue(dstID)),
> GEN_TYPE_F);
> - sel.UNTYPED_READ(addr, dst.data(), valueNum, bti);
> + sel.UNTYPED_READ(addr, dst.data(), valueNum, bti, doubleio);
> }
>
> void emitByteGather(Selection::Opaque &sel, @@ -1744,11 +1765,12
> @@ namespace gbe
> const uint32_t addrID = ir::StoreInstruction::addressIndex;
> GenRegister addr;
> vector<GenRegister> value(valueNum);
> + bool doubleio = insn.getValueType() == TYPE_DOUBLE ? true :
> + false;
>
> addr = GenRegister::retype(sel.selReg(insn.getSrc(addrID)),
> GEN_TYPE_F);;
> for (uint32_t valueID = 0; valueID < valueNum; ++valueID)
> value[valueID] =
> GenRegister::retype(sel.selReg(insn.getValue(valueID)), GEN_TYPE_F);
> - sel.UNTYPED_WRITE(addr, value.data(), valueNum, bti);
> + sel.UNTYPED_WRITE(addr, value.data(), valueNum, bti, doubleio);
> }
>
> void emitByteScatter(Selection::Opaque &sel, @@ -1828,7 +1850,7
> @@ namespace gbe
> SelectionDAG *dag1 = dag.child[1];
>
> // Right source can always be an immediate
> - if (OCL_OPTIMIZE_IMMEDIATE && dag1 != NULL &&
> dag1->insn.getOpcode() == OP_LOADI) {
> + if (OCL_OPTIMIZE_IMMEDIATE && dag1 != NULL &&
> + dag1->insn.getOpcode() == OP_LOADI &&
> + canGetRegisterFromImmediate(dag1->insn)) {
> const auto &childInsn = cast<LoadImmInstruction>(dag1->insn);
> src0 = sel.selReg(insn.getSrc(0), type);
> src1 = getRegisterFromImmediate(childInsn.getImmediate());
> @@ -1862,7 +1884,7 @@ namespace gbe
> const GenRegister src = sel.selReg(insn.getSrc(0), srcType);
>
> // We need two instructions to make the conversion
> - if (dstFamily != FAMILY_DWORD && srcFamily == FAMILY_DWORD) {
> + if (dstFamily != FAMILY_DWORD && dstFamily != FAMILY_QWORD &&
> + srcFamily == FAMILY_DWORD) {
> GenRegister unpacked;
> if (dstFamily == FAMILY_WORD) {
> const uint32_t type = TYPE_U16 ? GEN_TYPE_UW : GEN_TYPE_W;
> diff --git a/backend/src/backend/gen_insn_selection.hpp
> b/backend/src/backend/gen_insn_selection.hpp
> index 778eb1f..5f7289d 100644
> --- a/backend/src/backend/gen_insn_selection.hpp
> +++ b/backend/src/backend/gen_insn_selection.hpp
> @@ -116,6 +116,8 @@ namespace gbe
> uint8_t srcNum:5;
> /*! To store various indices */
> uint16_t index;
> + /*! Double size I/O ? */
> + bool doubleio;
> /*! Variable sized. Destinations and sources go here */
> GenRegister regs[0];
> private:
> @@ -138,6 +140,8 @@ namespace gbe
> uint16_t regNum;
> /*! Indicate if this a destination or a source vector */
> uint16_t isSrc;
> + /*! "double IO" requires two more register */
> + bool doubleio;
> };
>
> // Owns the selection block
> diff --git a/backend/src/backend/gen_reg_allocation.cpp
> b/backend/src/backend/gen_reg_allocation.cpp
> index 469be12..6b63c41 100644
> --- a/backend/src/backend/gen_reg_allocation.cpp
> +++ b/backend/src/backend/gen_reg_allocation.cpp
> @@ -454,7 +454,6 @@ namespace gbe
> }
>
> bool GenRegAllocator::Opaque::allocateGRFs(Selection &selection) {
> -
> // Perform the linear scan allocator
> const uint32_t regNum = ctx.sel->getRegNum();
> for (uint32_t startID = 0; startID < regNum; ++startID) { @@
> -472,7
> +471,9 @@ namespace gbe
> const SelectionVector *vector = it->second.first;
> const uint32_t simdWidth = ctx.getSimdWidth();
> const uint32_t alignment = simdWidth * sizeof(uint32_t);
> - const uint32_t size = vector->regNum * alignment;
> + uint32_t size = vector->regNum * alignment;
> + if (vector->doubleio)
> + size += alignment;
> uint32_t grfOffset;
> while ((grfOffset = ctx.allocate(size, alignment)) == 0) {
> const bool success = this->expireGRF(interval); @@ -667,6
> +668,11 @@ namespace gbe
> // First we try to put all booleans registers into flags
> this->allocateFlags(selection);
>
> + int w = ctx.getSimdWidth();
> + int offst = ctx.allocate(w * sizeof(int) * 2, w * sizeof(int));
> + GBE_ASSERT(offst != 0);
> + RA.insert(std::make_pair(ocl::doubleio, offst));
> +
> // Allocate all the GRFs now (regular register and boolean that
> are
not in
> // flag registers)
> return this->allocateGRFs(selection); diff --git
> a/backend/src/backend/gen_register.hpp
> b/backend/src/backend/gen_register.hpp
> index d772b0d..5870b07 100644
> --- a/backend/src/backend/gen_register.hpp
> +++ b/backend/src/backend/gen_register.hpp
> @@ -192,6 +192,7 @@ namespace gbe
>
> /*! For immediates or virtual register */
> union {
> + double df;
> float f;
> int32_t d;
> uint32_t ud;
> @@ -211,6 +212,31 @@ namespace gbe
> uint32_t quarter:1; //!< To choose which part we want (Q1 / Q2)
> uint32_t address_mode:1; //!< direct or indirect
>
> + static INLINE GenRegister offset(GenRegister reg, int nr, int
> + subnr =
0) {
> + GenRegister r = reg;
> + r.nr += nr;
> + r.subnr += subnr;
> + return r;
> + }
> +
> + INLINE bool isimmdf(void) const {
> + if (type == GEN_TYPE_DF && file == GEN_IMMEDIATE_VALUE)
> + return true;
> + return false;
> + }
> +
> + INLINE bool isdf(void) const {
> + if (type == GEN_TYPE_DF && file == GEN_GENERAL_REGISTER_FILE)
> + return true;
> + return false;
> + }
> +
> + static INLINE GenRegister h2(GenRegister reg) {
> + GenRegister r = reg;
> + r.hstride = GEN_HORIZONTAL_STRIDE_2;
> + return r;
> + }
> +
> static INLINE GenRegister QnVirtual(GenRegister reg, uint32_t
quarter) {
> GBE_ASSERT(reg.physical == 0);
> if (reg.hstride == GEN_HORIZONTAL_STRIDE_0) // scalar register
> @@
> -293,6 +319,18 @@ namespace gbe
> return reg;
> }
>
> + static INLINE GenRegister df16(uint32_t file, ir::Register reg) {
> + return retype(vec16(file, reg), GEN_TYPE_DF);
> + }
> +
> + static INLINE GenRegister df8(uint32_t file, ir::Register reg) {
> + return retype(vec8(file, reg), GEN_TYPE_DF);
> + }
> +
> + static INLINE GenRegister df1(uint32_t file, ir::Register reg) {
> + return retype(vec1(file, reg), GEN_TYPE_DF);
> + }
> +
> static INLINE GenRegister ud16(uint32_t file, ir::Register reg) {
> return retype(vec16(file, reg), GEN_TYPE_UD);
> }
> @@ -371,6 +409,12 @@ namespace gbe
> GEN_HORIZONTAL_STRIDE_0);
> }
>
> + static INLINE GenRegister immdf(double df) {
> + GenRegister immediate = imm(GEN_TYPE_DF);
> + immediate.value.df = df;
> + return immediate;
> + }
> +
> static INLINE GenRegister immf(float f) {
> GenRegister immediate = imm(GEN_TYPE_F);
> immediate.value.f = f;
> @@ -448,6 +492,18 @@ namespace gbe
> return vec16(GEN_GENERAL_REGISTER_FILE, reg);
> }
>
> + static INLINE GenRegister df1grf(ir::Register reg) {
> + return df1(GEN_GENERAL_REGISTER_FILE, reg);
> + }
> +
> + static INLINE GenRegister df8grf(ir::Register reg) {
> + return df8(GEN_GENERAL_REGISTER_FILE, reg);
> + }
> +
> + static INLINE GenRegister df16grf(ir::Register reg) {
> + return df16(GEN_GENERAL_REGISTER_FILE, reg);
> + }
> +
> static INLINE GenRegister ud16grf(ir::Register reg) {
> return ud16(GEN_GENERAL_REGISTER_FILE, reg);
> }
> @@ -613,6 +669,18 @@ namespace gbe
> return reg;
> }
>
> + static INLINE GenRegister df16(uint32_t file, uint32_t nr,
> + uint32_t
subnr)
> {
> + return retype(vec16(file, nr, subnr), GEN_TYPE_DF);
> + }
> +
> + static INLINE GenRegister df8(uint32_t file, uint32_t nr,
> + uint32_t
subnr) {
> + return retype(vec8(file, nr, subnr), GEN_TYPE_DF);
> + }
> +
> + static INLINE GenRegister df1(uint32_t file, uint32_t nr,
> + uint32_t
subnr) {
> + return retype(vec1(file, nr, subnr), GEN_TYPE_DF);
> + }
> +
> static INLINE GenRegister ud16(uint32_t file, uint32_t nr,
> uint32_t
subnr)
> {
> return retype(vec16(file, nr, subnr), GEN_TYPE_UD);
> }
> @@ -685,6 +753,18 @@ namespace gbe
> return vec16(GEN_GENERAL_REGISTER_FILE, nr, subnr);
> }
>
> + static INLINE GenRegister df16grf(uint32_t nr, uint32_t subnr) {
> + return df16(GEN_GENERAL_REGISTER_FILE, nr, subnr);
> + }
> +
> + static INLINE GenRegister df8grf(uint32_t nr, uint32_t subnr) {
> + return df8(GEN_GENERAL_REGISTER_FILE, nr, subnr);
> + }
> +
> + static INLINE GenRegister df1grf(uint32_t nr, uint32_t subnr) {
> + return df1(GEN_GENERAL_REGISTER_FILE, nr, subnr);
> + }
> +
> static INLINE GenRegister ud16grf(uint32_t nr, uint32_t subnr) {
> return ud16(GEN_GENERAL_REGISTER_FILE, nr, subnr);
> }
> diff --git a/backend/src/ir/profile.cpp b/backend/src/ir/profile.cpp
> index 99cd06c..9a17f1d 100644
> --- a/backend/src/ir/profile.cpp
> +++ b/backend/src/ir/profile.cpp
> @@ -41,6 +41,7 @@ namespace ir {
> "block_ip",
> "barrier_id", "thread_number",
> "const_curbe_offset",
> + "double_io",
> };
>
> #if GBE_DEBUG
> @@ -77,6 +78,7 @@ namespace ir {
> DECL_NEW_REG(FAMILY_DWORD, threadn);
> DECL_NEW_REG(FAMILY_DWORD, constoffst);
> DECL_NEW_REG(FAMILY_DWORD, workdim);
> + DECL_NEW_REG(FAMILY_DWORD, doubleio);
> }
> #undef DECL_NEW_REG
>
> diff --git a/backend/src/ir/profile.hpp b/backend/src/ir/profile.hpp
> index 4b0ef5e..029b47c 100644
> --- a/backend/src/ir/profile.hpp
> +++ b/backend/src/ir/profile.hpp
> @@ -65,7 +65,8 @@ namespace ir {
> static const Register threadn = Register(21); // number of threads
> static const Register constoffst = Register(22); // offset of
> global constant array's curbe
> static const Register workdim = Register(23); // work dimention.
> - static const uint32_t regNum = 24; // number of special
> registers
> + static const Register doubleio = Register(24); // work dimention.
> + static const uint32_t regNum = 25; // number of special
> registers
> extern const char *specialRegMean[]; // special register
> name.
> } /* namespace ocl */
>
> --
> 1.8.1.2
>
> _______________________________________________
> Beignet mailing list
> Beignet at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/beignet
More information about the Beignet
mailing list