[Beignet] [PATCH 1/2] Support 64-bit float
Zhigang Gong
zhigang.gong at linux.intel.com
Tue Jun 18 02:10:45 PDT 2013
From: Homer Hsing <homer.xing at intel.com>
support arithmetic, store, load, and 64-bit float immediate
example:
kernel void f(global double *src, global double *dst) {
int i = get_global_id(0);
double d = 1.234567890123456789;
dst[i] = d * (src[i] + d);
}
Signed-off-by: Homer Hsing <homer.xing at intel.com>
---
backend/src/backend/gen_context.cpp | 17 +++
backend/src/backend/gen_context.hpp | 2 +
backend/src/backend/gen_defs.hpp | 2 +
backend/src/backend/gen_encoder.cpp | 159 ++++++++++++++++++++-
backend/src/backend/gen_encoder.hpp | 6 +
.../src/backend/gen_insn_gen7_schedule_info.hxx | 2 +
backend/src/backend/gen_insn_selection.cpp | 148 +++++++++++++++++--
backend/src/backend/gen_insn_selection.hxx | 4 +
backend/src/backend/gen_reg_allocation.cpp | 1 -
backend/src/backend/gen_register.hpp | 90 +++++++++++-
10 files changed, 414 insertions(+), 17 deletions(-)
diff --git a/backend/src/backend/gen_context.cpp b/backend/src/backend/gen_context.cpp
index af651e7..91ae72b 100644
--- a/backend/src/backend/gen_context.cpp
+++ b/backend/src/backend/gen_context.cpp
@@ -139,6 +139,7 @@ namespace gbe
const GenRegister src = ra->genReg(insn.src(0));
switch (insn.opcode) {
case SEL_OP_MOV: p->MOV(dst, src); break;
+ case SEL_OP_LOAD_DF_IMM: p->LOAD_DF_IMM(dst, src); break;
case SEL_OP_NOT: p->NOT(dst, src); break;
case SEL_OP_RNDD: p->RNDD(dst, src); break;
case SEL_OP_RNDU: p->RNDU(dst, src); break;
@@ -153,6 +154,7 @@ namespace gbe
const GenRegister src0 = ra->genReg(insn.src(0));
const GenRegister src1 = ra->genReg(insn.src(1));
switch (insn.opcode) {
+ case SEL_OP_MOV_DF: p->MOV_DF(dst, src0, src1); break;
case SEL_OP_SEL: p->SEL(dst, src0, src1); break;
case SEL_OP_AND: p->AND(dst, src0, src1); break;
case SEL_OP_OR: p->OR (dst, src0, src1); break;
@@ -264,6 +266,14 @@ namespace gbe
p->pop();
}
+ void GenContext::emitReadFloat64Instruction(const SelectionInstruction &insn) {
+ const GenRegister dst = ra->genReg(insn.dst(0));
+ const GenRegister src = ra->genReg(insn.src(0));
+ const uint32_t bti = insn.extra.function;
+ const uint32_t elemNum = insn.extra.elem;
+ p->READ_FLOAT64(dst, src, bti, elemNum);
+ }
+
void GenContext::emitUntypedReadInstruction(const SelectionInstruction &insn) {
const GenRegister dst = ra->genReg(insn.dst(0));
const GenRegister src = ra->genReg(insn.src(0));
@@ -272,6 +282,13 @@ namespace gbe
p->UNTYPED_READ(dst, src, bti, elemNum);
}
+ void GenContext::emitWriteFloat64Instruction(const SelectionInstruction &insn) {
+ const GenRegister src = ra->genReg(insn.src(0));
+ const uint32_t bti = insn.extra.function;
+ const uint32_t elemNum = insn.extra.elem;
+ p->WRITE_FLOAT64(src, bti, elemNum);
+ }
+
void GenContext::emitUntypedWriteInstruction(const SelectionInstruction &insn) {
const GenRegister src = ra->genReg(insn.src(0));
const uint32_t bti = insn.extra.function;
diff --git a/backend/src/backend/gen_context.hpp b/backend/src/backend/gen_context.hpp
index 1566cbb..804384d 100644
--- a/backend/src/backend/gen_context.hpp
+++ b/backend/src/backend/gen_context.hpp
@@ -87,6 +87,8 @@ namespace gbe
void emitBarrierInstruction(const SelectionInstruction &insn);
void emitFenceInstruction(const SelectionInstruction &insn);
void emitMathInstruction(const SelectionInstruction &insn);
+ void emitReadFloat64Instruction(const SelectionInstruction &insn);
+ void emitWriteFloat64Instruction(const SelectionInstruction &insn);
void emitUntypedReadInstruction(const SelectionInstruction &insn);
void emitUntypedWriteInstruction(const SelectionInstruction &insn);
void emitByteGatherInstruction(const SelectionInstruction &insn);
diff --git a/backend/src/backend/gen_defs.hpp b/backend/src/backend/gen_defs.hpp
index f4e4938..0bfd66c 100644
--- a/backend/src/backend/gen_defs.hpp
+++ b/backend/src/backend/gen_defs.hpp
@@ -215,6 +215,7 @@ enum GenMessageTarget {
#define GEN_TYPE_VF 5 /* packed float vector, immediates only? */
#define GEN_TYPE_HF 6
#define GEN_TYPE_V 6 /* packed int vector, immediates only, uword dest only */
+#define GEN_TYPE_DF 6
#define GEN_TYPE_F 7
#define GEN_ARF_NULL 0x00
@@ -303,6 +304,7 @@ enum GenMessageTarget {
#define GEN_BYTE_SCATTER_BYTE 0
#define GEN_BYTE_SCATTER_WORD 1
#define GEN_BYTE_SCATTER_DWORD 2
+#define GEN_BYTE_SCATTER_QWORD 3
#define GEN_SAMPLER_RETURN_FORMAT_FLOAT32 0
#define GEN_SAMPLER_RETURN_FORMAT_UINT32 2
diff --git a/backend/src/backend/gen_encoder.cpp b/backend/src/backend/gen_encoder.cpp
index 859a1b9..190e39e 100644
--- a/backend/src/backend/gen_encoder.cpp
+++ b/backend/src/backend/gen_encoder.cpp
@@ -355,6 +355,77 @@ namespace gbe
0
};
+ void GenEncoder::READ_FLOAT64(GenRegister dst, GenRegister src, uint32_t bti, uint32_t elemNum) {
+ int w = curr.execWidth;
+ GenRegister r = GenRegister::retype(GenRegister::suboffset(src, w*2), GEN_TYPE_UD);
+ GenRegister hdr = GenRegister::h2(r);
+ GenRegister imm4 = GenRegister::immud(4);
+ push();
+ curr.execWidth = 8;
+ MOV(hdr, GenRegister::ud8grf(src.nr, 0));
+ ADD(GenRegister::offset(hdr, 0, 4), hdr, imm4);
+ if (w == 16) {
+ curr.quarterControl = 1;
+ MOV(GenRegister::offset(hdr, 1), GenRegister::ud8grf(src.nr, 4));
+ ADD(GenRegister::offset(hdr, 1, 4), GenRegister::offset(hdr, 1), imm4);
+ }
+ pop();
+ UNTYPED_READ(dst, hdr, bti, 1);
+
+ push();
+ curr.execWidth = 8;
+ MOV(hdr, w == 16 ? GenRegister::ud8grf(src.nr+1, 0) : GenRegister::retype(GenRegister::offset(src, 0, 16), GEN_TYPE_UD));
+ ADD(GenRegister::offset(hdr, 0, 4), hdr, imm4);
+ if (w == 16) {
+ curr.quarterControl = 1;
+ MOV(GenRegister::offset(hdr, 1), GenRegister::ud8grf(src.nr + 1, 4));
+ ADD(GenRegister::offset(hdr, 1, 4), GenRegister::offset(hdr, 1), imm4);
+ }
+ pop();
+ push();
+ curr.quarterControl = 1;
+ UNTYPED_READ(GenRegister::offset(dst, w / 8), hdr, bti, 1);
+ pop();
+ }
+
+ void GenEncoder::WRITE_FLOAT64(GenRegister msg, uint32_t bti, uint32_t elemNum) {
+ int w = curr.execWidth;
+ GenRegister r = GenRegister::retype(GenRegister::suboffset(msg, w*3), GEN_TYPE_UD);
+ r.type = GEN_TYPE_UD;
+ GenRegister hdr = GenRegister::h2(r);
+ GenRegister data = GenRegister::offset(r, w / 8);
+ GenRegister imm4 = GenRegister::immud(4);
+ push();
+ curr.execWidth = 8;
+ MOV(hdr, GenRegister::ud8grf(msg.nr, 0));
+ ADD(GenRegister::offset(hdr, 0, 4), hdr, imm4);
+ if (w == 16) {
+ curr.quarterControl = 1;
+ MOV(GenRegister::offset(hdr, 1), GenRegister::ud8grf(msg.nr, 4));
+ ADD(GenRegister::offset(hdr, 1, 4), GenRegister::offset(hdr, 1), imm4);
+ }
+ pop();
+
+ MOV(data, GenRegister::ud16grf(msg.nr + w / 8, 0));
+ UNTYPED_WRITE(hdr, bti, 1);
+
+ push();
+ curr.execWidth = 8;
+ MOV(hdr, w == 16 ? GenRegister::ud8grf(msg.nr+1, 0) : GenRegister::retype(GenRegister::offset(msg, 0, 16), GEN_TYPE_UD));
+ ADD(GenRegister::offset(hdr, 0, 4), hdr, imm4);
+ if (w == 16) {
+ curr.quarterControl = 1;
+ MOV(GenRegister::offset(hdr, 1), GenRegister::ud8grf(msg.nr+1, 4));
+ ADD(GenRegister::offset(hdr, 1, 4), GenRegister::offset(hdr, 1), imm4);
+ }
+ pop();
+ push();
+ curr.quarterControl = 1;
+ MOV(data, GenRegister::ud16grf(msg.nr + w / 4, 0));
+ UNTYPED_WRITE(hdr, bti, 1);
+ pop();
+ }
+
void GenEncoder::UNTYPED_READ(GenRegister dst, GenRegister src, uint32_t bti, uint32_t elemNum) {
GenInstruction *insn = this->next(GEN_OPCODE_SEND);
assert(elemNum >= 1 || elemNum <= 4);
@@ -467,7 +538,21 @@ namespace gbe
}
INLINE void alu1(GenEncoder *p, uint32_t opcode, GenRegister dst, GenRegister src) {
- if (needToSplitAlu1(p, dst, src) == false) {
+ if (dst.isdf() && src.isdf()) {
+ int w = p->curr.execWidth;
+ p->push();
+ GenInstruction *insn = p->next(opcode);
+ insn->header.quarter_control = GEN_COMPRESSION_H1;
+ p->setHeader(insn);
+ p->setDst(insn, dst);
+ p->setSrc0(insn, src);
+ insn = p->next(opcode);
+ insn->header.quarter_control = GEN_COMPRESSION_H2;
+ p->setHeader(insn);
+ p->setDst(insn, GenRegister::suboffset(dst, w / 2));
+ p->setSrc0(insn, GenRegister::suboffset(src, w / 2));
+ p->pop();
+ } else if (needToSplitAlu1(p, dst, src) == false) {
GenInstruction *insn = p->next(opcode);
p->setHeader(insn);
p->setDst(insn, dst);
@@ -499,7 +584,23 @@ namespace gbe
GenRegister src0,
GenRegister src1)
{
- if (needToSplitAlu2(p, dst, src0, src1) == false) {
+ if (dst.isdf() && src0.isdf() && src1.isdf()) {
+ int w = p->curr.execWidth;
+ p->push();
+ GenInstruction *insn = p->next(opcode);
+ insn->header.quarter_control = GEN_COMPRESSION_H1;
+ p->setHeader(insn);
+ p->setDst(insn, dst);
+ p->setSrc0(insn, src0);
+ p->setSrc1(insn, src1);
+ insn = p->next(opcode);
+ insn->header.quarter_control = GEN_COMPRESSION_H2;
+ p->setHeader(insn);
+ p->setDst(insn, GenRegister::suboffset(dst, w / 2));
+ p->setSrc0(insn, GenRegister::suboffset(src0, w / 2));
+ p->setSrc1(insn, GenRegister::suboffset(src1, w / 2));
+ p->pop();
+ } else if (needToSplitAlu2(p, dst, src0, src1) == false) {
GenInstruction *insn = p->next(opcode);
p->setHeader(insn);
p->setDst(insn, dst);
@@ -620,6 +721,60 @@ namespace gbe
alu3(this, GEN_OPCODE_##OP, dest, src0, src1, src2); \
}
+ void GenEncoder::LOAD_DF_IMM(GenRegister dest, GenRegister src0) {
+ union { double d; unsigned u[2]; } u;
+ u.d = src0.value.df;
+ GenRegister r = GenRegister::retype(dest, GEN_TYPE_UD);
+ push();
+ curr.execWidth = 1;
+ MOV(r, GenRegister::immud(u.u[1]));
+ MOV(GenRegister::suboffset(r, 1), GenRegister::immud(u.u[0]));
+ pop();
+ r.type = GEN_TYPE_DF;
+ r.vstride = GEN_VERTICAL_STRIDE_0;
+ r.width = GEN_WIDTH_1;
+ r.hstride = GEN_HORIZONTAL_STRIDE_0;
+ push();
+ MOV(dest, r);
+ pop();
+ }
+
+ void GenEncoder::MOV_DF(GenRegister dest, GenRegister src0, GenRegister r) {
+ int w = curr.execWidth;
+ if (src0.isdf()) {
+ push();
+ curr.execWidth = 16;
+ MOV(dest, src0);
+ if (w == 16)
+ curr.quarterControl = 1;
+ MOV(GenRegister::QnPhysical(dest, w / 4), GenRegister::QnPhysical(src0, w / 4));
+ pop();
+ } else {
+ GenRegister r0 = GenRegister::h2(r);
+ push();
+ curr.execWidth = 8;
+ MOV(r0, src0);
+ MOV(GenRegister::suboffset(r0, 8), GenRegister::suboffset(src0, 4));
+ pop();
+ push();
+ curr.execWidth = 16;
+ MOV(dest, r);
+ pop();
+ if (w == 16) {
+ push();
+ curr.execWidth = 8;
+ MOV(r0, GenRegister::suboffset(src0, 8));
+ MOV(GenRegister::suboffset(r0, 8), GenRegister::suboffset(src0, 12));
+ pop();
+ push();
+ curr.execWidth = 16;
+ curr.quarterControl = 1;
+ MOV(GenRegister::suboffset(dest, 8), r);
+ pop();
+ }
+ }
+ }
+
ALU1(MOV)
ALU1(RNDZ)
ALU1(RNDE)
diff --git a/backend/src/backend/gen_encoder.hpp b/backend/src/backend/gen_encoder.hpp
index c98774f..5e5c9e1 100644
--- a/backend/src/backend/gen_encoder.hpp
+++ b/backend/src/backend/gen_encoder.hpp
@@ -113,6 +113,8 @@ namespace gbe
ALU2(LINE)
ALU2(PLN)
ALU3(MAD)
+ ALU1(LOAD_DF_IMM);
+ ALU2(MOV_DF);
#undef ALU1
#undef ALU2
#undef ALU3
@@ -132,6 +134,10 @@ namespace gbe
void NOP(void);
/*! Wait instruction (used for the barrier) */
void WAIT(void);
+ /*! Read 64-bits float arrays */
+ void READ_FLOAT64(GenRegister dst, GenRegister src, uint32_t bti, uint32_t elemNum);
+ /*! Write 64-bits float arrays */
+ void WRITE_FLOAT64(GenRegister src, uint32_t bti, uint32_t elemNum);
/*! Untyped read (upto 4 channels) */
void UNTYPED_READ(GenRegister dst, GenRegister src, uint32_t bti, uint32_t elemNum);
/*! Untyped write (upto 4 channels) */
diff --git a/backend/src/backend/gen_insn_gen7_schedule_info.hxx b/backend/src/backend/gen_insn_gen7_schedule_info.hxx
index 098d9ec..a3b4621 100644
--- a/backend/src/backend/gen_insn_gen7_schedule_info.hxx
+++ b/backend/src/backend/gen_insn_gen7_schedule_info.hxx
@@ -12,6 +12,8 @@ DECL_GEN7_SCHEDULE(Wait, 20, 2, 2)
DECL_GEN7_SCHEDULE(Math, 20, 4, 2)
DECL_GEN7_SCHEDULE(Barrier, 80, 1, 1)
DECL_GEN7_SCHEDULE(Fence, 80, 1, 1)
+DECL_GEN7_SCHEDULE(ReadFloat64, 80, 1, 1)
+DECL_GEN7_SCHEDULE(WriteFloat64, 80, 1, 1)
DECL_GEN7_SCHEDULE(UntypedRead, 80, 1, 1)
DECL_GEN7_SCHEDULE(UntypedWrite, 80, 1, 1)
DECL_GEN7_SCHEDULE(ByteGather, 80, 1, 1)
diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp
index 4e7cebd..9fb21b9 100644
--- a/backend/src/backend/gen_insn_selection.cpp
+++ b/backend/src/backend/gen_insn_selection.cpp
@@ -129,6 +129,7 @@ namespace gbe
case TYPE_S32: return GEN_TYPE_D;
case TYPE_U32: return GEN_TYPE_UD;
case TYPE_FLOAT: return GEN_TYPE_F;
+ case TYPE_DOUBLE: return GEN_TYPE_DF;
default: NOT_SUPPORTED; return GEN_TYPE_F;
}
}
@@ -166,11 +167,13 @@ namespace gbe
bool SelectionInstruction::isRead(void) const {
return this->opcode == SEL_OP_UNTYPED_READ ||
+ this->opcode == SEL_OP_READ_FLOAT64 ||
this->opcode == SEL_OP_BYTE_GATHER;
}
bool SelectionInstruction::isWrite(void) const {
return this->opcode == SEL_OP_UNTYPED_WRITE ||
+ this->opcode == SEL_OP_WRITE_FLOAT64 ||
this->opcode == SEL_OP_BYTE_SCATTER;
}
@@ -406,6 +409,8 @@ namespace gbe
#define ALU3(OP) \
INLINE void OP(Reg dst, Reg src0, Reg src1, Reg src2) { ALU3(SEL_OP_##OP, dst, src0, src1, src2); }
ALU1(MOV)
+ ALU2(MOV_DF)
+ ALU1(LOAD_DF_IMM)
ALU1(RNDZ)
ALU1(RNDE)
ALU2(SEL)
@@ -449,6 +454,10 @@ namespace gbe
void NOP(void);
/*! Wait instruction (used for the barrier) */
void WAIT(void);
+ /*! Read 64 bits float array */
+ void READ_FLOAT64(Reg addr, const GenRegister *dst, uint32_t elemNum, uint32_t bti);
+ /*! Write 64 bits float array */
+ void WRITE_FLOAT64(Reg addr, const GenRegister *src, uint32_t elemNum, uint32_t bti);
/*! Untyped read (up to 4 elements) */
void UNTYPED_READ(Reg addr, const GenRegister *dst, uint32_t elemNum, uint32_t bti);
/*! Untyped write (up to 4 elements) */
@@ -610,20 +619,25 @@ namespace gbe
ir::Register Selection::Opaque::replaceDst(SelectionInstruction *insn, uint32_t regID) {
SelectionBlock *block = insn->parent;
- const uint32_t simdWidth = ctx.getSimdWidth();
+ uint32_t simdWidth = ctx.getSimdWidth();
ir::Register tmp;
+ ir::RegisterFamily f = file.get(insn->dst(regID).reg()).family;
+ int genType = f == ir::FAMILY_QWORD ? GEN_TYPE_DF : GEN_TYPE_F;
+ GenRegister gr;
// This will append the temporary register in the instruction block
this->block = block;
- tmp = this->reg(ir::FAMILY_DWORD);
+ tmp = this->reg(f);
// Generate the MOV instruction and replace the register in the instruction
+ if (f == ir::FAMILY_QWORD && ctx.getSimdWidth() == 8)
+ simdWidth = 16;
SelectionInstruction *mov = this->create(SEL_OP_MOV, 1, 1);
- mov->dst(0) = GenRegister::retype(insn->dst(regID), GEN_TYPE_F);
+ mov->dst(0) = GenRegister::retype(insn->dst(regID), genType);
mov->state = GenInstructionState(simdWidth);
- insn->dst(regID) = mov->src(0) = GenRegister::fxgrf(simdWidth, tmp);
+ gr = f == ir::FAMILY_QWORD ? GenRegister::dfxgrf(simdWidth, tmp) : GenRegister::fxgrf(simdWidth, tmp);
+ insn->dst(regID) = mov->src(0) = gr;
insn->append(*mov);
-
return tmp;
}
@@ -657,6 +671,7 @@ namespace gbe
case FAMILY_WORD: SEL_REG(uw16grf, uw8grf, uw1grf); break;
case FAMILY_BYTE: SEL_REG(ub16grf, ub8grf, ub1grf); break;
case FAMILY_DWORD: SEL_REG(f16grf, f8grf, f1grf); break;
+ case FAMILY_QWORD: SEL_REG(df16grf, df8grf, df1grf); break;
default: NOT_SUPPORTED;
}
GBE_ASSERT(false);
@@ -719,6 +734,33 @@ namespace gbe
void Selection::Opaque::NOP(void) { this->appendInsn(SEL_OP_NOP, 0, 0); }
void Selection::Opaque::WAIT(void) { this->appendInsn(SEL_OP_WAIT, 0, 0); }
+ void Selection::Opaque::READ_FLOAT64(Reg addr,
+ const GenRegister *dst,
+ uint32_t elemNum,
+ uint32_t bti)
+ {
+ SelectionInstruction *insn = this->appendInsn(SEL_OP_READ_FLOAT64, elemNum, 1);
+ SelectionVector *srcVector = this->appendVector();
+ SelectionVector *dstVector = this->appendVector();
+
+ // Regular instruction to encode
+ for (uint32_t elemID = 0; elemID < elemNum; ++elemID)
+ insn->dst(elemID) = dst[elemID];
+ insn->src(0) = addr;
+ insn->extra.function = bti;
+ insn->extra.elem = elemNum;
+
+ // Sends require contiguous allocation
+ dstVector->regNum = elemNum;
+ dstVector->isSrc = 0;
+ dstVector->reg = &insn->dst(0);
+
+ // Source cannot be scalar (yet)
+ srcVector->regNum = 1;
+ srcVector->isSrc = 1;
+ srcVector->reg = &insn->src(0);
+ }
+
void Selection::Opaque::UNTYPED_READ(Reg addr,
const GenRegister *dst,
uint32_t elemNum,
@@ -746,6 +788,27 @@ namespace gbe
srcVector->reg = &insn->src(0);
}
+ void Selection::Opaque::WRITE_FLOAT64(Reg addr,
+ const GenRegister *src,
+ uint32_t elemNum,
+ uint32_t bti)
+ {
+ SelectionInstruction *insn = this->appendInsn(SEL_OP_WRITE_FLOAT64, 0, elemNum+1);
+ SelectionVector *vector = this->appendVector();
+
+ // Regular instruction to encode
+ insn->src(0) = addr;
+ for (uint32_t elemID = 0; elemID < elemNum; ++elemID)
+ insn->src(elemID+1) = src[elemID];
+ insn->extra.function = bti;
+ insn->extra.elem = elemNum;
+
+ // Sends require contiguous allocation for the sources
+ vector->regNum = elemNum+1;
+ vector->reg = &insn->src(0);
+ vector->isSrc = 1;
+ }
+
void Selection::Opaque::UNTYPED_WRITE(Reg addr,
const GenRegister *src,
uint32_t elemNum,
@@ -1092,6 +1155,15 @@ namespace gbe
// Implementation of all patterns
///////////////////////////////////////////////////////////////////////////
+ bool canGetRegisterFromImmediate(const ir::Instruction &insn) {
+ using namespace ir;
+ const auto &childInsn = cast<LoadImmInstruction>(insn);
+ const auto &imm = childInsn.getImmediate();
+ if(imm.type != TYPE_DOUBLE)
+ return true;
+ return false;
+ }
+
GenRegister getRegisterFromImmediate(ir::Immediate imm)
{
using namespace ir;
@@ -1103,6 +1175,7 @@ namespace gbe
case TYPE_S16: return GenRegister::immw(imm.data.s16);
case TYPE_U8: return GenRegister::immuw(imm.data.u8);
case TYPE_S8: return GenRegister::immw(imm.data.s8);
+ case TYPE_DOUBLE: return GenRegister::immdf(imm.data.f64);
default: NOT_SUPPORTED; return GenRegister::immuw(0);
}
}
@@ -1146,7 +1219,13 @@ namespace gbe
const GenRegister src = sel.selReg(insn.getSrc(0));
switch (opcode) {
case ir::OP_ABS: sel.MOV(dst, GenRegister::abs(src)); break;
- case ir::OP_MOV: sel.MOV(dst, src); break;
+ case ir::OP_MOV:
+ if (dst.isdf()) {
+ ir::Register r = sel.reg(ir::RegisterFamily::FAMILY_QWORD);
+ sel.MOV_DF(dst, src, sel.selReg(r));
+ } else
+ sel.MOV(dst, src);
+ break;
case ir::OP_RNDD: sel.RNDD(dst, src); break;
case ir::OP_RNDE: sel.RNDE(dst, src); break;
case ir::OP_RNDU: sel.RNDU(dst, src); break;
@@ -1225,14 +1304,14 @@ namespace gbe
SelectionDAG *dag1 = dag.child[1];
// Right source can always be an immediate
- if (OCL_OPTIMIZE_IMMEDIATE && dag1 != NULL && dag1->insn.getOpcode() == OP_LOADI) {
+ if (OCL_OPTIMIZE_IMMEDIATE && dag1 != NULL && dag1->insn.getOpcode() == OP_LOADI && canGetRegisterFromImmediate(dag1->insn)) {
const auto &childInsn = cast<LoadImmInstruction>(dag1->insn);
src0 = sel.selReg(insn.getSrc(0), type);
src1 = getRegisterFromImmediate(childInsn.getImmediate());
if (dag0) dag0->isRoot = 1;
}
// Left source cannot be immediate but it is OK if we can commute
- else if (OCL_OPTIMIZE_IMMEDIATE && dag0 != NULL && insn.commutes() && dag0->insn.getOpcode() == OP_LOADI) {
+ else if (OCL_OPTIMIZE_IMMEDIATE && dag0 != NULL && insn.commutes() && dag0->insn.getOpcode() == OP_LOADI && canGetRegisterFromImmediate(dag0->insn)) {
const auto &childInsn = cast<LoadImmInstruction>(dag0->insn);
src0 = sel.selReg(insn.getSrc(1), type);
src1 = getRegisterFromImmediate(childInsn.getImmediate());
@@ -1268,7 +1347,7 @@ namespace gbe
case OP_SHR: sel.SHR(dst, src0, src1); break;
case OP_ASR: sel.ASR(dst, src0, src1); break;
case OP_MUL:
- if (type == TYPE_FLOAT)
+ if (type == TYPE_FLOAT || type == TYPE_DOUBLE)
sel.MUL(dst, src0, src1);
else if (type == TYPE_U32 || type == TYPE_S32) {
sel.pop();
@@ -1599,6 +1678,7 @@ namespace gbe
case TYPE_S16: sel.MOV(dst, GenRegister::immw(imm.data.s16)); break;
case TYPE_U8: sel.MOV(dst, GenRegister::immuw(imm.data.u8)); break;
case TYPE_S8: sel.MOV(dst, GenRegister::immw(imm.data.s8)); break;
+ case TYPE_DOUBLE: sel.LOAD_DF_IMM(dst, GenRegister::immdf(imm.data.f64)); break;
default: NOT_SUPPORTED;
}
sel.pop();
@@ -1650,6 +1730,8 @@ namespace gbe
INLINE uint32_t getByteScatterGatherSize(ir::Type type) {
using namespace ir;
switch (type) {
+ case TYPE_DOUBLE:
+ return GEN_BYTE_SCATTER_QWORD;
case TYPE_FLOAT:
case TYPE_U32:
case TYPE_S32:
@@ -1681,6 +1763,22 @@ namespace gbe
sel.UNTYPED_READ(addr, dst.data(), valueNum, bti);
}
+ void emitReadFloat64(Selection::Opaque &sel,
+ const ir::LoadInstruction &insn,
+ GenRegister addr,
+ uint32_t bti) const
+ {
+ using namespace ir;
+ const uint32_t valueNum = insn.getValueNum();
+ vector<GenRegister> dst(valueNum);
+ for (uint32_t dstID = 0; dstID < valueNum; ++dstID)
+ dst[dstID] = GenRegister::retype(sel.selReg(insn.getValue(dstID)), GEN_TYPE_F);
+ dst.push_back(sel.selReg(sel.reg(FAMILY_QWORD)));
+ if (sel.ctx.getSimdWidth() == 16)
+ dst.push_back(sel.selReg(sel.reg(FAMILY_QWORD)));
+ sel.READ_FLOAT64(addr, dst.data(), dst.size(), bti);
+ }
+
void emitByteGather(Selection::Opaque &sel,
const ir::LoadInstruction &insn,
const uint32_t elemSize,
@@ -1732,6 +1830,8 @@ namespace gbe
const uint32_t elemSize = getByteScatterGatherSize(type);
if (insn.getAddressSpace() == MEM_CONSTANT)
this->emitIndirectMove(sel, insn, address);
+ else if (insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_QWORD)
+ this->emitReadFloat64(sel, insn, address, space == MEM_LOCAL ? 0xfe : 0x00);
else if (insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_DWORD)
this->emitUntypedRead(sel, insn, address, space == MEM_LOCAL ? 0xfe : 0x00);
else {
@@ -1762,6 +1862,25 @@ namespace gbe
sel.UNTYPED_WRITE(addr, value.data(), valueNum, bti);
}
+ void emitWriteFloat64(Selection::Opaque &sel,
+ const ir::StoreInstruction &insn,
+ uint32_t bti) const
+ {
+ using namespace ir;
+ const uint32_t valueNum = insn.getValueNum();
+ const uint32_t addrID = ir::StoreInstruction::addressIndex;
+ GenRegister addr;
+ vector<GenRegister> value(valueNum);
+
+ addr = GenRegister::retype(sel.selReg(insn.getSrc(addrID)), GEN_TYPE_F);
+ for (uint32_t valueID = 0; valueID < valueNum; ++valueID)
+ value[valueID] = GenRegister::retype(sel.selReg(insn.getValue(valueID)), GEN_TYPE_F);
+ value.push_back(sel.selReg(sel.reg(FAMILY_QWORD)));
+ if (sel.ctx.getSimdWidth() == 16)
+ value.push_back(sel.selReg(sel.reg(FAMILY_QWORD)));
+ sel.WRITE_FLOAT64(addr, value.data(), value.size(), bti);
+ }
+
void emitByteScatter(Selection::Opaque &sel,
const ir::StoreInstruction &insn,
const uint32_t elemSize,
@@ -1791,7 +1910,9 @@ namespace gbe
const uint32_t bti = space == MEM_LOCAL ? 0xfe : 0x01;
const Type type = insn.getValueType();
const uint32_t elemSize = getByteScatterGatherSize(type);
- if (insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_DWORD)
+ if (insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_QWORD)
+ this->emitWriteFloat64(sel, insn, bti);
+ else if (insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_DWORD)
this->emitUntypedWrite(sel, insn, bti);
else {
const GenRegister address = sel.selReg(insn.getAddress());
@@ -1839,7 +1960,7 @@ namespace gbe
SelectionDAG *dag1 = dag.child[1];
// Right source can always be an immediate
- if (OCL_OPTIMIZE_IMMEDIATE && dag1 != NULL && dag1->insn.getOpcode() == OP_LOADI) {
+ if (OCL_OPTIMIZE_IMMEDIATE && dag1 != NULL && dag1->insn.getOpcode() == OP_LOADI && canGetRegisterFromImmediate(dag1->insn)) {
const auto &childInsn = cast<LoadImmInstruction>(dag1->insn);
src0 = sel.selReg(insn.getSrc(0), type);
src1 = getRegisterFromImmediate(childInsn.getImmediate());
@@ -1873,7 +1994,7 @@ namespace gbe
const GenRegister src = sel.selReg(insn.getSrc(0), srcType);
// We need two instructions to make the conversion
- if (dstFamily != FAMILY_DWORD && srcFamily == FAMILY_DWORD) {
+ if (dstFamily != FAMILY_DWORD && dstFamily != FAMILY_QWORD && srcFamily == FAMILY_DWORD) {
GenRegister unpacked;
if (dstFamily == FAMILY_WORD) {
const uint32_t type = TYPE_U16 ? GEN_TYPE_UW : GEN_TYPE_W;
@@ -1886,6 +2007,9 @@ namespace gbe
}
sel.MOV(unpacked, src);
sel.MOV(dst, unpacked);
+ } else if (dst.isdf()) {
+ ir::Register r = sel.reg(ir::RegisterFamily::FAMILY_QWORD);
+ sel.MOV_DF(dst, src, sel.selReg(r));
} else
sel.MOV(dst, src);
return true;
diff --git a/backend/src/backend/gen_insn_selection.hxx b/backend/src/backend/gen_insn_selection.hxx
index 789c81c..167af09 100644
--- a/backend/src/backend/gen_insn_selection.hxx
+++ b/backend/src/backend/gen_insn_selection.hxx
@@ -1,5 +1,7 @@
DECL_SELECTION_IR(LABEL, LabelInstruction)
DECL_SELECTION_IR(MOV, UnaryInstruction)
+DECL_SELECTION_IR(MOV_DF, BinaryInstruction)
+DECL_SELECTION_IR(LOAD_DF_IMM, UnaryInstruction)
DECL_SELECTION_IR(NOT, UnaryInstruction)
DECL_SELECTION_IR(LZD, UnaryInstruction)
DECL_SELECTION_IR(RNDZ, UnaryInstruction)
@@ -32,6 +34,8 @@ DECL_SELECTION_IR(BARRIER, BarrierInstruction)
DECL_SELECTION_IR(FENCE, FenceInstruction)
DECL_SELECTION_IR(UNTYPED_READ, UntypedReadInstruction)
DECL_SELECTION_IR(UNTYPED_WRITE, UntypedWriteInstruction)
+DECL_SELECTION_IR(READ_FLOAT64, ReadFloat64Instruction)
+DECL_SELECTION_IR(WRITE_FLOAT64, WriteFloat64Instruction)
DECL_SELECTION_IR(BYTE_GATHER, ByteGatherInstruction)
DECL_SELECTION_IR(BYTE_SCATTER, ByteScatterInstruction)
DECL_SELECTION_IR(SAMPLE, SampleInstruction)
diff --git a/backend/src/backend/gen_reg_allocation.cpp b/backend/src/backend/gen_reg_allocation.cpp
index 9765b02..e7c96ac 100644
--- a/backend/src/backend/gen_reg_allocation.cpp
+++ b/backend/src/backend/gen_reg_allocation.cpp
@@ -458,7 +458,6 @@ namespace gbe
}
bool GenRegAllocator::Opaque::allocateGRFs(Selection &selection) {
-
// Perform the linear scan allocator
const uint32_t regNum = ctx.sel->getRegNum();
for (uint32_t startID = 0; startID < regNum; ++startID) {
diff --git a/backend/src/backend/gen_register.hpp b/backend/src/backend/gen_register.hpp
index d772b0d..ef0a2e4 100644
--- a/backend/src/backend/gen_register.hpp
+++ b/backend/src/backend/gen_register.hpp
@@ -69,11 +69,12 @@ namespace gbe
/*! Type size in bytes for each Gen type */
INLINE int typeSize(uint32_t type) {
switch(type) {
+ case GEN_TYPE_DF:
+ return 8;
case GEN_TYPE_UD:
case GEN_TYPE_D:
case GEN_TYPE_F:
return 4;
- case GEN_TYPE_HF:
case GEN_TYPE_UW:
case GEN_TYPE_W:
return 2;
@@ -192,6 +193,7 @@ namespace gbe
/*! For immediates or virtual register */
union {
+ double df;
float f;
int32_t d;
uint32_t ud;
@@ -211,6 +213,31 @@ namespace gbe
uint32_t quarter:1; //!< To choose which part we want (Q1 / Q2)
uint32_t address_mode:1; //!< direct or indirect
+ static INLINE GenRegister offset(GenRegister reg, int nr, int subnr = 0) {
+ GenRegister r = reg;
+ r.nr += nr;
+ r.subnr += subnr;
+ return r;
+ }
+
+ INLINE bool isimmdf(void) const {
+ if (type == GEN_TYPE_DF && file == GEN_IMMEDIATE_VALUE)
+ return true;
+ return false;
+ }
+
+ INLINE bool isdf(void) const {
+ if (type == GEN_TYPE_DF && file == GEN_GENERAL_REGISTER_FILE)
+ return true;
+ return false;
+ }
+
+ static INLINE GenRegister h2(GenRegister reg) {
+ GenRegister r = reg;
+ r.hstride = GEN_HORIZONTAL_STRIDE_2;
+ return r;
+ }
+
static INLINE GenRegister QnVirtual(GenRegister reg, uint32_t quarter) {
GBE_ASSERT(reg.physical == 0);
if (reg.hstride == GEN_HORIZONTAL_STRIDE_0) // scalar register
@@ -293,6 +320,18 @@ namespace gbe
return reg;
}
+ static INLINE GenRegister df16(uint32_t file, ir::Register reg) {
+ return retype(vec16(file, reg), GEN_TYPE_DF);
+ }
+
+ static INLINE GenRegister df8(uint32_t file, ir::Register reg) {
+ return retype(vec8(file, reg), GEN_TYPE_DF);
+ }
+
+ static INLINE GenRegister df1(uint32_t file, ir::Register reg) {
+ return retype(vec1(file, reg), GEN_TYPE_DF);
+ }
+
static INLINE GenRegister ud16(uint32_t file, ir::Register reg) {
return retype(vec16(file, reg), GEN_TYPE_UD);
}
@@ -371,6 +410,12 @@ namespace gbe
GEN_HORIZONTAL_STRIDE_0);
}
+ static INLINE GenRegister immdf(double df) {
+ GenRegister immediate = imm(GEN_TYPE_DF);
+ immediate.value.df = df;
+ return immediate;
+ }
+
static INLINE GenRegister immf(float f) {
GenRegister immediate = imm(GEN_TYPE_F);
immediate.value.f = f;
@@ -448,6 +493,18 @@ namespace gbe
return vec16(GEN_GENERAL_REGISTER_FILE, reg);
}
+ static INLINE GenRegister df1grf(ir::Register reg) {
+ return df1(GEN_GENERAL_REGISTER_FILE, reg);
+ }
+
+ static INLINE GenRegister df8grf(ir::Register reg) {
+ return df8(GEN_GENERAL_REGISTER_FILE, reg);
+ }
+
+ static INLINE GenRegister df16grf(ir::Register reg) {
+ return df16(GEN_GENERAL_REGISTER_FILE, reg);
+ }
+
static INLINE GenRegister ud16grf(ir::Register reg) {
return ud16(GEN_GENERAL_REGISTER_FILE, reg);
}
@@ -609,10 +666,26 @@ namespace gbe
}
static INLINE GenRegister suboffset(GenRegister reg, uint32_t delta) {
- reg.subnr += delta * typeSize(reg.type);
+ if (reg.hstride != GEN_HORIZONTAL_STRIDE_0) {
+ reg.subnr += delta * typeSize(reg.type);
+ reg.nr += reg.subnr / 32;
+ reg.subnr %= 32;
+ }
return reg;
}
+ static INLINE GenRegister df16(uint32_t file, uint32_t nr, uint32_t subnr) {
+ return retype(vec16(file, nr, subnr), GEN_TYPE_DF);
+ }
+
+ static INLINE GenRegister df8(uint32_t file, uint32_t nr, uint32_t subnr) {
+ return retype(vec8(file, nr, subnr), GEN_TYPE_DF);
+ }
+
+ static INLINE GenRegister df1(uint32_t file, uint32_t nr, uint32_t subnr) {
+ return retype(vec1(file, nr, subnr), GEN_TYPE_DF);
+ }
+
static INLINE GenRegister ud16(uint32_t file, uint32_t nr, uint32_t subnr) {
return retype(vec16(file, nr, subnr), GEN_TYPE_UD);
}
@@ -685,6 +758,18 @@ namespace gbe
return vec16(GEN_GENERAL_REGISTER_FILE, nr, subnr);
}
+ static INLINE GenRegister df16grf(uint32_t nr, uint32_t subnr) {
+ return df16(GEN_GENERAL_REGISTER_FILE, nr, subnr);
+ }
+
+ static INLINE GenRegister df8grf(uint32_t nr, uint32_t subnr) {
+ return df8(GEN_GENERAL_REGISTER_FILE, nr, subnr);
+ }
+
+ static INLINE GenRegister df1grf(uint32_t nr, uint32_t subnr) {
+ return df1(GEN_GENERAL_REGISTER_FILE, nr, subnr);
+ }
+
static INLINE GenRegister ud16grf(uint32_t nr, uint32_t subnr) {
return ud16(GEN_GENERAL_REGISTER_FILE, nr, subnr);
}
@@ -790,6 +875,7 @@ namespace gbe
return SIMD1(values...); \
} \
}
+ DECL_REG_ENCODER(dfxgrf, df16grf, df8grf, df1grf);
DECL_REG_ENCODER(fxgrf, f16grf, f8grf, f1grf);
DECL_REG_ENCODER(uwxgrf, uw16grf, uw8grf, uw1grf);
DECL_REG_ENCODER(udxgrf, ud16grf, ud8grf, ud1grf);
--
1.7.11.7
More information about the Beignet
mailing list