[Beignet] [PATCH] GBE: refactor double support.
Xing, Homer
homer.xing at intel.com
Mon Aug 5 00:30:52 PDT 2013
You turned off predication and masking for the 64-bit reads/writes, and you use a SIMD16 data port read for the SIMD8 read. Good idea.
This patch looks good to me.
Homer
-----Original Message-----
From: beignet-bounces+homer.xing=intel.com at lists.freedesktop.org [mailto:beignet-bounces+homer.xing=intel.com at lists.freedesktop.org] On Behalf Of Zhigang Gong
Sent: Monday, August 5, 2013 11:08 AM
To: beignet at lists.freedesktop.org
Cc: Zhigang Gong
Subject: [Beignet] [PATCH] GBE: refactor double support.
There are two major issues in the double support:
1. It doesn't work in SIMD16 mode.
2. Vectors are used incorrectly. Only the temporary registers need to be allocated to contiguous registers.
If you look at the previous implementation of
READ_FLOAT64/WRITE_FLOAT64 in gen_encoder.cpp, you can easily see that it contains a lot of duplicated code. Considering that the SIMD16 code path never worked correctly, it is very difficult to build on that code, so I chose to refactor those two major functions.
I also refined other parts of the instruction selection stage to fix the above two major problems with cleaner code.
Now it works well in both SIMD16 and SIMD8 modes.
Another minor improvement is for READ_FLOAT64 in SIMD8 mode: this patch saves one send instruction when reading all
8 doubles into registers.
Signed-off-by: Zhigang Gong <zhigang.gong at linux.intel.com>
---
backend/src/backend/gen_context.cpp | 21 ++-
backend/src/backend/gen_encoder.cpp | 231 +++++++++++++---------------
backend/src/backend/gen_encoder.hpp | 7 +-
backend/src/backend/gen_insn_selection.cpp | 64 +++++---
backend/src/backend/gen_reg_allocation.cpp | 10 +-
backend/src/backend/gen_register.hpp | 25 +++
backend/src/llvm/llvm_gen_backend.cpp | 4 +-
7 files changed, 200 insertions(+), 162 deletions(-)
diff --git a/backend/src/backend/gen_context.cpp b/backend/src/backend/gen_context.cpp
index e33d8da..655b1d7 100644
--- a/backend/src/backend/gen_context.cpp
+++ b/backend/src/backend/gen_context.cpp
@@ -354,12 +354,18 @@ namespace gbe
p->pop();
}
+ // For SIMD8, we allocate 2*elemNum temporary registers from dst(0), and
+ // then follow the real destination registers.
+ // For SIMD16, we allocate elemNum temporary registers from dst(0).
void GenContext::emitReadFloat64Instruction(const SelectionInstruction &insn) {
- const GenRegister dst = ra->genReg(insn.dst(0));
+ const uint32_t elemNum = insn.extra.elem;
+ const uint32_t tmpRegSize = (p->curr.execWidth == 8) ? elemNum * 2 : elemNum;
+ const GenRegister dst = ra->genReg(insn.dst(tmpRegSize));
+ const GenRegister tmp = ra->genReg(insn.dst(0));
const GenRegister src = ra->genReg(insn.src(0));
+ const GenRegister tempAddr = ra->genReg(insn.src(1));
const uint32_t bti = insn.extra.function;
- const uint32_t elemNum = insn.extra.elem;
- p->READ_FLOAT64(dst, src, bti, elemNum);
+ p->READ_FLOAT64(dst, tmp, tempAddr, src, bti, elemNum);
}
void GenContext::emitUntypedReadInstruction(const SelectionInstruction &insn) { @@ -370,11 +376,16 @@ namespace gbe
p->UNTYPED_READ(dst, src, bti, elemNum);
}
+ // For SIMD8, we allocate 2*elemNum temporary registers from dst(0), and
+ // then follow the real destination registers.
+ // For SIMD16, we allocate elemNum temporary registers from dst(0).
void GenContext::emitWriteFloat64Instruction(const SelectionInstruction &insn) {
const GenRegister src = ra->genReg(insn.src(0));
- const uint32_t bti = insn.extra.function;
const uint32_t elemNum = insn.extra.elem;
- p->WRITE_FLOAT64(src, bti, elemNum);
+ const uint32_t tmpRegSize = (p->curr.execWidth == 8) ? elemNum * 2 : elemNum;
+ const GenRegister data = ra->genReg(insn.src(tmpRegSize + 1));
+ const uint32_t bti = insn.extra.function;
+ p->WRITE_FLOAT64(src, data, bti, elemNum);
}
void GenContext::emitUntypedWriteInstruction(const SelectionInstruction &insn) { diff --git a/backend/src/backend/gen_encoder.cpp b/backend/src/backend/gen_encoder.cpp
index f84c6dd..5930926 100644
--- a/backend/src/backend/gen_encoder.cpp
+++ b/backend/src/backend/gen_encoder.cpp
@@ -356,103 +356,69 @@ namespace gbe
0
};
- static int dst_type(int exec_width) {
- if (exec_width == 8)
- return GEN_TYPE_UD;
- if (exec_width == 16)
- return GEN_TYPE_UW;
- NOT_IMPLEMENTED;
- return 0;
- }
-
- void GenEncoder::READ_FLOAT64(GenRegister dst, GenRegister src, uint32_t bti, uint32_t elemNum) {
- int w = curr.execWidth;
- dst = GenRegister::h2(dst);
- dst.type = GEN_TYPE_UD;
- src.type = GEN_TYPE_UD;
- GenRegister r = GenRegister::retype(GenRegister::suboffset(src, w*2), GEN_TYPE_UD);
- GenRegister imm4 = GenRegister::immud(4);
- GenInstruction *insn;
- insn = next(GEN_OPCODE_SEND);
- setHeader(insn);
- setDst(insn, GenRegister::uw16grf(r.nr, 0));
- setSrc0(insn, GenRegister::ud8grf(src.nr, 0));
- setSrc1(insn, GenRegister::immud(0));
- setDPUntypedRW(this, insn, bti, untypedRWMask[1], GEN_UNTYPED_READ, curr.execWidth / 8, curr.execWidth / 8);
- push();
- curr.quarterControl = 0;
- curr.nibControl = 0;
- MOV(dst, r);
- if (w == 8)
- curr.nibControl = 1;
- else
- curr.quarterControl = 1;
- MOV(GenRegister::suboffset(dst, w), GenRegister::suboffset(r, w / 2));
- pop();
- ADD(src, src, imm4);
- insn = next(GEN_OPCODE_SEND);
- setHeader(insn);
- setDst(insn, GenRegister::uw16grf(r.nr, 0));
- setSrc0(insn, GenRegister::ud8grf(src.nr, 0));
- setSrc1(insn, GenRegister::immud(0));
- setDPUntypedRW(this, insn, bti, untypedRWMask[1], GEN_UNTYPED_READ, curr.execWidth / 8, curr.execWidth / 8);
+ void GenEncoder::READ_FLOAT64(GenRegister dst, GenRegister tmp, GenRegister addr, GenRegister src, uint32_t bti, uint32_t elemNum) {
+ GenRegister dst32 = GenRegister::retype(dst, GEN_TYPE_UD);
+ src = GenRegister::retype(src, GEN_TYPE_UD);
+ addr = GenRegister::retype(addr, GEN_TYPE_UD);
+ tmp = GenRegister::retype(tmp, GEN_TYPE_UD);
+ uint32_t originSimdWidth = curr.execWidth;
+ uint32_t originPredicate = curr.predicate;
+ uint32_t originMask = curr.noMask;
push();
- curr.quarterControl = 0;
- curr.nibControl = 0;
- MOV(GenRegister::suboffset(dst, 1), r);
- if (w == 8)
- curr.nibControl = 1;
- else
- curr.quarterControl = 1;
- MOV(GenRegister::suboffset(dst, w + 1), GenRegister::suboffset(r, w / 2));
+ for ( uint32_t channels = 0, currQuarter = GEN_COMPRESSION_Q1;
+ channels < originSimdWidth; channels += 8, currQuarter++) {
+ curr.predicate = GEN_PREDICATE_NONE;
+ curr.noMask = GEN_MASK_DISABLE;
+ curr.execWidth = 8;
+ /* XXX The following instruction is illegal, but it works as SIMD 1*4 mode
+ which is what we want here. */
+ MOV(GenRegister::h2(addr), GenRegister::suboffset(src, channels));
+ ADD(GenRegister::h2(GenRegister::suboffset(addr, 1)), GenRegister::suboffset(src, channels), GenRegister::immd(4));
+ MOV(GenRegister::h2(GenRegister::suboffset(addr, 8)), GenRegister::suboffset(src, channels + 4));
+ ADD(GenRegister::h2(GenRegister::suboffset(addr, 9)), GenRegister::suboffset(src, channels + 4), GenRegister::immd(4));
+ // get the first 8 doubles.
+ curr.execWidth = 16;
+ this->UNTYPED_READ(tmp, addr, bti, elemNum);
+ if (originSimdWidth == 16)
+ curr.quarterControl = currQuarter;
+ curr.predicate = originPredicate;
+ curr.noMask = originMask;
+ // Back to simd8 for correct predication flag.
+ curr.execWidth = 8;
+ MOV(GenRegister::retype(GenRegister::suboffset(dst32, channels * 2), GEN_TYPE_DF), GenRegister::retype(tmp, GEN_TYPE_DF));
+ }
pop();
}
- void GenEncoder::WRITE_FLOAT64(GenRegister msg, uint32_t bti, uint32_t elemNum) {
- int w = curr.execWidth;
- GenRegister r = GenRegister::retype(GenRegister::suboffset(msg, w*3), GEN_TYPE_UD);
- r.type = GEN_TYPE_UD;
- GenRegister hdr = GenRegister::h2(r);
- GenRegister src = GenRegister::ud16grf(msg.nr + w / 8, 0);
- src.hstride = GEN_HORIZONTAL_STRIDE_2;
- GenRegister data = GenRegister::offset(r, w / 8);
- GenRegister imm4 = GenRegister::immud(4);
- MOV(r, GenRegister::ud8grf(msg.nr, 0));
+ void GenEncoder::WRITE_FLOAT64(GenRegister msg, GenRegister data, uint32_t bti, uint32_t elemNum) {
+ GenRegister data32 = GenRegister::retype(data, GEN_TYPE_UD);
+ msg = GenRegister::retype(msg, GEN_TYPE_UD);
+ int originSimdWidth = curr.execWidth;
+ int originPredicate = curr.predicate;
+ int originMask = curr.noMask;
push();
- curr.quarterControl = 0;
- curr.nibControl = 0;
- MOV(data, src);
- if (w == 8)
- curr.nibControl = 1;
- else
- curr.quarterControl = 1;
- MOV(GenRegister::suboffset(data, w / 2), GenRegister::suboffset(src, w));
- pop();
- GenInstruction *insn;
- insn = next(GEN_OPCODE_SEND);
- setHeader(insn);
- setDst(insn, GenRegister::retype(GenRegister::null(), dst_type(curr.execWidth)));
- setSrc0(insn, GenRegister::ud8grf(hdr.nr, 0));
- setSrc1(insn, GenRegister::immud(0));
- setDPUntypedRW(this, insn, bti, untypedRWMask[1], GEN_UNTYPED_WRITE, curr.execWidth / 4, 0);
-
- ADD(r, GenRegister::ud8grf(msg.nr, 0), imm4);
- push();
- curr.quarterControl = 0;
- curr.nibControl = 0;
- MOV(data, GenRegister::suboffset(src, 1));
- if (w == 8)
- curr.nibControl = 1;
- else
- curr.quarterControl = 1;
- MOV(GenRegister::suboffset(data, w / 2), GenRegister::suboffset(src, w + 1));
+ for (uint32_t half = 0; half < 2; half++) {
+ curr.predicate = GEN_PREDICATE_NONE;
+ curr.noMask = GEN_MASK_DISABLE;
+ curr.execWidth = 8;
+ MOV(GenRegister::suboffset(msg, originSimdWidth), GenRegister::unpacked_ud(data32.nr, data32.subnr + half));
+ if (originSimdWidth == 16) {
+ MOV(GenRegister::suboffset(msg, originSimdWidth + 8), GenRegister::unpacked_ud(data32.nr + 2, data32.subnr + half));
+ curr.execWidth = 16;
+ }
+ if (half == 1)
+ ADD(GenRegister::retype(msg, GEN_TYPE_UD), GenRegister::retype(msg, GEN_TYPE_UD), GenRegister::immd(4));
+ curr.predicate = originPredicate;
+ curr.noMask = originMask;
+ this->UNTYPED_WRITE(msg, bti, elemNum);
+ }
+ /* Let's restore the original message(addr) register. */
+ /* XXX could be optimized if we don't allocate the address to the header
+ position of the message. */
+ curr.predicate = GEN_PREDICATE_NONE;
+ curr.noMask = GEN_MASK_DISABLE;
+ ADD(msg, GenRegister::retype(msg, GEN_TYPE_UD),
+ GenRegister::immd(-4));
pop();
- insn = next(GEN_OPCODE_SEND);
- setHeader(insn);
- setDst(insn, GenRegister::retype(GenRegister::null(), dst_type(curr.execWidth)));
- setSrc0(insn, GenRegister::ud8grf(hdr.nr, 0));
- setSrc1(insn, GenRegister::immud(0));
- setDPUntypedRW(this, insn, bti, untypedRWMask[1], GEN_UNTYPED_WRITE, curr.execWidth / 4, 0);
}
void GenEncoder::UNTYPED_READ(GenRegister dst, GenRegister src, uint32_t bti, uint32_t elemNum) { @@ -470,7 +436,7 @@ namespace gbe
NOT_IMPLEMENTED;
this->setHeader(insn);
- this->setDst(insn, GenRegister::uw16grf(dst.nr, 0));
+ this->setDst(insn, GenRegister::uw16grf(dst.nr, 0));
this->setSrc0(insn, GenRegister::ud8grf(src.nr, 0));
this->setSrc1(insn, GenRegister::immud(0));
setDPUntypedRW(this,
@@ -601,25 +567,53 @@ namespace gbe
return &this->store.back();
}
- INLINE void alu1(GenEncoder *p, uint32_t opcode, GenRegister dst, GenRegister src) {
- if (dst.isdf() && src.isdf()) {
+ INLINE void _handleDouble(GenEncoder *p, uint32_t opcode, GenRegister dst,
+ GenRegister src0, GenRegister src1 =
+ GenRegister::null()) {
int w = p->curr.execWidth;
p->push();
- p->curr.quarterControl = 0;
p->curr.nibControl = 0;
GenInstruction *insn = p->next(opcode);
p->setHeader(insn);
p->setDst(insn, dst);
- p->setSrc0(insn, src);
+ p->setSrc0(insn, src0);
+ if (!GenRegister::isNull(src1))
+ p->setSrc1(insn, src1);
if (w == 8)
p->curr.nibControl = 1; // second 1/8 mask
- else // w == 16
- p->curr.quarterControl = 1; // second 1/4 mask
insn = p->next(opcode);
p->setHeader(insn);
p->setDst(insn, GenRegister::suboffset(dst, w / 2));
- p->setSrc0(insn, GenRegister::suboffset(src, w / 2));
+ p->setSrc0(insn, GenRegister::suboffset(src0, w / 2));
+ if (!GenRegister::isNull(src1))
+ p->setSrc1(insn, GenRegister::suboffset(src1, w / 2));
p->pop();
+ }
+
+ // Double register accessing is a little special,
+ // Per Gen spec, then only supported mode is SIMD8 and, it only
+ // handles four doubles each time.
+ // We need to lower down SIMD16 to two SIMD8 and lower down SIMD8
+ // to two SIMD1x4.
+ INLINE void handleDouble(GenEncoder *p, uint32_t opcode, GenRegister dst,
+ GenRegister src0, GenRegister src1 = GenRegister::null()) {
+ if (p->curr.execWidth == 8)
+ _handleDouble(p, opcode, dst, src0, src1);
+ else if (p->curr.execWidth == 16) {
+ p->push();
+ p->curr.execWidth = 8;
+ p->curr.quarterControl = GEN_COMPRESSION_Q1;
+ _handleDouble(p, opcode, dst, src0, src1);
+ p->curr.quarterControl = GEN_COMPRESSION_Q2;
+ if (!GenRegister::isNull(src1))
+ src1 = GenRegister::offset(src1, 2);
+ _handleDouble(p, opcode, GenRegister::offset(dst, 2), GenRegister::offset(src0, 2), src1);
+ p->pop();
+ }
+ }
+
+ INLINE void alu1(GenEncoder *p, uint32_t opcode, GenRegister dst, GenRegister src) {
+ if (dst.isdf() && src.isdf()) {
+ handleDouble(p, opcode, dst, src);
} else if (needToSplitAlu1(p, dst, src) == false) {
GenInstruction *insn = p->next(opcode);
p->setHeader(insn);
@@ -653,25 +647,7 @@ namespace gbe
GenRegister src1)
{
if (dst.isdf() && src0.isdf() && src1.isdf()) {
- int w = p->curr.execWidth;
- p->push();
- p->curr.quarterControl = 0;
- p->curr.nibControl = 0;
- GenInstruction *insn = p->next(opcode);
- p->setHeader(insn);
- p->setDst(insn, dst);
- p->setSrc0(insn, src0);
- p->setSrc1(insn, src1);
- if (w == 8)
- p->curr.nibControl = 1; // second 1/8 mask
- else // w == 16
- p->curr.quarterControl = 1; // second 1/4 mask
- insn = p->next(opcode);
- p->setHeader(insn);
- p->setDst(insn, GenRegister::suboffset(dst, w / 2));
- p->setSrc0(insn, GenRegister::suboffset(src0, w / 2));
- p->setSrc1(insn, GenRegister::suboffset(src1, w / 2));
- p->pop();
+ handleDouble(p, opcode, dst, src0, src1);
} else if (needToSplitAlu2(p, dst, src0, src1) == false) {
GenInstruction *insn = p->next(opcode);
p->setHeader(insn);
@@ -808,7 +784,16 @@ namespace gbe
r.width = GEN_WIDTH_1;
r.hstride = GEN_HORIZONTAL_STRIDE_0;
push();
+ uint32_t width = curr.execWidth;
+ curr.execWidth = 8;
+ curr.predicate = GEN_PREDICATE_NONE;
+ curr.noMask = 1;
+ curr.quarterControl = GEN_COMPRESSION_Q1;
MOV(dest, r);
+ if (width == 16) {
+ curr.quarterControl = GEN_COMPRESSION_Q2;
+ MOV(GenRegister::offset(dest, 2), r);
+ }
pop();
}
@@ -839,14 +824,8 @@ namespace gbe
void GenEncoder::MOV_DF(GenRegister dest, GenRegister src0, GenRegister r) {
int w = curr.execWidth;
if (src0.isdf()) {
- push();
- curr.execWidth = 16;
- MOV(dest, src0);
- if (w == 16) {
- curr.quarterControl = 1;
- MOV(GenRegister::QnPhysical(dest, w / 4), GenRegister::QnPhysical(src0, w / 4));
- }
- pop();
+ GBE_ASSERT(0); // MOV DF is called from convert instruction,
+ // We should never convert a df to a df.
} else {
GenRegister r0 = GenRegister::h2(r);
push();
diff --git a/backend/src/backend/gen_encoder.hpp b/backend/src/backend/gen_encoder.hpp
index d3a7165..86e1a71 100644
--- a/backend/src/backend/gen_encoder.hpp
+++ b/backend/src/backend/gen_encoder.hpp
@@ -118,10 +118,11 @@ namespace gbe
ALU2(LINE)
ALU2(PLN)
ALU3(MAD)
- ALU2(MOV_DF);
+ //ALU2(MOV_DF);
#undef ALU1
#undef ALU2
#undef ALU3
+ void MOV_DF(GenRegister dest, GenRegister src0, GenRegister tmp =
+ GenRegister::null());
void LOAD_DF_IMM(GenRegister dest, GenRegister tmp, double value);
/*! Barrier message (to synchronize threads of a workgroup) */
void BARRIER(GenRegister src);
@@ -142,9 +143,9 @@ namespace gbe
/*! Atomic instructions */
void ATOMIC(GenRegister dst, uint32_t function, GenRegister src, uint32_t bti, uint32_t srcNum);
/*! Read 64-bits float arrays */
- void READ_FLOAT64(GenRegister dst, GenRegister src, uint32_t bti, uint32_t elemNum);
+ void READ_FLOAT64(GenRegister dst, GenRegister tmp, GenRegister
+ addr, GenRegister src, uint32_t bti, uint32_t elemNum);
/*! Write 64-bits float arrays */
- void WRITE_FLOAT64(GenRegister src, uint32_t bti, uint32_t elemNum);
+ void WRITE_FLOAT64(GenRegister src, GenRegister data, uint32_t bti,
+ uint32_t elemNum);
/*! Untyped read (upto 4 channels) */
void UNTYPED_READ(GenRegister dst, GenRegister src, uint32_t bti, uint32_t elemNum);
/*! Untyped write (upto 4 channels) */ diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp
index d4be8bf..727d07d 100644
--- a/backend/src/backend/gen_insn_selection.cpp
+++ b/backend/src/backend/gen_insn_selection.cpp
@@ -466,9 +466,9 @@ namespace gbe
/*! Atomic instruction */
void ATOMIC(Reg dst, uint32_t function, uint32_t srcNum, Reg src0, Reg src1, Reg src2, uint32_t bti);
/*! Read 64 bits float array */
- void READ_FLOAT64(Reg addr, const GenRegister *dst, uint32_t elemNum, uint32_t bti);
+ void READ_FLOAT64(Reg addr, Reg tempAddr, const GenRegister *dst,
+ uint32_t elemNum, uint32_t valueNum, uint32_t bti);
/*! Write 64 bits float array */
- void WRITE_FLOAT64(Reg addr, const GenRegister *src, uint32_t elemNum, uint32_t bti);
+ void WRITE_FLOAT64(Reg addr, const GenRegister *src, uint32_t
+ elemNum, uint32_t valueNum, uint32_t bti);
/*! Untyped read (up to 4 elements) */
void UNTYPED_READ(Reg addr, const GenRegister *dst, uint32_t elemNum, uint32_t bti);
/*! Untyped write (up to 4 elements) */ @@ -760,12 +760,16 @@ namespace gbe
void Selection::Opaque::NOP(void) { this->appendInsn(SEL_OP_NOP, 0, 0); }
void Selection::Opaque::WAIT(void) { this->appendInsn(SEL_OP_WAIT, 0, 0); }
+ /* elemNum contains all the temporary register and the
+ real destination registers.*/
void Selection::Opaque::READ_FLOAT64(Reg addr,
+ Reg tempAddr,
const GenRegister *dst,
uint32_t elemNum,
+ uint32_t valueNum,
uint32_t bti)
{
- SelectionInstruction *insn = this->appendInsn(SEL_OP_READ_FLOAT64, elemNum, 1);
+ SelectionInstruction *insn = this->appendInsn(SEL_OP_READ_FLOAT64,
+ elemNum, 2);
SelectionVector *srcVector = this->appendVector();
SelectionVector *dstVector = this->appendVector();
@@ -773,11 +777,12 @@ namespace gbe
for (uint32_t elemID = 0; elemID < elemNum; ++elemID)
insn->dst(elemID) = dst[elemID];
insn->src(0) = addr;
+ insn->src(1) = tempAddr;
insn->extra.function = bti;
- insn->extra.elem = elemNum;
+ insn->extra.elem = valueNum;
- // Sends require contiguous allocation
- dstVector->regNum = elemNum;
+ // Only the temporary registers need contiguous allocation
+ dstVector->regNum = elemNum - valueNum;
dstVector->isSrc = 0;
dstVector->reg = &insn->dst(0);
@@ -814,9 +819,12 @@ namespace gbe
srcVector->reg = &insn->src(0);
}
+ /* elemNum contains all the temporary register and the
+ real data registers.*/
void Selection::Opaque::WRITE_FLOAT64(Reg addr,
const GenRegister *src,
uint32_t elemNum,
+ uint32_t valueNum,
uint32_t bti)
{
SelectionInstruction *insn = this->appendInsn(SEL_OP_WRITE_FLOAT64, 0, elemNum+1); @@ -827,10 +835,10 @@ namespace gbe
for (uint32_t elemID = 0; elemID < elemNum; ++elemID)
insn->src(elemID+1) = src[elemID];
insn->extra.function = bti;
- insn->extra.elem = elemNum;
+ insn->extra.elem = valueNum;
- // Sends require contiguous allocation for the sources
- vector->regNum = elemNum+1;
+ // Only the addr + temporary registers need to be contiguous.
+ vector->regNum = (elemNum - valueNum) + 1;
vector->reg = &insn->src(0);
vector->isSrc = 1;
}
@@ -1871,13 +1879,17 @@ namespace gbe
{
using namespace ir;
const uint32_t valueNum = insn.getValueNum();
- vector<GenRegister> dst(valueNum);
- for (uint32_t dstID = 0; dstID < valueNum; ++dstID)
- dst[dstID] = GenRegister::retype(sel.selReg(insn.getValue(dstID)), GEN_TYPE_F);
- dst.push_back(sel.selReg(sel.reg(FAMILY_QWORD)));
- if (sel.ctx.getSimdWidth() == 16)
- dst.push_back(sel.selReg(sel.reg(FAMILY_QWORD)));
- sel.READ_FLOAT64(addr, dst.data(), dst.size(), bti);
+ GenRegister dst[valueNum * 3];
+ uint32_t dstID;
+ /* XXX support scalar only right now. */
+ GBE_ASSERT(valueNum == 1);
+ uint32_t tmpRegNum = (sel.ctx.getSimdWidth() == 8) ? valueNum * 2 : valueNum;
+ // The first 16 DWORD register space is for temporary usage at encode stage.
+ for (dstID = 0; dstID < tmpRegNum ; ++dstID)
+ dst[dstID] = sel.selReg(sel.reg(FAMILY_DWORD));
+ for ( uint32_t valueID = 0; valueID < valueNum; ++dstID, ++valueID)
+ dst[dstID] = sel.selReg(insn.getValue(valueID));
+ sel.READ_FLOAT64(addr, sel.selReg(sel.reg(FAMILY_QWORD)), dst,
+ valueNum + tmpRegNum, valueNum, bti);
}
void emitByteGather(Selection::Opaque &sel, @@ -1971,15 +1983,19 @@ namespace gbe
const uint32_t valueNum = insn.getValueNum();
const uint32_t addrID = ir::StoreInstruction::addressIndex;
GenRegister addr;
- vector<GenRegister> value(valueNum);
-
+ GenRegister src[valueNum * 2];
+ uint32_t srcID;
+ /* XXX support scalar only right now. */
+ GBE_ASSERT(valueNum == 1);
addr = GenRegister::retype(sel.selReg(insn.getSrc(addrID)), GEN_TYPE_F);
- for (uint32_t valueID = 0; valueID < valueNum; ++valueID)
- value[valueID] = GenRegister::retype(sel.selReg(insn.getValue(valueID)), GEN_TYPE_F);
- value.push_back(sel.selReg(sel.reg(FAMILY_QWORD)));
- if (sel.ctx.getSimdWidth() == 16)
- value.push_back(sel.selReg(sel.reg(FAMILY_QWORD)));
- sel.WRITE_FLOAT64(addr, value.data(), value.size(), bti);
+ uint32_t tmpRegNum = (sel.ctx.getSimdWidth() == 8) ? valueNum * 2 : valueNum;
+ // The first 16 DWORD register space is for temporary usage at encode stage.
+ for (srcID = 0; srcID < tmpRegNum; ++srcID)
+ src[srcID] = sel.selReg(sel.reg(FAMILY_DWORD));
+
+ for (uint32_t valueID = 0; valueID < valueNum; ++srcID, ++valueID)
+ src[srcID] = sel.selReg(insn.getValue(valueID));
+ sel.WRITE_FLOAT64(addr, src, valueNum + tmpRegNum, valueNum,
+ bti);
}
void emitByteScatter(Selection::Opaque &sel, diff --git a/backend/src/backend/gen_reg_allocation.cpp b/backend/src/backend/gen_reg_allocation.cpp
index e7c96ac..4ba03ea 100644
--- a/backend/src/backend/gen_reg_allocation.cpp
+++ b/backend/src/backend/gen_reg_allocation.cpp
@@ -474,7 +474,12 @@ namespace gbe
if (it != vectorMap.end()) {
const SelectionVector *vector = it->second.first;
const uint32_t simdWidth = ctx.getSimdWidth();
- const uint32_t alignment = simdWidth * sizeof(uint32_t);
+
+ const ir::RegisterData regData = ctx.sel->getRegisterData(reg);
+ const ir::RegisterFamily family = regData.family;
+ const uint32_t typeSize = familyVectorSize[family];
+ const uint32_t alignment = simdWidth*typeSize;
+
const uint32_t size = vector->regNum * alignment;
uint32_t grfOffset;
while ((grfOffset = ctx.allocate(size, alignment)) == 0) { @@ -483,7 +488,8 @@ namespace gbe
}
for (uint32_t regID = 0; regID < vector->regNum; ++regID, grfOffset += alignment) {
const ir::Register reg = vector->reg[regID].reg();
- GBE_ASSERT(RA.contains(reg) == false);
+ GBE_ASSERT(RA.contains(reg) == false
+ && ctx.sel->getRegisterData(reg).family ==
+ family);
RA.insert(std::make_pair(reg, grfOffset));
}
}
diff --git a/backend/src/backend/gen_register.hpp b/backend/src/backend/gen_register.hpp
index fedb743..7e48837 100644
--- a/backend/src/backend/gen_register.hpp
+++ b/backend/src/backend/gen_register.hpp
@@ -553,6 +553,11 @@ namespace gbe
GEN_HORIZONTAL_STRIDE_1);
}
+ static INLINE bool isNull(GenRegister reg) {
+ return (reg.file == GEN_ARCHITECTURE_REGISTER_FILE
+ && reg.nr == GEN_ARF_NULL);
+ }
+
static INLINE GenRegister acc(void) {
return GenRegister(GEN_ARCHITECTURE_REGISTER_FILE,
GEN_ARF_ACCUMULATOR, @@ -832,6 +837,26 @@ namespace gbe
GEN_HORIZONTAL_STRIDE_2);
}
+ static INLINE GenRegister packed_ud(uint32_t nr, uint32_t subnr) {
+ return GenRegister(GEN_GENERAL_REGISTER_FILE,
+ nr,
+ subnr,
+ GEN_TYPE_UD,
+ GEN_VERTICAL_STRIDE_8,
+ GEN_WIDTH_4,
+ GEN_HORIZONTAL_STRIDE_1);
+ }
+
+ static INLINE GenRegister unpacked_ud(uint32_t nr, uint32_t subnr) {
+ return GenRegister(GEN_GENERAL_REGISTER_FILE,
+ nr,
+ subnr,
+ GEN_TYPE_UD,
+ GEN_VERTICAL_STRIDE_8,
+ GEN_WIDTH_4,
+ GEN_HORIZONTAL_STRIDE_2);
+ }
+
static INLINE GenRegister mask(uint32_t subnr) {
return uw1(GEN_ARCHITECTURE_REGISTER_FILE, GEN_ARF_MASK, subnr);
}
diff --git a/backend/src/llvm/llvm_gen_backend.cpp b/backend/src/llvm/llvm_gen_backend.cpp
index c8c5484..b5963ad 100644
--- a/backend/src/llvm/llvm_gen_backend.cpp
+++ b/backend/src/llvm/llvm_gen_backend.cpp
@@ -2371,8 +2371,8 @@ namespace gbe
// Scalar is easy. We neednot build register tuples
if (isScalarType(llvmType) == true) {
const ir::Type type = getType(ctx, llvmType);
- if(type == ir::TYPE_DOUBLE) // 64bit-float load(store) don't support SIMD16
- OCL_SIMD_WIDTH = 8;
+ //if(type == ir::TYPE_DOUBLE) // 64bit-float load(store) don't support SIMD16
+ // OCL_SIMD_WIDTH = 8;
const ir::Register values = this->getRegister(llvmValues);
if (isLoad)
ctx.LOAD(type, ptr, addrSpace, dwAligned, values);
--
1.7.9.5
_______________________________________________
Beignet mailing list
Beignet at lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/beignet
More information about the Beignet
mailing list