[Beignet] [PATCH 2/4] GBE: Optimize write_image instruction for simd8 mode.
Zhigang Gong
zhigang.gong at intel.com
Tue Feb 18 16:38:13 PST 2014
In simd8 mode, we can put the u, v, w, x, r, g, b, a values
directly into a selection vector, so we don't need to
assign those values again.
Let's look at an example. The following code, which does a simple
image copy, is generated without this patch:
(26 ) (+f0) mov(8) g113<1>F g114<8,8,1>D { align1 WE_normal 1Q };
(28 ) (+f0) send(8) g108<1>UD g112<8,8,1>F
sampler (3, 0, 0, 1) mlen 2 rlen 4 { align1 WE_normal 1Q };
(30 ) mov(8) g99<1>UD 0x0UD { align1 WE_all 1Q };
(32 ) mov(1) g99.7<1>UD 0xffffUD { align1 WE_all };
(34 ) mov(8) g103<1>UD 0x0UD { align1 WE_all 1Q };
(36 ) (+f0) mov(8) g100<1>UD g117<8,8,1>UD { align1 WE_normal 1Q };
(38 ) (+f0) mov(8) g101<1>UD g114<8,8,1>UD { align1 WE_normal 1Q };
(40 ) (+f0) mov(8) g104<1>UD g108<8,8,1>UD { align1 WE_normal 1Q };
(42 ) (+f0) mov(8) g105<1>UD g109<8,8,1>UD { align1 WE_normal 1Q };
(44 ) (+f0) mov(8) g106<1>UD g110<8,8,1>UD { align1 WE_normal 1Q };
(46 ) (+f0) mov(8) g107<1>UD g111<8,8,1>UD { align1 WE_normal 1Q };
(48 ) (+f0) send(8) null g99<8,8,1>UD
renderunsupported target 5 mlen 9 rlen 0 { align1 WE_normal 1Q };
(50 ) (+f0) mov(8) g1<1>UW 0x1UW { align1 WE_normal 1Q };
L1:
(52 ) mov(8) g112<1>UD g0<8,8,1>UD { align1 WE_all 1Q };
(54 ) send(8) null g112<8,8,1>UD
thread_spawnerunsupported target 7 mlen 1 rlen 0 { align1 WE_normal 1Q EOT };
With this patch, we can optimize it as below:
(26 ) (+f0) mov(8) g106<1>F g111<8,8,1>D { align1 WE_normal 1Q };
(28 ) (+f0) send(8) g114<1>UD g105<8,8,1>F
sampler (3, 0, 0, 1) mlen 2 rlen 4 { align1 WE_normal 1Q };
(30 ) mov(8) g109<1>UD 0x0UD { align1 WE_all 1Q };
(32 ) mov(1) g109.7<1>UD 0xffffUD { align1 WE_all };
(34 ) mov(8) g113<1>UD 0x0UD { align1 WE_all 1Q };
(36 ) (+f0) send(8) null g109<8,8,1>UD
renderunsupported target 5 mlen 9 rlen 0 { align1 WE_normal 1Q };
(38 ) (+f0) mov(8) g1<1>UW 0x1UW { align1 WE_normal 1Q };
L1:
(40 ) mov(8) g112<1>UD g0<8,8,1>UD { align1 WE_all 1Q };
(42 ) send(8) null g112<8,8,1>UD
thread_spawnerunsupported target 7 mlen 1 rlen 0 { align1 WE_normal 1Q EOT };
This patch can save about 8 instructions per write_image in simd8 mode.
Signed-off-by: Zhigang Gong <zhigang.gong at intel.com>
---
backend/src/backend/gen_context.cpp | 58 +-----------------
backend/src/backend/gen_insn_selection.cpp | 88 ++++++++++++++++++++++------
backend/src/ocl_stdlib.tmpl.h | 12 ++--
3 files changed, 77 insertions(+), 81 deletions(-)
diff --git a/backend/src/backend/gen_context.cpp b/backend/src/backend/gen_context.cpp
index addf96e..351bf8e 100644
--- a/backend/src/backend/gen_context.cpp
+++ b/backend/src/backend/gen_context.cpp
@@ -1786,9 +1786,7 @@ namespace gbe
const unsigned char sampler = insn.extra.sampler;
const unsigned int msgLen = insn.extra.rdmsglen;
uint32_t simdWidth = p->curr.execWidth;
- //p->push();
p->SAMPLE(dst, msgPayload, msgLen, false, bti, sampler, simdWidth, -1, 0);
- //p->pop();
}
void GenContext::scratchWrite(const GenRegister header, uint32_t offset, uint32_t reg_num, uint32_t reg_type, uint32_t channel_mode) {
@@ -1824,60 +1822,8 @@ namespace gbe
void GenContext::emitTypedWriteInstruction(const SelectionInstruction &insn) {
const GenRegister header = GenRegister::retype(ra->genReg(insn.src(0)), GEN_TYPE_UD);
- const GenRegister ucoord = ra->genReg(insn.src(insn.extra.msglen));
- const GenRegister vcoord = ra->genReg(insn.src(1 + insn.extra.msglen));
- const GenRegister R = ra->genReg(insn.src(3 + insn.extra.msglen));
- const GenRegister G = ra->genReg(insn.src(4 + insn.extra.msglen));
- const GenRegister B = ra->genReg(insn.src(5 + insn.extra.msglen));
- const GenRegister A = ra->genReg(insn.src(6 + insn.extra.msglen));
- const unsigned char bti = insn.extra.bti;
-
- p->push();
- uint32_t simdWidth = p->curr.execWidth;
- const uint32_t nr = header.nr;
- p->curr.predicate = GEN_PREDICATE_NONE;
- p->curr.noMask = 1;
- p->MOV(header, GenRegister::immud(0x0));
- p->curr.execWidth = 1;
-
- // prepare mesg desc and move to a0.0.
- // desc = bti | (msg_type << 14) | (header_present << 19))
- // prepare header, we need to enable all the 8 planes.
- p->MOV(GenRegister::ud8grf(nr, 7), GenRegister::immud(0xffff));
- p->curr.execWidth = 8;
- // Typed write only support SIMD8.
- // Prepare message payload U + V + R(ignored) + LOD(0) + RGBA.
- // Currently, we don't support non-zero lod, so we clear all lod to
- // zero for both quarters thus save one instruction here.
- // Thus we must put this instruction in noMask and no predication state.
- p->MOV(GenRegister::ud8grf(nr + 4, 0), GenRegister::immud(0)); //LOD
- p->pop();
- p->push();
- p->curr.execWidth = 8;
- // TYPED WRITE send instruction only support SIMD8, if we are SIMD16, we
- // need to call it twice.
- uint32_t quarterNum = (simdWidth == 8) ? 1 : 2;
-
- for( uint32_t quarter = 0; quarter < quarterNum; quarter++)
- {
-#define QUARTER_MOV0(dst_nr, src) p->MOV(GenRegister::ud8grf(dst_nr, 0), \
- GenRegister::retype(GenRegister::QnPhysical(src, quarter), src.type))
-#define QUARTER_MOV1(dst_nr, src) p->MOV(GenRegister::retype(GenRegister::ud8grf(dst_nr, 0), src.type), \
- GenRegister::retype(GenRegister::QnPhysical(src,quarter), src.type))
- if (quarter == 1)
- p->curr.quarterControl = GEN_COMPRESSION_Q2;
- QUARTER_MOV0(nr + 1, ucoord);
- QUARTER_MOV0(nr + 2, vcoord);
- if (insn.extra.is3DWrite)
- QUARTER_MOV0(nr + 3, ra->genReg(insn.src(2 + insn.extra.msglen)));
- QUARTER_MOV1(nr + 5, R);
- QUARTER_MOV1(nr + 6, G);
- QUARTER_MOV1(nr + 7, B);
- QUARTER_MOV1(nr + 8, A);
-#undef QUARTER_MOV
- p->TYPED_WRITE(header, true, bti);
- }
- p->pop();
+ const uint32_t bti = insn.extra.bti;
+ p->TYPED_WRITE(header, true, bti);
}
BVAR(OCL_OUTPUT_REG_ALLOC, false);
diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp
index d76f580..697ed1a 100644
--- a/backend/src/backend/gen_insn_selection.cpp
+++ b/backend/src/backend/gen_insn_selection.cpp
@@ -545,7 +545,7 @@ namespace gbe
/*! Encode sample instructions */
void SAMPLE(GenRegister *dst, uint32_t dstNum, GenRegister *msgPayloads, uint32_t msgNum, uint32_t bti, uint32_t sampler, bool is3D);
/*! Encode typed write instructions */
- void TYPED_WRITE(GenRegister *src, uint32_t srcNum, GenRegister *msgs, uint32_t msgNum, uint32_t bti, bool is3D);
+ void TYPED_WRITE(GenRegister *msgs, uint32_t msgNum, uint32_t bti, bool is3D);
/*! Get image information */
void GET_IMAGE_INFO(uint32_t type, GenRegister *dst, uint32_t dst_num, uint32_t bti);
/*! Multiply 64-bit integers */
@@ -1451,18 +1451,15 @@ namespace gbe
this->opaque = GBE_NEW(Selection::Opaque, ctx);
}
- void Selection::Opaque::TYPED_WRITE(GenRegister *src, uint32_t srcNum,
- GenRegister *msgs, uint32_t msgNum,
+ void Selection::Opaque::TYPED_WRITE(GenRegister *msgs, uint32_t msgNum,
uint32_t bti, bool is3D) {
uint32_t elemID = 0;
uint32_t i;
- SelectionInstruction *insn = this->appendInsn(SEL_OP_TYPED_WRITE, 0, msgNum + srcNum);
+ SelectionInstruction *insn = this->appendInsn(SEL_OP_TYPED_WRITE, 0, msgNum);
SelectionVector *msgVector = this->appendVector();;
for( i = 0; i < msgNum; ++i, ++elemID)
insn->src(elemID) = msgs[i];
- for (i = 0; i < srcNum; ++i, ++elemID)
- insn->src(elemID) = src[i];
insn->extra.bti = bti;
insn->extra.msglen = msgNum;
@@ -3036,24 +3033,77 @@ namespace gbe
{
using namespace ir;
const uint32_t simdWidth = sel.ctx.getSimdWidth();
- uint32_t valueID;
GenRegister msgs[9]; // (header + U + V + R + LOD + 4)
- GenRegister src[insn.getSrcNum()];
- uint32_t msgNum = (8 / (simdWidth / 8)) + 1;
- uint32_t coordNum = 3;
+ const uint32_t msgNum = (8 / (simdWidth / 8)) + 1;
+ const uint32_t coordNum = 3;
- for(uint32_t i = 0; i < msgNum; i++)
- msgs[i] = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32);
-
- // u, v, w coords should use coord type.
- for (valueID = 0; valueID < coordNum; ++valueID)
- src[valueID] = sel.selReg(insn.getSrc(valueID), insn.getCoordType());
+ if (simdWidth == 16) {
+ for(uint32_t i = 0; i < msgNum; i++)
+ msgs[i] = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32);
+ } else {
+ uint32_t valueID = 0;
+ msgs[0] = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32);
+ for(uint32_t msgID = 1; msgID < 1 + coordNum; msgID++, valueID++)
+ msgs[msgID] = sel.selReg(insn.getSrc(msgID - 1), insn.getCoordType());
+ // fake w.
+ if (!insn.is3D())
+ msgs[3] = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32);
+ // LOD.
+ msgs[4] = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32);
+ for(uint32_t msgID = 5; valueID < insn.getSrcNum(); msgID++, valueID++)
+ msgs[msgID] = sel.selReg(insn.getSrc(valueID), insn.getSrcType());
+ }
- for (; valueID < insn.getSrcNum(); ++valueID)
- src[valueID] = sel.selReg(insn.getSrc(valueID), insn.getSrcType());
+ sel.push();
+ sel.curr.predicate = GEN_PREDICATE_NONE;
+ sel.curr.noMask = 1;
+ sel.MOV(msgs[0], GenRegister::immud(0));
+ sel.curr.execWidth = 1;
+
+ GenRegister channelEn = GenRegister::offset(msgs[0], 0, 7*4);
+ channelEn.subphysical = 1;
+ // Enable all channels.
+ sel.MOV(channelEn, GenRegister::immud(0xffff));
+ sel.curr.execWidth = 8;
+ // Set zero LOD.
+ if (simdWidth == 8)
+ sel.MOV(msgs[4], GenRegister::immud(0));
+ else
+ sel.MOV(GenRegister::Qn(msgs[2], 0), GenRegister::immud(0));
+ sel.pop();
uint32_t bti = insn.getImageIndex();
- sel.TYPED_WRITE(src, insn.getSrcNum(), msgs, msgNum, bti, insn.is3D());
+ if (simdWidth == 8)
+ sel.TYPED_WRITE(msgs, msgNum, bti, insn.is3D());
+ else {
+ sel.push();
+ sel.curr.execWidth = 8;
+ for( uint32_t quarter = 0; quarter < 2; quarter++)
+ {
+ #define QUARTER_MOV0(msgs, msgid, src) \
+ sel.MOV(GenRegister::Qn(GenRegister::retype(msgs[msgid/2], GEN_TYPE_UD), msgid % 2), \
+ GenRegister::Qn(src, quarter))
+
+ #define QUARTER_MOV1(msgs, msgid, src) \
+ sel.MOV(GenRegister::Qn(GenRegister::retype(msgs[msgid/2], src.type), msgid % 2), \
+ GenRegister::Qn(src, quarter))
+ sel.curr.quarterControl = (quarter == 0) ? GEN_COMPRESSION_Q1 : GEN_COMPRESSION_Q2;
+ // Set U,V,W
+ QUARTER_MOV0(msgs, 1, sel.selReg(insn.getSrc(0), insn.getCoordType()));
+ QUARTER_MOV0(msgs, 2, sel.selReg(insn.getSrc(1), insn.getCoordType()));
+ if (insn.is3D())
+ QUARTER_MOV0(msgs, 3, sel.selReg(insn.getSrc(2), insn.getCoordType()));
+ // Set R, G, B, A
+ QUARTER_MOV1(msgs, 5, sel.selReg(insn.getSrc(3), insn.getSrcType()));
+ QUARTER_MOV1(msgs, 6, sel.selReg(insn.getSrc(4), insn.getSrcType()));
+ QUARTER_MOV1(msgs, 7, sel.selReg(insn.getSrc(5), insn.getSrcType()));
+ QUARTER_MOV1(msgs, 8, sel.selReg(insn.getSrc(6), insn.getSrcType()));
+ sel.TYPED_WRITE(msgs, msgNum, bti, insn.is3D());
+ #undef QUARTER_MOV0
+ #undef QUARTER_MOV1
+ }
+ sel.pop();
+ }
return true;
}
DECL_CTOR(TypedWriteInstruction, 1, 1);
diff --git a/backend/src/ocl_stdlib.tmpl.h b/backend/src/ocl_stdlib.tmpl.h
index cac03d6..4de93d3 100755
--- a/backend/src/ocl_stdlib.tmpl.h
+++ b/backend/src/ocl_stdlib.tmpl.h
@@ -4247,18 +4247,18 @@ OVERLOADABLE uint4 __gen_ocl_read_imageui(uint surface_id, sampler_t sampler, fl
OVERLOADABLE float4 __gen_ocl_read_imagef(uint surface_id, sampler_t sampler, float u, float v, float w, uint sampler_offset);
OVERLOADABLE void __gen_ocl_write_imagei(uint surface_id, int u, int v, int4 color);
-OVERLOADABLE void __gen_ocl_write_imagei(uint surface_id, float u, float v, int4 color);
+//OVERLOADABLE void __gen_ocl_write_imagei(uint surface_id, float u, float v, int4 color);
OVERLOADABLE void __gen_ocl_write_imageui(uint surface_id, int u, int v, uint4 color);
-OVERLOADABLE void __gen_ocl_write_imageui(uint surface_id, float u, float v, uint4 color);
+//OVERLOADABLE void __gen_ocl_write_imageui(uint surface_id, float u, float v, uint4 color);
OVERLOADABLE void __gen_ocl_write_imagef(uint surface_id, int u, int v, float4 color);
-OVERLOADABLE void __gen_ocl_write_imagef(uint surface_id, float u, float v, float4 color);
+//OVERLOADABLE void __gen_ocl_write_imagef(uint surface_id, float u, float v, float4 color);
OVERLOADABLE void __gen_ocl_write_imagei(uint surface_id, int u, int v, int w, int4 color);
-OVERLOADABLE void __gen_ocl_write_imagei(uint surface_id, float u, float v, float w, int4 color);
+//OVERLOADABLE void __gen_ocl_write_imagei(uint surface_id, float u, float v, float w, int4 color);
OVERLOADABLE void __gen_ocl_write_imageui(uint surface_id, int u, int v, int w, uint4 color);
-OVERLOADABLE void __gen_ocl_write_imageui(uint surface_id, float u, float v, float w, uint4 color);
+//OVERLOADABLE void __gen_ocl_write_imageui(uint surface_id, float u, float v, float w, uint4 color);
OVERLOADABLE void __gen_ocl_write_imagef(uint surface_id, int u, int v, int w, float4 color);
-OVERLOADABLE void __gen_ocl_write_imagef(uint surface_id, float u, float v, float w, float4 color);
+//OVERLOADABLE void __gen_ocl_write_imagef(uint surface_id, float u, float v, float w, float4 color);
int __gen_ocl_get_image_width(uint surface_id);
int __gen_ocl_get_image_height(uint surface_id);
int __gen_ocl_get_image_channel_data_type(uint surface_id);
--
1.7.9.5
More information about the Beignet
mailing list