[Beignet] [PATCH 1/4] GBE: optimize sample instruction.
Zhigang Gong
zhigang.gong at intel.com
Tue Feb 18 16:38:12 PST 2014
The U, V and W coordinate registers can be allocated to a selection vector directly,
which saves some MOV instructions for the read_image functions.
Signed-off-by: Zhigang Gong <zhigang.gong at intel.com>
---
backend/src/backend/gen_context.cpp | 20 ++++----------------
backend/src/backend/gen_encoder.cpp | 4 ++--
backend/src/backend/gen_encoder.hpp | 2 +-
backend/src/backend/gen_insn_selection.cpp | 26 ++++++++++++--------------
backend/src/backend/gen_insn_selection.hpp | 2 +-
backend/src/ocl_stdlib.tmpl.h | 12 ++++++------
6 files changed, 26 insertions(+), 40 deletions(-)
diff --git a/backend/src/backend/gen_context.cpp b/backend/src/backend/gen_context.cpp
index 7a74856..addf96e 100644
--- a/backend/src/backend/gen_context.cpp
+++ b/backend/src/backend/gen_context.cpp
@@ -1784,23 +1784,11 @@ namespace gbe
const GenRegister msgPayload = GenRegister::retype(ra->genReg(insn.src(0)), GEN_TYPE_F);
const unsigned char bti = insn.extra.rdbti;
const unsigned char sampler = insn.extra.sampler;
- const GenRegister ucoord = ra->genReg(insn.src(4));
- const GenRegister vcoord = ra->genReg(insn.src(5));
+ const unsigned int msgLen = insn.extra.rdmsglen;
uint32_t simdWidth = p->curr.execWidth;
- uint32_t coord_cnt = 2;
- p->push();
- const uint32_t nr = msgPayload.nr;
- // prepare mesg desc and move to a0.0.
- // desc = bti | (sampler << 8) | (0 << 12) | (2 << 16) | (0 << 18) | (0 << 19) | (4 << 20) | (1 << 25) | (0 < 29) | (0 << 31)
- /* Prepare message payload. */
- p->MOV(GenRegister::f8grf(nr , 0), ucoord);
- p->MOV(GenRegister::f8grf(nr + (simdWidth/8), 0), vcoord);
- if (insn.extra.is3DRead) {
- p->MOV(GenRegister::f8grf(nr + (simdWidth/4), 0), ra->genReg(insn.src(6)));
- coord_cnt++;
- }
- p->SAMPLE(dst, msgPayload, false, bti, sampler, coord_cnt, simdWidth, -1, 0);
- p->pop();
+ //p->push();
+ p->SAMPLE(dst, msgPayload, msgLen, false, bti, sampler, simdWidth, -1, 0);
+ //p->pop();
}
void GenContext::scratchWrite(const GenRegister header, uint32_t offset, uint32_t reg_num, uint32_t reg_type, uint32_t channel_mode) {
diff --git a/backend/src/backend/gen_encoder.cpp b/backend/src/backend/gen_encoder.cpp
index aaf7dce..0664d77 100644
--- a/backend/src/backend/gen_encoder.cpp
+++ b/backend/src/backend/gen_encoder.cpp
@@ -1207,10 +1207,10 @@ namespace gbe
void GenEncoder::SAMPLE(GenRegister dest,
GenRegister msg,
+ unsigned int msg_len,
bool header_present,
unsigned char bti,
unsigned char sampler,
- unsigned int coord_cnt,
uint32_t simdWidth,
uint32_t writemask,
uint32_t return_format)
@@ -1219,7 +1219,7 @@ namespace gbe
uint32_t msg_type = (simdWidth == 16) ?
GEN_SAMPLER_MESSAGE_SIMD16_SAMPLE : GEN_SAMPLER_MESSAGE_SIMD8_SAMPLE;
uint32_t response_length = (4 * (simdWidth / 8));
- uint32_t msg_length = (coord_cnt * (simdWidth / 8));
+ uint32_t msg_length = (msg_len * (simdWidth / 8));
if (header_present)
msg_length++;
uint32_t simd_mode = (simdWidth == 16) ?
diff --git a/backend/src/backend/gen_encoder.hpp b/backend/src/backend/gen_encoder.hpp
index 13db6ae..094a5c2 100644
--- a/backend/src/backend/gen_encoder.hpp
+++ b/backend/src/backend/gen_encoder.hpp
@@ -167,10 +167,10 @@ namespace gbe
/*! Send instruction for the sampler */
void SAMPLE(GenRegister dest,
GenRegister msg,
+ unsigned int msg_len,
bool header_present,
unsigned char bti,
unsigned char sampler,
- unsigned int coord_cnt,
unsigned int simdWidth,
uint32_t writemask,
uint32_t return_format);
diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp
index 75ee906..d76f580 100644
--- a/backend/src/backend/gen_insn_selection.cpp
+++ b/backend/src/backend/gen_insn_selection.cpp
@@ -543,7 +543,7 @@ namespace gbe
/*! Encode ternary instructions */
void ALU3(SelectionOpcode opcode, Reg dst, Reg src0, Reg src1, Reg src2);
/*! Encode sample instructions */
- void SAMPLE(GenRegister *dst, uint32_t dstNum, GenRegister *src, uint32_t srcNum, GenRegister *msgPayloads, uint32_t msgNum, uint32_t bti, uint32_t sampler, bool is3D);
+ void SAMPLE(GenRegister *dst, uint32_t dstNum, GenRegister *msgPayloads, uint32_t msgNum, uint32_t bti, uint32_t sampler, bool is3D);
/*! Encode typed write instructions */
void TYPED_WRITE(GenRegister *src, uint32_t srcNum, GenRegister *msgs, uint32_t msgNum, uint32_t bti, bool is3D);
/*! Get image information */
@@ -1415,10 +1415,9 @@ namespace gbe
}
void Selection::Opaque::SAMPLE(GenRegister *dst, uint32_t dstNum,
- GenRegister *src, uint32_t srcNum,
GenRegister *msgPayloads, uint32_t msgNum,
uint32_t bti, uint32_t sampler, bool is3D) {
- SelectionInstruction *insn = this->appendInsn(SEL_OP_SAMPLE, dstNum, msgNum + srcNum);
+ SelectionInstruction *insn = this->appendInsn(SEL_OP_SAMPLE, dstNum, msgNum);
SelectionVector *dstVector = this->appendVector();
SelectionVector *msgVector = this->appendVector();
@@ -1427,8 +1426,6 @@ namespace gbe
insn->dst(elemID) = dst[elemID];
for (uint32_t elemID = 0; elemID < msgNum; ++elemID)
insn->src(elemID) = msgPayloads[elemID];
- for (uint32_t elemID = 0; elemID < srcNum; ++elemID)
- insn->src(msgNum + elemID) = src[elemID];
// Sends require contiguous allocation
dstVector->regNum = dstNum;
@@ -1442,7 +1439,7 @@ namespace gbe
insn->extra.rdbti = bti;
insn->extra.sampler = sampler;
- insn->extra.is3DRead = is3D;
+ insn->extra.rdmsglen = msgNum;
}
///////////////////////////////////////////////////////////////////////////
@@ -3009,23 +3006,24 @@ namespace gbe
{
using namespace ir;
GenRegister msgPayloads[4];
- GenRegister dst[insn.getDstNum()], src[insn.getSrcNum()];
+ GenRegister dst[insn.getDstNum()];
uint32_t srcNum = insn.getSrcNum();
+ uint32_t valueID = 0;
- for( int i = 0; i < 4; ++i)
- msgPayloads[i] = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32);
-
- for (uint32_t valueID = 0; valueID < insn.getDstNum(); ++valueID)
+ for (valueID = 0; valueID < insn.getDstNum(); ++valueID)
dst[valueID] = sel.selReg(insn.getDst(valueID), insn.getDstType());
- for (uint32_t valueID = 0; valueID < srcNum; ++valueID)
- src[valueID] = sel.selReg(insn.getSrc(valueID), insn.getSrcType());
+ if (!insn.is3D())
+ srcNum--;
+ /* U, V, [W] */
+ for (valueID = 0; valueID < srcNum; ++valueID)
+ msgPayloads[valueID] = sel.selReg(insn.getSrc(valueID), insn.getSrcType());
uint32_t bti = insn.getImageIndex();
/* We have the clamp border workaround. */
uint32_t sampler = insn.getSamplerIndex() + insn.getSamplerOffset() * 8;
- sel.SAMPLE(dst, insn.getDstNum(), src, srcNum, msgPayloads, 4, bti, sampler, insn.is3D());
+ sel.SAMPLE(dst, insn.getDstNum(), msgPayloads, srcNum, bti, sampler, insn.is3D());
return true;
}
DECL_CTOR(SampleInstruction, 1, 1);
diff --git a/backend/src/backend/gen_insn_selection.hpp b/backend/src/backend/gen_insn_selection.hpp
index 09e6762..cb80d7c 100644
--- a/backend/src/backend/gen_insn_selection.hpp
+++ b/backend/src/backend/gen_insn_selection.hpp
@@ -120,7 +120,7 @@ namespace gbe
struct {
uint16_t rdbti:8;
uint16_t sampler:5;
- uint16_t is3DRead:1;
+ uint16_t rdmsglen:3;
};
uint32_t barrierType;
} extra;
diff --git a/backend/src/ocl_stdlib.tmpl.h b/backend/src/ocl_stdlib.tmpl.h
index d2cc144..cac03d6 100755
--- a/backend/src/ocl_stdlib.tmpl.h
+++ b/backend/src/ocl_stdlib.tmpl.h
@@ -4232,18 +4232,18 @@ int __gen_ocl_force_simd16(void);
// Image access functions
/////////////////////////////////////////////////////////////////////////////
-OVERLOADABLE int4 __gen_ocl_read_imagei(uint surface_id, sampler_t sampler, int u, int v, uint sampler_offset);
+//OVERLOADABLE int4 __gen_ocl_read_imagei(uint surface_id, sampler_t sampler, int u, int v, uint sampler_offset);
OVERLOADABLE int4 __gen_ocl_read_imagei(uint surface_id, sampler_t sampler, float u, float v, uint sampler_offset);
-OVERLOADABLE uint4 __gen_ocl_read_imageui(uint surface_id, sampler_t sampler, int u, int v, uint sampler_offset);
+//OVERLOADABLE uint4 __gen_ocl_read_imageui(uint surface_id, sampler_t sampler, int u, int v, uint sampler_offset);
OVERLOADABLE uint4 __gen_ocl_read_imageui(uint surface_id, sampler_t sampler, float u, float v, uint sampler_offset);
-OVERLOADABLE float4 __gen_ocl_read_imagef(uint surface_id, sampler_t sampler, int u, int v, uint sampler_offset);
+//OVERLOADABLE float4 __gen_ocl_read_imagef(uint surface_id, sampler_t sampler, int u, int v, uint sampler_offset);
OVERLOADABLE float4 __gen_ocl_read_imagef(uint surface_id, sampler_t sampler, float u, float v, uint sampler_offset);
-OVERLOADABLE int4 __gen_ocl_read_imagei(uint surface_id, sampler_t sampler, int u, int v, int w, uint sampler_offset);
+//OVERLOADABLE int4 __gen_ocl_read_imagei(uint surface_id, sampler_t sampler, int u, int v, int w, uint sampler_offset);
OVERLOADABLE int4 __gen_ocl_read_imagei(uint surface_id, sampler_t sampler, float u, float v, float w, uint sampler_offset);
-OVERLOADABLE uint4 __gen_ocl_read_imageui(uint surface_id, sampler_t sampler, int u, int v, int w, uint sampler_offset);
+//OVERLOADABLE uint4 __gen_ocl_read_imageui(uint surface_id, sampler_t sampler, int u, int v, int w, uint sampler_offset);
OVERLOADABLE uint4 __gen_ocl_read_imageui(uint surface_id, sampler_t sampler, float u, float v, float w, uint sampler_offset);
-OVERLOADABLE float4 __gen_ocl_read_imagef(uint surface_id, sampler_t sampler, int u, int v, int w, uint sampler_offset);
+//OVERLOADABLE float4 __gen_ocl_read_imagef(uint surface_id, sampler_t sampler, int u, int v, int w, uint sampler_offset);
OVERLOADABLE float4 __gen_ocl_read_imagef(uint surface_id, sampler_t sampler, float u, float v, float w, uint sampler_offset);
OVERLOADABLE void __gen_ocl_write_imagei(uint surface_id, int u, int v, int4 color);
--
1.7.9.5
More information about the Beignet mailing list