[Beignet] [PATCH 2/2] Use 4bits vector immediate to optimize the a0 setting.
junyan.he at inbox.com
junyan.he at inbox.com
Tue Apr 14 08:02:27 PDT 2015
From: Junyan He <junyan.he at linux.intel.com>
We can first load a0 with the per-element offsets using a packed 4-bit
vector immediate (:V), and then add the base address with an add(8), like:
mov(8) a0<1>:UW 0x563412f0:V { align1 WE_all 1Q };
add(8) a0<1>:W a0<8,8,1>:W 3489:W { align1 WE_all 1Q };
Signed-off-by: Junyan He <junyan.he at linux.intel.com>
---
backend/src/backend/gen8_context.cpp | 8 ++---
backend/src/backend/gen8_context.hpp | 2 +-
backend/src/backend/gen_context.cpp | 58 ++++++++++++++++++++++++++--------
backend/src/backend/gen_context.hpp | 2 +-
4 files changed, 50 insertions(+), 20 deletions(-)
diff --git a/backend/src/backend/gen8_context.cpp b/backend/src/backend/gen8_context.cpp
index 2cdb248..b525236 100644
--- a/backend/src/backend/gen8_context.cpp
+++ b/backend/src/backend/gen8_context.cpp
@@ -92,7 +92,7 @@ namespace gbe
new_a0[1] = start_addr + 2;
new_a0[2] = start_addr + 1;
new_a0[3] = start_addr;
- this->setA0Content(new_a0, 0, 4);
+ this->setA0Content(new_a0, 4);
p->push();
p->curr.execWidth = 4;
@@ -152,7 +152,7 @@ namespace gbe
new_a0[14] = start_addr + 1;
new_a0[15] = start_addr;
}
- this->setA0Content(new_a0, 48);
+ this->setA0Content(new_a0);
p->push();
p->curr.execWidth = 16;
@@ -210,7 +210,7 @@ namespace gbe
new_a0[14] = start_addr + 1;
new_a0[15] = start_addr;
}
- this->setA0Content(new_a0, 48);
+ this->setA0Content(new_a0);
p->push();
p->curr.execWidth = 16;
@@ -858,7 +858,7 @@ namespace gbe
this->unpackLongVec(src, dst, p->curr.execWidth);
}
- void Gen8Context::setA0Content(uint16_t new_a0[16], uint16_t max_offset, int sz) {
+ void Gen8Context::setA0Content(uint16_t new_a0[16], int sz) {
if (sz == 0)
sz = 16;
GBE_ASSERT(sz%4 == 0);
diff --git a/backend/src/backend/gen8_context.hpp b/backend/src/backend/gen8_context.hpp
index b296a3d..96b3d69 100644
--- a/backend/src/backend/gen8_context.hpp
+++ b/backend/src/backend/gen8_context.hpp
@@ -72,7 +72,7 @@ namespace gbe
virtual void emitUnpackLongInstruction(const SelectionInstruction &insn);
protected:
- virtual void setA0Content(uint16_t new_a0[16], uint16_t max_offset = 0, int sz = 0);
+ virtual void setA0Content(uint16_t new_a0[16], int sz = 0);
virtual GenEncoder* generateEncoder(void) {
return GBE_NEW(Gen8Encoder, this->simdWidth, 8, deviceID);
}
diff --git a/backend/src/backend/gen_context.cpp b/backend/src/backend/gen_context.cpp
index 684ecaf..714da36 100644
--- a/backend/src/backend/gen_context.cpp
+++ b/backend/src/backend/gen_context.cpp
@@ -333,7 +333,7 @@ namespace gbe
new_a0[1] = start_addr + 2;
new_a0[2] = start_addr + 1;
new_a0[3] = start_addr;
- this->setA0Content(new_a0, 0, 4);
+ this->setA0Content(new_a0, 4);
p->push();
p->curr.execWidth = 4;
@@ -377,7 +377,7 @@ namespace gbe
new_a0[6] = start_addr + 1;
new_a0[7] = start_addr;
}
- this->setA0Content(new_a0, 56);
+ this->setA0Content(new_a0);
p->push();
p->curr.execWidth = 8;
@@ -421,7 +421,7 @@ namespace gbe
new_a0[6] = start_addr + 1;
new_a0[7] = start_addr;
}
- this->setA0Content(new_a0, 56);
+ this->setA0Content(new_a0);
p->push();
p->curr.execWidth = 8;
@@ -1946,21 +1946,51 @@ namespace gbe
p->TYPED_WRITE(header, true, bti);
}
- void GenContext::setA0Content(uint16_t new_a0[16], uint16_t max_offset, int sz) {
+ void GenContext::setA0Content(uint16_t new_a0[16], int sz) {
if (sz == 0)
sz = 8;
- GBE_ASSERT(sz%4 == 0);
- GBE_ASSERT(new_a0[0] >= 0 && new_a0[0] < 4096);
- p->push();
- p->curr.execWidth = 1;
- p->curr.predicate = GEN_PREDICATE_NONE;
- p->curr.noMask = 1;
- for (int i = 0; i < sz/2; i++) {
- p->MOV(GenRegister::retype(GenRegister::addr1(i*2), GEN_TYPE_UD),
- GenRegister::immud(new_a0[i*2 + 1] << 16 | new_a0[i*2]));
+ /* We use a0.0 as the base. If every a0.x sub-register differs from it by a value that fits in a signed 4-bit field, i.e. [-8, 7],
+ we can load the differences as one packed 8x4-bit signed vector immediate and then add the base to optimize it. */
+ bool inRange = true;
+ struct v4bits_help {
+ int8_t dummy:4;
+ int8_t val:4;
+ };
+ uint32_t diff = 0;
+ for (int i = 1; i < sz; i++) {
+ GBE_ASSERT(new_a0[i] >= 0 && new_a0[i] < 4096);
+ int16_t d = new_a0[i] - new_a0[0];
+ if (d < -8 || d > 7) {
+ inRange = false;
+ break;
+ }
+ v4bits_help dv;
+ dv.val = d;
+ diff = diff | ((dv.val & 0xf) << i*4);
+ }
+
+ if (inRange && sz >=4) {
+ p->push();
+ p->curr.execWidth = 8;
+ p->curr.predicate = GEN_PREDICATE_NONE;
+ p->curr.noMask = 1;
+ p->MOV(GenRegister::addr8(0), GenRegister::immv(diff));
+ p->ADD(GenRegister::retype(GenRegister::addr8(0), GEN_TYPE_W),
+ GenRegister::retype(GenRegister::addr8(0), GEN_TYPE_W),
+ GenRegister::immw(new_a0[0]));
+ p->pop();
+ } else {
+ p->push();
+ p->curr.execWidth = 1;
+ p->curr.predicate = GEN_PREDICATE_NONE;
+ p->curr.noMask = 1;
+ for (int i = 0; i < (sz+1)/2; i++) {
+ p->MOV(GenRegister::retype(GenRegister::addr1(i*2), GEN_TYPE_UD),
+ GenRegister::immud(new_a0[i*2 + 1] << 16 | new_a0[i*2]));
+ }
+ p->pop();
}
- p->pop();
}
BVAR(OCL_OUTPUT_REG_ALLOC, false);
diff --git a/backend/src/backend/gen_context.hpp b/backend/src/backend/gen_context.hpp
index 560248a..96e0574 100644
--- a/backend/src/backend/gen_context.hpp
+++ b/backend/src/backend/gen_context.hpp
@@ -208,7 +208,7 @@ namespace gbe
/*! allocate a new curbe register and insert to curbe pool. */
void allocCurbeReg(ir::Register reg, gbe_curbe_type value, uint32_t subValue = 0);
- virtual void setA0Content(uint16_t new_a0[16], uint16_t max_offset = 0, int sz = 0);
+ virtual void setA0Content(uint16_t new_a0[16], int sz = 0);
private:
CompileErrorCode errCode;
--
1.7.9.5
More information about the Beignet
mailing list