[Beignet] [PATCH V2] gbe: Implement a new BTI solution to support dynamic bti
Ruiling Song
ruiling.song at intel.com
Wed May 20 20:07:30 PDT 2015
The old implementation statically analyzes the pointer base and thus
assigns compile-time BTIs to all memory access instructions. The new implementation
introduces a virtual register to hold the BTI value for each memory access instruction.
The main benefit of this new method is that it can handle storing/loading pointers.
This is a big step towards supporting storing/loading pointers.
Consider the following example:
void @compiler_mixed_pointer1(i32 addrspace(1)* readonly %src, i32 addrspace(1)* %dst1, i32 addrspace(1)* %dst2) {
%cmp = icmp slt i32 %add4.i, 5
%cond = select i1 %cmp, i32 addrspace(1)* %dst1, i32 addrspace(1)* %dst2
store i32 %6, i32 addrspace(1)* %10, align 4, !tbaa !31
}
will be changed to:
void @compiler_mixed_pointer1(i32 addrspace(1)* readonly %src, i32 addrspace(1)* %dst1, i32 addrspace(1)* %dst2) {
%cmp = icmp slt i32 %add4.i, 5
// newly added instructions:
// %0 holds the value of the BTI: '3' is the bti of dst1, '4' is the bti of dst2
// %1 holds the starting address of the buffer bound to that BTI, which will be subtracted.
%0 = select i1 %cmp, i32 3, i32 4
%1 = select i1 %cmp, i32 addrspace(1)* %dst1, i32 addrspace(1)* %dst2
%cond = select i1 %cmp, i32 addrspace(1)* %dst1, i32 addrspace(1)* %dst2
store i32 %cond, i32 addrspace(1)* %10, align 4
}
The idea of the solution is: check the bti register and select the bti of one lane that has not been accessed yet (through 'lzd'),
issue the send message to that bti, then get the remaining un-accessed lanes and repeat the steps until all lanes are handled.
For a mixed pointer, the final asm looks like below:
(g118 (offset 0xec0) is the register that holds the bti of all lanes)
((31-lzd(active_lane_mask))*4 + bti_reg_start) is the register offset from which the target bti for this iteration is read
As the gen backend currently only allows one flag register per SelectionInstruction,
I have to save the flag at (54) and restore it at (64) in the example below.
( 38) mov(1) f0.1<2>:UW 0x0UW { align1 WE_all };
( 40) cmp.ne.f0.1(16) null:F f0.1<0,1,0>:UW 0x1UW { align1 WE_normal 1H switch };
( 42) and(1) g8.2<1>:UD f0.1<0,1,0>:UW 0xffffffffUD { align1 WE_all };
( 44) lzd(1) g8.2<1>:UD g8.2<0,1,0>:UD { align1 WE_all };
( 46) add(1) g8.4<2>:UW -g8.4<0,1,0>:UW 0x1fUW { align1 WE_all };
( 48) mul(1) g8.4<2>:UW g8.4<0,1,0>:UW 0x4UW { align1 WE_all };
( 50) add(1) a0<2>:UW g8.4<0,1,0>:UW 0xec0UD { align1 WE_all };
( 52) mov(1) g8.2<1>:UD g[a0]<0,1,0>:UD { align1 WE_all };
( 54) mov(1) g121.14<2>:UW f0.1<0,1,0>:UW { align1 WE_all };
( 56) cmp.e.f0.1(8) null:F g118<8,8,1>:UD g8.2<0,1,0>:UD { align1 WE_normal 1Q switch };
( 58) cmp.e.f0.1(8) null:F g119<8,8,1>:UD g8.2<0,1,0>:UD { align1 WE_normal 2Q switch };
( 60) or(1) a0<1>:UD g8.8<0,1,0>:UB 0x8035e00UD { align1 WE_all };
( 62) (+f0.1) send(16) null:UW g104<8,8,1>:UD a0<0,1,0>:UW
data { align1 WE_normal 1H };
( 64) mov(1) f0.1<2>:UW g121.14<0,1,0>:UW { align1 WE_all };
( 66) (+f0.1) cmp.ne.f0.1(8) null:F g118<8,8,1>:UD g8.2<0,1,0>:UD { align1 WE_normal 1Q switch };
( 68) (+f0.1) cmp.ne.f0.1(8) null:F g119<8,8,1>:UD g8.2<0,1,0>:UD { align1 WE_normal 2Q switch };
( 70) (+f0.1) while(16) -28 { align1 WE_normal 1H };
v2:
1. remove markAllChildrenExceptBTI, instead detach child0 before marking children.
2. fix a signed/unsigned warning in instruction.cpp
3. when the pointer operand of a load/store instruction is the same as its origin,
don't add it to pointerOrigMap
4. unify the elementType when creating a PHINode in getPointerBase
5. provide separate APIs setXXXMessageDesc() and generateXXXMessageDesc()
6. refine GenContext::emitStackPointer() and Gen75Context::emitStackPointer().
7. reuse isImageType in function.hpp
8. add function getKernelFunctionMetadata(Function *)
Signed-off-by: Ruiling Song <ruiling.song at intel.com>
---
backend/src/backend/gen/gen_mesa_disasm.c | 100 ++---
backend/src/backend/gen75_context.cpp | 4 +-
backend/src/backend/gen75_encoder.cpp | 80 +++-
backend/src/backend/gen75_encoder.hpp | 9 +-
backend/src/backend/gen8_context.cpp | 45 +-
backend/src/backend/gen8_encoder.cpp | 79 +++-
backend/src/backend/gen8_encoder.hpp | 9 +-
backend/src/backend/gen_context.cpp | 192 ++++++++-
backend/src/backend/gen_context.hpp | 2 +
backend/src/backend/gen_encoder.cpp | 172 ++++++--
backend/src/backend/gen_encoder.hpp | 22 +-
backend/src/backend/gen_insn_selection.cpp | 462 +++++++++++++--------
backend/src/backend/gen_insn_selection.hpp | 20 +-
backend/src/backend/program.h | 1 +
backend/src/ir/context.hpp | 8 +-
backend/src/ir/instruction.cpp | 109 +++--
backend/src/ir/instruction.hpp | 36 +-
backend/src/ir/profile.cpp | 4 +-
backend/src/ir/profile.hpp | 3 +-
backend/src/llvm/llvm_gen_backend.cpp | 639 +++++++++++++++++++++++------
20 files changed, 1439 insertions(+), 557 deletions(-)
diff --git a/backend/src/backend/gen/gen_mesa_disasm.c b/backend/src/backend/gen/gen_mesa_disasm.c
index 705f5e2..adf4e58 100644
--- a/backend/src/backend/gen/gen_mesa_disasm.c
+++ b/backend/src/backend/gen/gen_mesa_disasm.c
@@ -99,8 +99,8 @@ static const struct {
[GEN_OPCODE_CMP] = { .name = "cmp", .nsrc = 2, .ndst = 1 },
[GEN_OPCODE_CMPN] = { .name = "cmpn", .nsrc = 2, .ndst = 1 },
- [GEN_OPCODE_SEND] = { .name = "send", .nsrc = 1, .ndst = 1 },
- [GEN_OPCODE_SENDC] = { .name = "sendc", .nsrc = 1, .ndst = 1 },
+ [GEN_OPCODE_SEND] = { .name = "send", .nsrc = 2, .ndst = 1 },
+ [GEN_OPCODE_SENDC] = { .name = "sendc", .nsrc = 2, .ndst = 1 },
[GEN_OPCODE_NOP] = { .name = "nop", .nsrc = 0, .ndst = 0 },
[GEN_OPCODE_JMPI] = { .name = "jmpi", .nsrc = 0, .ndst = 0 },
[GEN_OPCODE_BRD] = { .name = "brd", .nsrc = 0, .ndst = 0 },
@@ -1258,59 +1258,61 @@ int gen_disasm (FILE *file, const void *inst, uint32_t deviceID, uint32_t compac
target, &space);
}
- switch (target) {
- case GEN_SFID_SAMPLER:
- format(file, " (%d, %d, %d, %d)",
- SAMPLE_BTI(inst),
- SAMPLER(inst),
- SAMPLER_MSG_TYPE(inst),
- SAMPLER_SIMD_MODE(inst));
- break;
- case GEN_SFID_DATAPORT_DATA:
- if(UNTYPED_RW_CATEGORY(inst) == 0) {
+ if (GEN_BITS_FIELD2(inst, bits1.da1.src1_reg_file, bits2.da1.src1_reg_file) == GEN_IMMEDIATE_VALUE) {
+ switch (target) {
+ case GEN_SFID_SAMPLER:
+ format(file, " (%d, %d, %d, %d)",
+ SAMPLE_BTI(inst),
+ SAMPLER(inst),
+ SAMPLER_MSG_TYPE(inst),
+ SAMPLER_SIMD_MODE(inst));
+ break;
+ case GEN_SFID_DATAPORT_DATA:
+ if(UNTYPED_RW_CATEGORY(inst) == 0) {
+ format(file, " (bti: %d, rgba: %d, %s, %s, %s)",
+ UNTYPED_RW_BTI(inst),
+ UNTYPED_RW_RGBA(inst),
+ data_port_data_cache_simd_mode[UNTYPED_RW_SIMD_MODE(inst)],
+ data_port_data_cache_category[UNTYPED_RW_CATEGORY(inst)],
+ data_port_data_cache_msg_type[UNTYPED_RW_MSG_TYPE(inst)]);
+ } else {
+ format(file, " (addr: %d, blocks: %s, %s, mode: %s, %s)",
+ SCRATCH_RW_OFFSET(inst),
+ data_port_scratch_block_size[SCRATCH_RW_BLOCK_SIZE(inst)],
+ data_port_scratch_invalidate[SCRATCH_RW_INVALIDATE_AFTER_READ(inst)],
+ data_port_scratch_channel_mode[SCRATCH_RW_CHANNEL_MODE(inst)],
+ data_port_scratch_msg_type[SCRATCH_RW_MSG_TYPE(inst)]);
+ }
+ break;
+ case GEN_SFID_DATAPORT1_DATA:
format(file, " (bti: %d, rgba: %d, %s, %s, %s)",
UNTYPED_RW_BTI(inst),
UNTYPED_RW_RGBA(inst),
data_port_data_cache_simd_mode[UNTYPED_RW_SIMD_MODE(inst)],
data_port_data_cache_category[UNTYPED_RW_CATEGORY(inst)],
- data_port_data_cache_msg_type[UNTYPED_RW_MSG_TYPE(inst)]);
- } else {
- format(file, " (addr: %d, blocks: %s, %s, mode: %s, %s)",
- SCRATCH_RW_OFFSET(inst),
- data_port_scratch_block_size[SCRATCH_RW_BLOCK_SIZE(inst)],
- data_port_scratch_invalidate[SCRATCH_RW_INVALIDATE_AFTER_READ(inst)],
- data_port_scratch_channel_mode[SCRATCH_RW_CHANNEL_MODE(inst)],
- data_port_scratch_msg_type[SCRATCH_RW_MSG_TYPE(inst)]);
- }
- break;
- case GEN_SFID_DATAPORT1_DATA:
- format(file, " (bti: %d, rgba: %d, %s, %s, %s)",
- UNTYPED_RW_BTI(inst),
- UNTYPED_RW_RGBA(inst),
- data_port_data_cache_simd_mode[UNTYPED_RW_SIMD_MODE(inst)],
- data_port_data_cache_category[UNTYPED_RW_CATEGORY(inst)],
- data_port1_data_cache_msg_type[UNTYPED_RW_MSG_TYPE(inst)]);
- break;
- case GEN_SFID_DATAPORT_CONSTANT:
- format(file, " (bti: %d, %s)",
- DWORD_RW_BTI(inst),
- data_port_data_cache_msg_type[DWORD_RW_MSG_TYPE(inst)]);
- break;
- case GEN_SFID_MESSAGE_GATEWAY:
- format(file, " (subfunc: %s, notify: %d, ackreq: %d)",
- gateway_sub_function[MSG_GW_SUBFUNC(inst)],
- MSG_GW_NOTIFY(inst),
- MSG_GW_ACKREQ(inst));
- break;
-
- default:
- format(file, "unsupported target %d", target);
- break;
+ data_port1_data_cache_msg_type[UNTYPED_RW_MSG_TYPE(inst)]);
+ break;
+ case GEN_SFID_DATAPORT_CONSTANT:
+ format(file, " (bti: %d, %s)",
+ DWORD_RW_BTI(inst),
+ data_port_data_cache_msg_type[DWORD_RW_MSG_TYPE(inst)]);
+ break;
+ case GEN_SFID_MESSAGE_GATEWAY:
+ format(file, " (subfunc: %s, notify: %d, ackreq: %d)",
+ gateway_sub_function[MSG_GW_SUBFUNC(inst)],
+ MSG_GW_NOTIFY(inst),
+ MSG_GW_ACKREQ(inst));
+ break;
+
+ default:
+ format(file, "unsupported target %d", target);
+ break;
+ }
+ if (space)
+ string(file, " ");
+ format(file, "mlen %d", GENERIC_MSG_LENGTH(inst));
+ format(file, " rlen %d", GENERIC_RESPONSE_LENGTH(inst));
}
- if (space)
- string(file, " ");
- format(file, "mlen %d", GENERIC_MSG_LENGTH(inst));
- format(file, " rlen %d", GENERIC_RESPONSE_LENGTH(inst));
}
pad(file, 64);
if (OPCODE(inst) != GEN_OPCODE_NOP) {
diff --git a/backend/src/backend/gen75_context.cpp b/backend/src/backend/gen75_context.cpp
index a830260..caf7043 100644
--- a/backend/src/backend/gen75_context.cpp
+++ b/backend/src/backend/gen75_context.cpp
@@ -84,10 +84,9 @@ namespace gbe
GenRegister::ud8grf(ir::ocl::stackptr) :
GenRegister::ud16grf(ir::ocl::stackptr);
const GenRegister stackptr = ra->genReg(selStatckPtr);
- const GenRegister selStackBuffer = GenRegister::ud1grf(ir::ocl::stackbuffer);
- const GenRegister bufferptr = ra->genReg(selStackBuffer);
// We compute the per-lane stack pointer here
+ // private address start from zero
p->push();
p->curr.execWidth = 1;
p->curr.predicate = GEN_PREDICATE_NONE;
@@ -102,7 +101,6 @@ namespace gbe
p->ADD(GenRegister::ud1grf(126,0), GenRegister::ud1grf(126,0), GenRegister::ud1grf(126, 4));
p->SHL(GenRegister::ud1grf(126,0), GenRegister::ud1grf(126,0), GenRegister::immud(perThreadShift));
p->curr.execWidth = this->simdWidth;
- p->ADD(stackptr, stackptr, bufferptr);
p->ADD(stackptr, stackptr, GenRegister::ud1grf(126,0));
p->pop();
}
diff --git a/backend/src/backend/gen75_encoder.cpp b/backend/src/backend/gen75_encoder.cpp
index c77ce4d..602f9c7 100644
--- a/backend/src/backend/gen75_encoder.cpp
+++ b/backend/src/backend/gen75_encoder.cpp
@@ -96,8 +96,7 @@ namespace gbe
gen7_insn->bits3.gen7_typed_rw.slot = 1;
}
- void Gen75Encoder::ATOMIC(GenRegister dst, uint32_t function, GenRegister src, uint32_t bti, uint32_t srcNum) {
- GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
+ unsigned Gen75Encoder::setAtomicMessageDesc(GenNativeInstruction *insn, unsigned function, unsigned bti, unsigned srcNum) {
Gen7NativeInstruction *gen7_insn = &insn->gen7_insn;
uint32_t msg_length = 0;
uint32_t response_length = 0;
@@ -111,11 +110,6 @@ namespace gbe
} else
NOT_IMPLEMENTED;
- this->setHeader(insn);
- this->setDst(insn, GenRegister::uw16grf(dst.nr, 0));
- this->setSrc0(insn, GenRegister::ud8grf(src.nr, 0));
- this->setSrc1(insn, GenRegister::immud(0));
-
const GenMessageTarget sfid = GEN_SFID_DATAPORT1_DATA;
setMessageDescriptor(insn, sfid, msg_length, response_length);
gen7_insn->bits3.gen7_atomic_op.msg_type = GEN75_P1_UNTYPED_ATOMIC_OP;
@@ -129,11 +123,26 @@ namespace gbe
gen7_insn->bits3.gen7_atomic_op.simd_mode = GEN_ATOMIC_SIMD16;
else
NOT_SUPPORTED;
+ return gen7_insn->bits3.ud;
}
- void Gen75Encoder::UNTYPED_READ(GenRegister dst, GenRegister src, uint32_t bti, uint32_t elemNum) {
+ void Gen75Encoder::ATOMIC(GenRegister dst, uint32_t function, GenRegister src, GenRegister bti, uint32_t srcNum) {
GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
- assert(elemNum >= 1 || elemNum <= 4);
+
+ this->setHeader(insn);
+ insn->header.destreg_or_condmod = GEN_SFID_DATAPORT1_DATA;
+
+ this->setDst(insn, GenRegister::uw16grf(dst.nr, 0));
+ this->setSrc0(insn, GenRegister::ud8grf(src.nr, 0));
+ if (bti.file == GEN_IMMEDIATE_VALUE) {
+ this->setSrc1(insn, GenRegister::immud(0));
+ setAtomicMessageDesc(insn, function, bti.value.ud, srcNum);
+ } else {
+ this->setSrc1(insn, bti);
+ }
+ }
+
+ unsigned Gen75Encoder::setUntypedReadMessageDesc(GenNativeInstruction *insn, unsigned bti, unsigned elemNum) {
uint32_t msg_length = 0;
uint32_t response_length = 0;
if (this->curr.execWidth == 8) {
@@ -144,44 +153,75 @@ namespace gbe
response_length = 2 * elemNum;
} else
NOT_IMPLEMENTED;
-
- this->setHeader(insn);
- this->setDst(insn, GenRegister::uw16grf(dst.nr, 0));
- this->setSrc0(insn, GenRegister::ud8grf(src.nr, 0));
- this->setSrc1(insn, GenRegister::immud(0));
setDPUntypedRW(insn,
bti,
untypedRWMask[elemNum],
GEN75_P1_UNTYPED_READ,
msg_length,
response_length);
+ return insn->bits3.ud;
}
- void Gen75Encoder::UNTYPED_WRITE(GenRegister msg, uint32_t bti, uint32_t elemNum) {
+ void Gen75Encoder::UNTYPED_READ(GenRegister dst, GenRegister src, GenRegister bti, uint32_t elemNum) {
GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
assert(elemNum >= 1 || elemNum <= 4);
+
+ this->setHeader(insn);
+ this->setDst(insn, GenRegister::uw16grf(dst.nr, 0));
+ this->setSrc0(insn, GenRegister::ud8grf(src.nr, 0));
+ this->setSrc1(insn, GenRegister::immud(0));
+ insn->header.destreg_or_condmod = GEN_SFID_DATAPORT1_DATA;
+
+ if (bti.file == GEN_IMMEDIATE_VALUE) {
+ this->setSrc1(insn, GenRegister::immud(0));
+ setUntypedReadMessageDesc(insn, bti.value.ud, elemNum);
+ } else {
+ this->setSrc1(insn, bti);
+ }
+ }
+
+ unsigned Gen75Encoder::setUntypedWriteMessageDesc(GenNativeInstruction *insn, unsigned bti, unsigned elemNum) {
uint32_t msg_length = 0;
uint32_t response_length = 0;
- this->setHeader(insn);
if (this->curr.execWidth == 8) {
- this->setDst(insn, GenRegister::retype(GenRegister::null(), GEN_TYPE_UD));
msg_length = 1 + elemNum;
} else if (this->curr.execWidth == 16) {
- this->setDst(insn, GenRegister::retype(GenRegister::null(), GEN_TYPE_UW));
msg_length = 2 * (1 + elemNum);
}
else
NOT_IMPLEMENTED;
- this->setSrc0(insn, GenRegister::ud8grf(msg.nr, 0));
- this->setSrc1(insn, GenRegister::immud(0));
setDPUntypedRW(insn,
bti,
untypedRWMask[elemNum],
GEN75_P1_UNTYPED_SURFACE_WRITE,
msg_length,
response_length);
+ return insn->bits3.ud;
}
+ void Gen75Encoder::UNTYPED_WRITE(GenRegister msg, GenRegister bti, uint32_t elemNum) {
+ GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
+ assert(elemNum >= 1 || elemNum <= 4);
+ this->setHeader(insn);
+ insn->header.destreg_or_condmod = GEN_SFID_DATAPORT_DATA;
+ if (this->curr.execWidth == 8) {
+ this->setDst(insn, GenRegister::retype(GenRegister::null(), GEN_TYPE_UD));
+ } else if (this->curr.execWidth == 16) {
+ this->setDst(insn, GenRegister::retype(GenRegister::null(), GEN_TYPE_UW));
+ }
+ else
+ NOT_IMPLEMENTED;
+ this->setSrc0(insn, GenRegister::ud8grf(msg.nr, 0));
+
+ if (bti.file == GEN_IMMEDIATE_VALUE) {
+ this->setSrc1(insn, GenRegister::immud(0));
+ setUntypedWriteMessageDesc(insn, bti.value.ud, elemNum);
+ } else {
+ this->setSrc1(insn, bti);
+ }
+ }
+
+
void Gen75Encoder::LOAD_DF_IMM(GenRegister dest, GenRegister tmp, double value) {
union { double d; unsigned u[2]; } u;
u.d = value;
diff --git a/backend/src/backend/gen75_encoder.hpp b/backend/src/backend/gen75_encoder.hpp
index 9545157..5d80bbd 100644
--- a/backend/src/backend/gen75_encoder.hpp
+++ b/backend/src/backend/gen75_encoder.hpp
@@ -48,15 +48,18 @@ namespace gbe
virtual int getDoubleExecWidth(void) { return GEN75_DOUBLE_EXEC_WIDTH; }
virtual void MOV_DF(GenRegister dest, GenRegister src0, GenRegister tmp = GenRegister::null());
virtual void LOAD_DF_IMM(GenRegister dest, GenRegister tmp, double value);
- virtual void ATOMIC(GenRegister dst, uint32_t function, GenRegister src, uint32_t bti, uint32_t srcNum);
- virtual void UNTYPED_READ(GenRegister dst, GenRegister src, uint32_t bti, uint32_t elemNum);
- virtual void UNTYPED_WRITE(GenRegister src, uint32_t bti, uint32_t elemNum);
+ virtual void ATOMIC(GenRegister dst, uint32_t function, GenRegister src, GenRegister bti, uint32_t srcNum);
+ virtual void UNTYPED_READ(GenRegister dst, GenRegister src, GenRegister bti, uint32_t elemNum);
+ virtual void UNTYPED_WRITE(GenRegister src, GenRegister bti, uint32_t elemNum);
virtual void setHeader(GenNativeInstruction *insn);
virtual void setDPUntypedRW(GenNativeInstruction *insn, uint32_t bti, uint32_t rgba,
uint32_t msg_type, uint32_t msg_length, uint32_t response_length);
virtual void setTypedWriteMessage(GenNativeInstruction *insn, unsigned char bti,
unsigned char msg_type, uint32_t msg_length,
bool header_present);
+ virtual unsigned setAtomicMessageDesc(GenNativeInstruction *insn, unsigned function, unsigned bti, unsigned srcNum);
+ virtual unsigned setUntypedReadMessageDesc(GenNativeInstruction *insn, unsigned bti, unsigned elemNum);
+ virtual unsigned setUntypedWriteMessageDesc(GenNativeInstruction *insn, unsigned bti, unsigned elemNum);
};
}
#endif /* __GBE_GEN75_ENCODER_HPP__ */
diff --git a/backend/src/backend/gen8_context.cpp b/backend/src/backend/gen8_context.cpp
index 834a3be..69d3916 100644
--- a/backend/src/backend/gen8_context.cpp
+++ b/backend/src/backend/gen8_context.cpp
@@ -817,19 +817,33 @@ namespace gbe
p->pop();
}
}
-
void Gen8Context::emitRead64Instruction(const SelectionInstruction &insn)
{
- const uint32_t bti = insn.getbti();
const uint32_t elemNum = insn.extra.elem;
GBE_ASSERT(elemNum == 1);
- const GenRegister addr = ra->genReg(insn.src(0));
- const GenRegister tmp_dst = ra->genReg(insn.dst(0));
+ const GenRegister dst = ra->genReg(insn.dst(0));
+ const GenRegister src = ra->genReg(insn.src(0));
+ const GenRegister bti = ra->genReg(insn.src(1));
/* Because BDW's store and load send instructions for 64 bits require the bti to be surfaceless,
which we can not accept. We just fallback to 2 DW untyperead here. */
- p->UNTYPED_READ(tmp_dst, addr, bti, elemNum*2);
+ if (bti.file == GEN_IMMEDIATE_VALUE) {
+ p->UNTYPED_READ(dst, src, bti, 2*elemNum);
+ } else {
+ const GenRegister tmp = ra->genReg(insn.dst(2*elemNum));
+ unsigned desc = p->generateUntypedReadMessageDesc(0, 2*elemNum);
+
+ unsigned jip0 = beforeMessage(insn, bti, tmp, desc);
+
+ //predicated load
+ p->push();
+ p->curr.predicate = GEN_PREDICATE_NORMAL;
+ p->curr.useFlag(insn.state.flag, insn.state.subFlag);
+ p->UNTYPED_READ(dst, src, GenRegister::retype(GenRegister::addr1(0), GEN_TYPE_UD), 2*elemNum);
+ p->pop();
+ afterMessage(insn, bti, tmp, jip0);
+ }
for (uint32_t elemID = 0; elemID < elemNum; elemID++) {
GenRegister long_tmp = ra->genReg(insn.dst(elemID));
@@ -840,11 +854,10 @@ namespace gbe
void Gen8Context::emitWrite64Instruction(const SelectionInstruction &insn)
{
- const uint32_t bti = insn.getbti();
const uint32_t elemNum = insn.extra.elem;
GBE_ASSERT(elemNum == 1);
-
const GenRegister addr = ra->genReg(insn.src(elemNum));
+ const GenRegister bti = ra->genReg(insn.src(elemNum*2+1));
/* Because BDW's store and load send instructions for 64 bits require the bti to be surfaceless,
which we can not accept. We just fallback to 2 DW untypewrite here. */
@@ -854,9 +867,23 @@ namespace gbe
this->unpackLongVec(the_long, long_tmp, p->curr.execWidth);
}
- p->UNTYPED_WRITE(addr, bti, elemNum*2);
- }
+ if (bti.file == GEN_IMMEDIATE_VALUE) {
+ p->UNTYPED_WRITE(addr, bti, elemNum*2);
+ } else {
+ const GenRegister tmp = ra->genReg(insn.dst(elemNum));
+ unsigned desc = p->generateUntypedWriteMessageDesc(0, elemNum*2);
+
+ unsigned jip0 = beforeMessage(insn, bti, tmp, desc);
+ //predicated load
+ p->push();
+ p->curr.predicate = GEN_PREDICATE_NORMAL;
+ p->curr.useFlag(insn.state.flag, insn.state.subFlag);
+ p->UNTYPED_WRITE(addr, GenRegister::addr1(0), elemNum*2);
+ p->pop();
+ afterMessage(insn, bti, tmp, jip0);
+ }
+ }
void Gen8Context::emitPackLongInstruction(const SelectionInstruction &insn) {
const GenRegister src = ra->genReg(insn.src(0));
const GenRegister dst = ra->genReg(insn.dst(0));
diff --git a/backend/src/backend/gen8_encoder.cpp b/backend/src/backend/gen8_encoder.cpp
index f02a2ca..fd35838 100644
--- a/backend/src/backend/gen8_encoder.cpp
+++ b/backend/src/backend/gen8_encoder.cpp
@@ -103,9 +103,7 @@ namespace gbe
void Gen8Encoder::F32TO16(GenRegister dest, GenRegister src0) {
MOV(GenRegister::retype(dest, GEN_TYPE_HF), GenRegister::retype(src0, GEN_TYPE_F));
}
-
- void Gen8Encoder::ATOMIC(GenRegister dst, uint32_t function, GenRegister src, uint32_t bti, uint32_t srcNum) {
- GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
+ unsigned Gen8Encoder::setAtomicMessageDesc(GenNativeInstruction *insn, unsigned function, unsigned bti, unsigned srcNum) {
Gen8NativeInstruction *gen8_insn = &insn->gen8_insn;
uint32_t msg_length = 0;
uint32_t response_length = 0;
@@ -119,11 +117,6 @@ namespace gbe
} else
NOT_IMPLEMENTED;
- this->setHeader(insn);
- this->setDst(insn, GenRegister::uw16grf(dst.nr, 0));
- this->setSrc0(insn, GenRegister::ud8grf(src.nr, 0));
- this->setSrc1(insn, GenRegister::immud(0));
-
const GenMessageTarget sfid = GEN_SFID_DATAPORT1_DATA;
setMessageDescriptor(insn, sfid, msg_length, response_length);
gen8_insn->bits3.gen7_atomic_op.msg_type = GEN75_P1_UNTYPED_ATOMIC_OP;
@@ -137,11 +130,26 @@ namespace gbe
gen8_insn->bits3.gen7_atomic_op.simd_mode = GEN_ATOMIC_SIMD16;
else
NOT_SUPPORTED;
+ return gen8_insn->bits3.ud;
}
- void Gen8Encoder::UNTYPED_READ(GenRegister dst, GenRegister src, uint32_t bti, uint32_t elemNum) {
+ void Gen8Encoder::ATOMIC(GenRegister dst, uint32_t function, GenRegister src, GenRegister bti, uint32_t srcNum) {
GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
- assert(elemNum >= 1 || elemNum <= 4);
+
+ this->setHeader(insn);
+ insn->header.destreg_or_condmod = GEN_SFID_DATAPORT1_DATA;
+
+ this->setDst(insn, GenRegister::uw16grf(dst.nr, 0));
+ this->setSrc0(insn, GenRegister::ud8grf(src.nr, 0));
+
+ if (bti.file == GEN_IMMEDIATE_VALUE) {
+ this->setSrc1(insn, GenRegister::immud(0));
+ setAtomicMessageDesc(insn, function, bti.value.ud, srcNum);
+ } else {
+ this->setSrc1(insn, bti);
+ }
+ }
+ unsigned Gen8Encoder::setUntypedReadMessageDesc(GenNativeInstruction *insn, unsigned bti, unsigned elemNum) {
uint32_t msg_length = 0;
uint32_t response_length = 0;
if (this->curr.execWidth == 8) {
@@ -152,44 +160,73 @@ namespace gbe
response_length = 2 * elemNum;
} else
NOT_IMPLEMENTED;
-
- this->setHeader(insn);
- this->setDst(insn, GenRegister::uw16grf(dst.nr, 0));
- this->setSrc0(insn, GenRegister::ud8grf(src.nr, 0));
- this->setSrc1(insn, GenRegister::immud(0));
setDPUntypedRW(insn,
bti,
untypedRWMask[elemNum],
GEN75_P1_UNTYPED_READ,
msg_length,
response_length);
+ return insn->bits3.ud;
}
- void Gen8Encoder::UNTYPED_WRITE(GenRegister msg, uint32_t bti, uint32_t elemNum) {
+ void Gen8Encoder::UNTYPED_READ(GenRegister dst, GenRegister src, GenRegister bti, uint32_t elemNum) {
GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
assert(elemNum >= 1 || elemNum <= 4);
+
+ this->setHeader(insn);
+ this->setDst(insn, GenRegister::uw16grf(dst.nr, 0));
+ this->setSrc0(insn, GenRegister::ud8grf(src.nr, 0));
+ this->setSrc1(insn, GenRegister::immud(0));
+ insn->header.destreg_or_condmod = GEN_SFID_DATAPORT1_DATA;
+
+ if (bti.file == GEN_IMMEDIATE_VALUE) {
+ this->setSrc1(insn, GenRegister::immud(0));
+ setUntypedReadMessageDesc(insn, bti.value.ud, elemNum);
+ } else {
+ this->setSrc1(insn, bti);
+ }
+ }
+
+ unsigned Gen8Encoder::setUntypedWriteMessageDesc(GenNativeInstruction *insn, unsigned bti, unsigned elemNum) {
uint32_t msg_length = 0;
uint32_t response_length = 0;
- this->setHeader(insn);
if (this->curr.execWidth == 8) {
- this->setDst(insn, GenRegister::retype(GenRegister::null(), GEN_TYPE_UD));
msg_length = 1 + elemNum;
} else if (this->curr.execWidth == 16) {
- this->setDst(insn, GenRegister::retype(GenRegister::null(), GEN_TYPE_UW));
msg_length = 2 * (1 + elemNum);
}
else
NOT_IMPLEMENTED;
- this->setSrc0(insn, GenRegister::ud8grf(msg.nr, 0));
- this->setSrc1(insn, GenRegister::immud(0));
setDPUntypedRW(insn,
bti,
untypedRWMask[elemNum],
GEN75_P1_UNTYPED_SURFACE_WRITE,
msg_length,
response_length);
+ return insn->bits3.ud;
}
+ void Gen8Encoder::UNTYPED_WRITE(GenRegister msg, GenRegister bti, uint32_t elemNum) {
+ GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
+ assert(elemNum >= 1 || elemNum <= 4);
+ this->setHeader(insn);
+ insn->header.destreg_or_condmod = GEN_SFID_DATAPORT_DATA;
+ if (this->curr.execWidth == 8) {
+ this->setDst(insn, GenRegister::retype(GenRegister::null(), GEN_TYPE_UD));
+ } else if (this->curr.execWidth == 16) {
+ this->setDst(insn, GenRegister::retype(GenRegister::null(), GEN_TYPE_UW));
+ }
+ else
+ NOT_IMPLEMENTED;
+ this->setSrc0(insn, GenRegister::ud8grf(msg.nr, 0));
+
+ if (bti.file == GEN_IMMEDIATE_VALUE) {
+ this->setSrc1(insn, GenRegister::immud(0));
+ setUntypedWriteMessageDesc(insn, bti.value.ud, elemNum);
+ } else {
+ this->setSrc1(insn, bti);
+ }
+ }
void Gen8Encoder::LOAD_DF_IMM(GenRegister dest, GenRegister tmp, double value) {
union { double d; unsigned u[2]; } u;
u.d = value;
diff --git a/backend/src/backend/gen8_encoder.hpp b/backend/src/backend/gen8_encoder.hpp
index 4c5e556..504e13d 100644
--- a/backend/src/backend/gen8_encoder.hpp
+++ b/backend/src/backend/gen8_encoder.hpp
@@ -49,9 +49,9 @@ namespace gbe
virtual void MOV_DF(GenRegister dest, GenRegister src0, GenRegister tmp = GenRegister::null());
virtual void LOAD_DF_IMM(GenRegister dest, GenRegister tmp, double value);
virtual void LOAD_INT64_IMM(GenRegister dest, GenRegister value);
- virtual void ATOMIC(GenRegister dst, uint32_t function, GenRegister src, uint32_t bti, uint32_t srcNum);
- virtual void UNTYPED_READ(GenRegister dst, GenRegister src, uint32_t bti, uint32_t elemNum);
- virtual void UNTYPED_WRITE(GenRegister src, uint32_t bti, uint32_t elemNum);
+ virtual void ATOMIC(GenRegister dst, uint32_t function, GenRegister src, GenRegister bti, uint32_t srcNum);
+ virtual void UNTYPED_READ(GenRegister dst, GenRegister src, GenRegister bti, uint32_t elemNum);
+ virtual void UNTYPED_WRITE(GenRegister src, GenRegister bti, uint32_t elemNum);
virtual void setHeader(GenNativeInstruction *insn);
virtual void setDPUntypedRW(GenNativeInstruction *insn, uint32_t bti, uint32_t rgba,
uint32_t msg_type, uint32_t msg_length, uint32_t response_length);
@@ -66,6 +66,9 @@ namespace gbe
GenRegister src0, GenRegister src1, GenRegister src2);
virtual bool canHandleLong(uint32_t opcode, GenRegister dst, GenRegister src0,
GenRegister src1 = GenRegister::null());
+ virtual unsigned setAtomicMessageDesc(GenNativeInstruction *insn, unsigned function, unsigned bti, unsigned srcNum);
+ virtual unsigned setUntypedReadMessageDesc(GenNativeInstruction *insn, unsigned bti, unsigned elemNum);
+ virtual unsigned setUntypedWriteMessageDesc(GenNativeInstruction *insn, unsigned bti, unsigned elemNum);
};
}
#endif /* __GBE_GEN8_ENCODER_HPP__ */
diff --git a/backend/src/backend/gen_context.cpp b/backend/src/backend/gen_context.cpp
index 94094fc..43d14d2 100644
--- a/backend/src/backend/gen_context.cpp
+++ b/backend/src/backend/gen_context.cpp
@@ -192,10 +192,10 @@ namespace gbe
GenRegister::ud8grf(ir::ocl::stackptr) :
GenRegister::ud16grf(ir::ocl::stackptr);
const GenRegister stackptr = ra->genReg(selStatckPtr);
- const GenRegister selStackBuffer = GenRegister::ud1grf(ir::ocl::stackbuffer);
- const GenRegister bufferptr = ra->genReg(selStackBuffer);
// We compute the per-lane stack pointer here
+ // threadId * perThreadSize + laneId*perLaneSize
+ // let private address start from zero
p->push();
p->curr.execWidth = 1;
p->curr.predicate = GEN_PREDICATE_NONE;
@@ -205,7 +205,6 @@ namespace gbe
p->curr.execWidth = 1;
p->SHL(GenRegister::ud1grf(126,0), GenRegister::ud1grf(126,0), GenRegister::immud(perThreadShift));
p->curr.execWidth = this->simdWidth;
- p->ADD(stackptr, stackptr, bufferptr);
p->ADD(stackptr, stackptr, GenRegister::ud1grf(126,0));
p->pop();
}
@@ -1721,9 +1720,25 @@ namespace gbe
const GenRegister src = ra->genReg(insn.src(0));
const GenRegister dst = ra->genReg(insn.dst(0));
const uint32_t function = insn.extra.function;
- const uint32_t bti = insn.getbti();
+ unsigned srcNum = insn.extra.elem;
+
+ const GenRegister bti = ra->genReg(insn.src(srcNum));
+
+ if (bti.file == GEN_IMMEDIATE_VALUE) {
+ p->ATOMIC(dst, function, src, bti, srcNum);
+ } else {
+ GenRegister flagTemp = ra->genReg(insn.dst(1));
+
+ unsigned desc = p->generateAtomicMessageDesc(function, 0, srcNum);
- p->ATOMIC(dst, function, src, bti, insn.srcNum);
+ unsigned jip0 = beforeMessage(insn, bti, flagTemp, desc);
+ p->push();
+ p->curr.predicate = GEN_PREDICATE_NORMAL;
+ p->curr.useFlag(insn.state.flag, insn.state.subFlag);
+ p->ATOMIC(dst, function, src, GenRegister::addr1(0), srcNum);
+ p->pop();
+ afterMessage(insn, bti, flagTemp, jip0);
+ }
}
void GenContext::emitIndirectMoveInstruction(const SelectionInstruction &insn) {
@@ -1855,48 +1870,188 @@ namespace gbe
}
void GenContext::emitRead64Instruction(const SelectionInstruction &insn) {
- const uint32_t elemNum = insn.extra.elem;
+ const uint32_t elemNum = insn.extra.elem * 2;
const GenRegister dst = ra->genReg(insn.dst(0));
const GenRegister src = ra->genReg(insn.src(0));
- const uint32_t bti = insn.getbti();
- p->UNTYPED_READ(dst, src, bti, elemNum*2);
+ const GenRegister bti = ra->genReg(insn.src(1));
+
+ if (bti.file == GEN_IMMEDIATE_VALUE) {
+ p->UNTYPED_READ(dst, src, bti, elemNum);
+ } else {
+ const GenRegister tmp = ra->genReg(insn.dst(elemNum));
+ unsigned desc = p->generateUntypedReadMessageDesc(0, elemNum);
+
+ unsigned jip0 = beforeMessage(insn, bti, tmp, desc);
+
+ //predicated load
+ p->push();
+ p->curr.predicate = GEN_PREDICATE_NORMAL;
+ p->curr.useFlag(insn.state.flag, insn.state.subFlag);
+ p->UNTYPED_READ(dst, src, GenRegister::retype(GenRegister::addr1(0), GEN_TYPE_UD), elemNum);
+ p->pop();
+ afterMessage(insn, bti, tmp, jip0);
+ }
+ }
+ unsigned GenContext::beforeMessage(const SelectionInstruction &insn, GenRegister bti, GenRegister tmp, unsigned desc) {
+ const GenRegister flagReg = GenRegister::flag(insn.state.flag, insn.state.subFlag);
+ setFlag(flagReg, GenRegister::immuw(0));
+ p->CMP(GEN_CONDITIONAL_NZ, flagReg, GenRegister::immuw(1));
+
+ GenRegister btiUD = ra->genReg(GenRegister::ud1grf(ir::ocl::btiUtil));
+ GenRegister btiUW = ra->genReg(GenRegister::uw1grf(ir::ocl::btiUtil));
+ GenRegister btiUB = ra->genReg(GenRegister::ub1grf(ir::ocl::btiUtil));
+ unsigned jip0 = p->n_instruction();
+ p->push();
+ p->curr.execWidth = 1;
+ p->curr.noMask = 1;
+ p->AND(btiUD, flagReg, GenRegister::immud(0xffffffff));
+ p->LZD(btiUD, btiUD);
+ p->ADD(btiUW, GenRegister::negate(btiUW), GenRegister::immuw(0x1f));
+ p->MUL(btiUW, btiUW, GenRegister::immuw(0x4));
+ p->ADD(GenRegister::addr1(0), btiUW, GenRegister::immud(bti.nr*32));
+ p->MOV(btiUD, GenRegister::indirect(GEN_TYPE_UD, 0, GEN_WIDTH_1, GEN_VERTICAL_STRIDE_ONE_DIMENSIONAL, GEN_HORIZONTAL_STRIDE_0));
+ //save flag
+ p->MOV(tmp, flagReg);
+ p->pop();
+
+ p->CMP(GEN_CONDITIONAL_Z, bti, btiUD);
+ p->push();
+ p->curr.execWidth = 1;
+ p->curr.noMask = 1;
+ p->OR(GenRegister::retype(GenRegister::addr1(0), GEN_TYPE_UD), btiUB, GenRegister::immud(desc));
+ p->pop();
+ return jip0;
+ }
+ void GenContext::afterMessage(const SelectionInstruction &insn, GenRegister bti, GenRegister tmp, unsigned jip0) {
+ const GenRegister btiUD = ra->genReg(GenRegister::ud1grf(ir::ocl::btiUtil));
+ //restore flag
+ setFlag(GenRegister::flag(insn.state.flag, insn.state.subFlag), tmp);
+ // get active channel
+ p->push();
+ p->curr.predicate = GEN_PREDICATE_NORMAL;
+ p->curr.useFlag(insn.state.flag, insn.state.subFlag);
+ p->CMP(GEN_CONDITIONAL_NZ, bti, btiUD);
+ unsigned jip1 = p->n_instruction();
+ p->WHILE(GenRegister::immud(0));
+ p->pop();
+ p->patchJMPI(jip1, jip0 - jip1, 0);
}
void GenContext::emitUntypedReadInstruction(const SelectionInstruction &insn) {
const GenRegister dst = ra->genReg(insn.dst(0));
const GenRegister src = ra->genReg(insn.src(0));
- const uint32_t bti = insn.getbti();
+ const GenRegister bti = ra->genReg(insn.src(1)); // BTI is carried as the second source operand (immediate or register)
+
const uint32_t elemNum = insn.extra.elem;
- p->UNTYPED_READ(dst, src, bti, elemNum);
+ if (bti.file == GEN_IMMEDIATE_VALUE) {
+ p->UNTYPED_READ(dst, src, bti, elemNum); // constant BTI: a single send with an immediate descriptor
+ } else {
+ const GenRegister tmp = ra->genReg(insn.dst(elemNum)); // flag save slot (flagTemp) placed after the elemNum destinations
+ unsigned desc = p->generateUntypedReadMessageDesc(0, elemNum); // built with BTI 0; beforeMessage() ORs the per-iteration BTI in
+
+ unsigned jip0 = beforeMessage(insn, bti, tmp, desc); // loop head; leaves the final descriptor in a0
+
+ //predicated load
+ p->push();
+ p->curr.predicate = GEN_PREDICATE_NORMAL;
+ p->curr.useFlag(insn.state.flag, insn.state.subFlag);
+ p->UNTYPED_READ(dst, src, GenRegister::retype(GenRegister::addr1(0), GEN_TYPE_UD), elemNum); // descriptor taken from a0 instead of an immediate
+ p->pop();
+ afterMessage(insn, bti, tmp, jip0); // loop tail: WHILE back to jip0
+ }
}
void GenContext::emitWrite64Instruction(const SelectionInstruction &insn) {
- const GenRegister src = ra->genReg(insn.dst(0));
+ const GenRegister src = ra->genReg(insn.src(0)); // payload starts at the address source; dst(0) is flagTemp when the BTI is a register, so it must not be used as the payload
const uint32_t elemNum = insn.extra.elem;
- const uint32_t bti = insn.getbti();
- p->UNTYPED_WRITE(src, bti, elemNum*2);
+ const GenRegister bti = ra->genReg(insn.src(elemNum+1)); // BTI operand follows the address and the elemNum data sources
+
+ if (bti.file == GEN_IMMEDIATE_VALUE) {
+ p->UNTYPED_WRITE(src, bti, elemNum*2); // constant BTI: a single send with an immediate descriptor
+ } else {
+ const GenRegister tmp = ra->genReg(insn.dst(0)); // flag save slot (flagTemp) allocated by the selection
+ unsigned desc = p->generateUntypedWriteMessageDesc(0, elemNum*2); // built with BTI 0; beforeMessage() ORs the per-iteration BTI in
+
+ unsigned jip0 = beforeMessage(insn, bti, tmp, desc); // loop head; leaves the final descriptor in a0
+
+ //predicated write
+ p->push();
+ p->curr.predicate = GEN_PREDICATE_NORMAL;
+ p->curr.useFlag(insn.state.flag, insn.state.subFlag);
+ p->UNTYPED_WRITE(src, GenRegister::addr1(0), elemNum*2); // descriptor taken from a0 instead of an immediate
+ p->pop();
+ afterMessage(insn, bti, tmp, jip0); // loop tail: WHILE back to jip0
+ }
}
void GenContext::emitUntypedWriteInstruction(const SelectionInstruction &insn) {
const GenRegister src = ra->genReg(insn.src(0));
- const uint32_t bti = insn.getbti();
const uint32_t elemNum = insn.extra.elem;
- p->UNTYPED_WRITE(src, bti, elemNum);
+ const GenRegister bti = ra->genReg(insn.src(elemNum+1)); // BTI operand follows the address and the elemNum data sources
+ if (bti.file == GEN_IMMEDIATE_VALUE) {
+ p->UNTYPED_WRITE(src, bti, elemNum); // constant BTI: a single send with an immediate descriptor
+ } else {
+ const GenRegister tmp = ra->genReg(insn.dst(0)); // flag save slot (flagTemp) allocated by the selection
+ unsigned desc = p->generateUntypedWriteMessageDesc(0, elemNum); // built with BTI 0; beforeMessage() ORs the per-iteration BTI in
+
+ unsigned jip0 = beforeMessage(insn, bti, tmp, desc); // loop head; leaves the final descriptor in a0
+
+ //predicated write
+ p->push();
+ p->curr.predicate = GEN_PREDICATE_NORMAL;
+ p->curr.useFlag(insn.state.flag, insn.state.subFlag);
+ p->UNTYPED_WRITE(src, GenRegister::addr1(0), elemNum); // descriptor taken from a0 instead of an immediate
+ p->pop();
+ afterMessage(insn, bti, tmp, jip0); // loop tail: WHILE back to jip0
+ }
}
void GenContext::emitByteGatherInstruction(const SelectionInstruction &insn) {
const GenRegister dst = ra->genReg(insn.dst(0));
const GenRegister src = ra->genReg(insn.src(0));
- const uint32_t bti = insn.getbti();
+ const GenRegister bti = ra->genReg(insn.src(1)); // BTI is carried as the second source operand (immediate or register)
const uint32_t elemSize = insn.extra.elem;
- p->BYTE_GATHER(dst, src, bti, elemSize);
+
+ if (bti.file == GEN_IMMEDIATE_VALUE) {
+ p->BYTE_GATHER(dst, src, bti, elemSize); // constant BTI: a single send with an immediate descriptor
+ } else {
+ const GenRegister tmp = ra->genReg(insn.dst(1)); // flag save slot (flagTemp) placed after the single destination
+ unsigned desc = p->generateByteGatherMessageDesc(0, elemSize); // built with BTI 0; beforeMessage() ORs the per-iteration BTI in
+
+ unsigned jip0 = beforeMessage(insn, bti, tmp, desc); // loop head; leaves the final descriptor in a0
+
+ //predicated load
+ p->push();
+ p->curr.predicate = GEN_PREDICATE_NORMAL;
+ p->curr.useFlag(insn.state.flag, insn.state.subFlag);
+ p->BYTE_GATHER(dst, src, GenRegister::addr1(0), elemSize); // descriptor taken from a0 instead of an immediate
+ p->pop();
+ afterMessage(insn, bti, tmp, jip0); // loop tail: WHILE back to jip0
+ }
}
void GenContext::emitByteScatterInstruction(const SelectionInstruction &insn) {
const GenRegister src = ra->genReg(insn.src(0));
- const uint32_t bti = insn.getbti();
const uint32_t elemSize = insn.extra.elem;
- p->BYTE_SCATTER(src, bti, elemSize);
+ const GenRegister bti = ra->genReg(insn.src(2)); // BTI operand follows the address and the data source
+
+ if (bti.file == GEN_IMMEDIATE_VALUE) {
+ p->BYTE_SCATTER(src, bti, elemSize); // constant BTI: a single send with an immediate descriptor
+ } else {
+ const GenRegister tmp = ra->genReg(insn.dst(0)); // flag save slot (flagTemp) allocated by the selection
+ unsigned desc = p->generateByteScatterMessageDesc(0, elemSize); // built with BTI 0; beforeMessage() ORs the per-iteration BTI in
+
+ unsigned jip0 = beforeMessage(insn, bti, tmp, desc); // loop head; leaves the final descriptor in a0
+
+ //predicated write
+ p->push();
+ p->curr.predicate = GEN_PREDICATE_NORMAL;
+ p->curr.useFlag(insn.state.flag, insn.state.subFlag);
+ p->BYTE_SCATTER(src, GenRegister::addr1(0), elemSize); // descriptor taken from a0 instead of an immediate
+ p->pop();
+ afterMessage(insn, bti, tmp, jip0); // loop tail: WHILE back to jip0
+ }
+
}
void GenContext::emitUnpackByteInstruction(const SelectionInstruction &insn) {
@@ -2032,6 +2187,7 @@ namespace gbe
allocCurbeReg(lid2, GBE_CURBE_LOCAL_ID_Z);
allocCurbeReg(zero, GBE_CURBE_ZERO);
allocCurbeReg(one, GBE_CURBE_ONE);
+ allocCurbeReg(btiUtil, GBE_CURBE_BTI_UTIL);
if (stackUse.size() != 0)
allocCurbeReg(stackbuffer, GBE_CURBE_EXTRA_ARGUMENT, GBE_STACK_BUFFER);
// Go over the arguments and find the related patch locations
diff --git a/backend/src/backend/gen_context.hpp b/backend/src/backend/gen_context.hpp
index 560248a..a85657c 100644
--- a/backend/src/backend/gen_context.hpp
+++ b/backend/src/backend/gen_context.hpp
@@ -169,6 +169,8 @@ namespace gbe
virtual void emitI64DIVREMInstruction(const SelectionInstruction &insn);
void scratchWrite(const GenRegister header, uint32_t offset, uint32_t reg_num, uint32_t reg_type, uint32_t channel_mode);
void scratchRead(const GenRegister dst, const GenRegister header, uint32_t offset, uint32_t reg_num, uint32_t reg_type, uint32_t channel_mode);
+ unsigned beforeMessage(const SelectionInstruction &insn, GenRegister bti, GenRegister flagTemp, unsigned desc);
+ void afterMessage(const SelectionInstruction &insn, GenRegister bti, GenRegister flagTemp, unsigned jip0);
/*! Implements base class */
virtual Kernel *allocateKernel(void);
diff --git a/backend/src/backend/gen_encoder.cpp b/backend/src/backend/gen_encoder.cpp
index 5aa8c5c..cac29e8 100644
--- a/backend/src/backend/gen_encoder.cpp
+++ b/backend/src/backend/gen_encoder.cpp
@@ -329,10 +329,13 @@ namespace gbe
GEN_UNTYPED_ALPHA,
0
};
+ unsigned GenEncoder::generateUntypedReadMessageDesc(unsigned bti, unsigned elemNum) { // Returns the untyped-read message descriptor as a plain value, without emitting an instruction
+ GenNativeInstruction insn; // local scratch instruction, only used to hold the descriptor bits
+ memset(&insn, 0, sizeof(GenNativeInstruction)); // zero it so unrelated fields do not leak into the result
+ return setUntypedReadMessageDesc(&insn, bti, elemNum); // helper fills the descriptor and returns insn.bits3.ud
+ }
- void GenEncoder::UNTYPED_READ(GenRegister dst, GenRegister src, uint32_t bti, uint32_t elemNum) {
- GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
- assert(elemNum >= 1 || elemNum <= 4);
+ unsigned GenEncoder::setUntypedReadMessageDesc(GenNativeInstruction *insn, unsigned bti, unsigned elemNum) {
uint32_t msg_length = 0;
uint32_t response_length = 0;
if (this->curr.execWidth == 8) {
@@ -340,49 +343,88 @@ namespace gbe
response_length = elemNum;
} else if (this->curr.execWidth == 16) {
msg_length = 2;
- response_length = 2*elemNum;
+ response_length = 2 * elemNum;
} else
NOT_IMPLEMENTED;
-
- this->setHeader(insn);
- this->setDst(insn, GenRegister::uw16grf(dst.nr, 0));
- this->setSrc0(insn, GenRegister::ud8grf(src.nr, 0));
- this->setSrc1(insn, GenRegister::immud(0));
setDPUntypedRW(insn,
bti,
untypedRWMask[elemNum],
GEN7_UNTYPED_READ,
msg_length,
response_length);
+ return insn->bits3.ud;
}
- void GenEncoder::UNTYPED_WRITE(GenRegister msg, uint32_t bti, uint32_t elemNum) {
+ void GenEncoder::UNTYPED_READ(GenRegister dst, GenRegister src, GenRegister bti, uint32_t elemNum) { // Emits an untyped-read send; bti is either an immediate BTI or a register that already holds the whole descriptor
GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
- assert(elemNum >= 1 || elemNum <= 4);
+ assert(elemNum >= 1 && elemNum <= 4); // both bounds must hold; the former '||' form was always true
+
+ this->setHeader(insn);
+ this->setDst(insn, GenRegister::uw16grf(dst.nr, 0));
+ this->setSrc0(insn, GenRegister::ud8grf(src.nr, 0));
+ insn->header.destreg_or_condmod = GEN_SFID_DATAPORT_DATA; // set here for both paths, since the register path never calls the descriptor helper
+
+ if (bti.file == GEN_IMMEDIATE_VALUE) {
+ this->setSrc1(insn, GenRegister::immud(0));
+ setUntypedReadMessageDesc(insn, bti.value.ud, elemNum); // immediate descriptor encoded with the constant BTI
+ } else {
+ this->setSrc1(insn, bti); // descriptor supplied by the register operand (GenContext passes a0)
+ }
+ }
+
+ unsigned GenEncoder::generateUntypedWriteMessageDesc(unsigned bti, unsigned elemNum) { // Returns the untyped-write message descriptor as a plain value, without emitting an instruction
+ GenNativeInstruction insn; // local scratch instruction, only used to hold the descriptor bits
+ memset(&insn, 0, sizeof(GenNativeInstruction)); // zero it so unrelated fields do not leak into the result
+ return setUntypedWriteMessageDesc(&insn, bti, elemNum); // helper fills the descriptor and returns insn.bits3.ud
+ }
+
+ unsigned GenEncoder::setUntypedWriteMessageDesc(GenNativeInstruction *insn, unsigned bti, unsigned elemNum) { // Fills the untyped-write descriptor of insn for the current exec width and returns the descriptor dword
uint32_t msg_length = 0;
uint32_t response_length = 0;
- this->setHeader(insn);
if (this->curr.execWidth == 8) {
- this->setDst(insn, GenRegister::retype(GenRegister::null(), GEN_TYPE_UD));
- msg_length = 1+elemNum;
+ msg_length = 1 + elemNum; // execWidth 8: message length is 1 + elemNum; header/dst/src setup now lives in UNTYPED_WRITE
} else if (this->curr.execWidth == 16) {
- this->setDst(insn, GenRegister::retype(GenRegister::null(), GEN_TYPE_UW));
- msg_length = 2*(1+elemNum);
+ msg_length = 2 * (1 + elemNum); // execWidth 16: twice the execWidth 8 length
}
else
NOT_IMPLEMENTED;
- this->setSrc0(insn, GenRegister::ud8grf(msg.nr, 0));
- this->setSrc1(insn, GenRegister::immud(0));
setDPUntypedRW(insn,
bti,
untypedRWMask[elemNum],
GEN7_UNTYPED_WRITE,
msg_length,
response_length);
+ return insn->bits3.ud; // bits3 dword of insn after setDPUntypedRW has filled it
}
- void GenEncoder::BYTE_GATHER(GenRegister dst, GenRegister src, uint32_t bti, uint32_t elemSize) {
+ void GenEncoder::UNTYPED_WRITE(GenRegister msg, GenRegister bti, uint32_t elemNum) { // Emits an untyped-write send; bti is either an immediate BTI or a register that already holds the whole descriptor
GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
+ assert(elemNum >= 1 && elemNum <= 4); // both bounds must hold; an '||' here would always be true
+ this->setHeader(insn);
+ if (this->curr.execWidth == 8) {
+ this->setDst(insn, GenRegister::retype(GenRegister::null(), GEN_TYPE_UD)); // null destination, typed UD for execWidth 8
+ } else if (this->curr.execWidth == 16) {
+ this->setDst(insn, GenRegister::retype(GenRegister::null(), GEN_TYPE_UW)); // null destination, typed UW for execWidth 16
+ }
+ else
+ NOT_IMPLEMENTED;
+ this->setSrc0(insn, GenRegister::ud8grf(msg.nr, 0));
+ insn->header.destreg_or_condmod = GEN_SFID_DATAPORT_DATA; // set here for both paths, since the register path never calls the descriptor helper
+ if (bti.file == GEN_IMMEDIATE_VALUE) {
+ this->setSrc1(insn, GenRegister::immud(0));
+ setUntypedWriteMessageDesc(insn, bti.value.ud, elemNum); // immediate descriptor encoded with the constant BTI
+ } else {
+ this->setSrc1(insn, bti); // descriptor supplied by the register operand (GenContext passes a0)
+ }
+ }
+
+ unsigned GenEncoder::generateByteGatherMessageDesc(unsigned bti, unsigned elemSize) { // Returns the byte-gather message descriptor as a plain value, without emitting an instruction
+ GenNativeInstruction insn; // local scratch instruction, only used to hold the descriptor bits
+ memset(&insn, 0, sizeof(GenNativeInstruction)); // zero it so unrelated fields do not leak into the result
+ return setByteGatherMessageDesc(&insn, bti, elemSize); // helper fills the descriptor and returns insn.bits3.ud
+ }
+
+ unsigned GenEncoder::setByteGatherMessageDesc(GenNativeInstruction *insn, unsigned bti, unsigned elemSize) {
uint32_t msg_length = 0;
uint32_t response_length = 0;
if (this->curr.execWidth == 8) {
@@ -393,11 +435,6 @@ namespace gbe
response_length = 2;
} else
NOT_IMPLEMENTED;
-
- this->setHeader(insn);
- this->setDst(insn, GenRegister::uw16grf(dst.nr, 0));
- this->setSrc0(insn, GenRegister::ud8grf(src.nr, 0));
- this->setSrc1(insn, GenRegister::immud(0));
setDPByteScatterGather(this,
insn,
bti,
@@ -405,23 +442,42 @@ namespace gbe
GEN7_BYTE_GATHER,
msg_length,
response_length);
+ return insn->bits3.ud;
+
}
- void GenEncoder::BYTE_SCATTER(GenRegister msg, uint32_t bti, uint32_t elemSize) {
+ void GenEncoder::BYTE_GATHER(GenRegister dst, GenRegister src, GenRegister bti, uint32_t elemSize) { // Emits a byte-gather send; bti is either an immediate BTI or a register that already holds the whole descriptor
GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
+ this->setHeader(insn);
+ insn->header.destreg_or_condmod = GEN_SFID_DATAPORT_DATA; // set here for both paths, since the register path never calls the descriptor helper
+
+ this->setDst(insn, GenRegister::uw16grf(dst.nr, 0));
+ this->setSrc0(insn, GenRegister::ud8grf(src.nr, 0));
+
+ if (bti.file == GEN_IMMEDIATE_VALUE) {
+ this->setSrc1(insn, GenRegister::immud(0));
+ setByteGatherMessageDesc(insn, bti.value.ud, elemSize); // immediate descriptor encoded with the constant BTI
+ } else {
+ this->setSrc1(insn, bti); // descriptor supplied by the register operand (GenContext passes a0)
+ }
+ }
+
+ unsigned GenEncoder::generateByteScatterMessageDesc(unsigned bti, unsigned elemSize) { // Returns the byte-scatter message descriptor as a plain value, without emitting an instruction
+ GenNativeInstruction insn; // local scratch instruction, only used to hold the descriptor bits
+ memset(&insn, 0, sizeof(GenNativeInstruction)); // zero it so unrelated fields do not leak into the result
+ return setByteScatterMessageDesc(&insn, bti, elemSize); // helper fills the descriptor and returns insn.bits3.ud
+ }
+
+ unsigned GenEncoder::setByteScatterMessageDesc(GenNativeInstruction *insn, unsigned bti, unsigned elemSize) {
uint32_t msg_length = 0;
uint32_t response_length = 0;
- this->setHeader(insn);
if (this->curr.execWidth == 8) {
- this->setDst(insn, GenRegister::retype(GenRegister::null(), GEN_TYPE_UD));
msg_length = 2;
} else if (this->curr.execWidth == 16) {
- this->setDst(insn, GenRegister::retype(GenRegister::null(), GEN_TYPE_UW));
msg_length = 4;
} else
NOT_IMPLEMENTED;
- this->setSrc0(insn, GenRegister::ud8grf(msg.nr, 0));
- this->setSrc1(insn, GenRegister::immud(0));
+
setDPByteScatterGather(this,
insn,
bti,
@@ -429,6 +485,30 @@ namespace gbe
GEN7_BYTE_SCATTER,
msg_length,
response_length);
+ return insn->bits3.ud;
+ }
+
+ void GenEncoder::BYTE_SCATTER(GenRegister msg, GenRegister bti, uint32_t elemSize) { // Emits a byte-scatter send; bti is either an immediate BTI or a register that already holds the whole descriptor
+ GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
+
+ this->setHeader(insn);
+ insn->header.destreg_or_condmod = GEN_SFID_DATAPORT_DATA; // set here for both paths, since the register path never calls the descriptor helper
+
+ if (this->curr.execWidth == 8) {
+ this->setDst(insn, GenRegister::retype(GenRegister::null(), GEN_TYPE_UD)); // null destination, typed UD for execWidth 8
+ } else if (this->curr.execWidth == 16) {
+ this->setDst(insn, GenRegister::retype(GenRegister::null(), GEN_TYPE_UW)); // null destination, typed UW for execWidth 16
+ } else
+ NOT_IMPLEMENTED;
+
+ this->setSrc0(insn, GenRegister::ud8grf(msg.nr, 0));
+
+ if (bti.file == GEN_IMMEDIATE_VALUE) {
+ this->setSrc1(insn, GenRegister::immud(0));
+ setByteScatterMessageDesc(insn, bti.value.ud, elemSize); // immediate descriptor encoded with the constant BTI
+ } else {
+ this->setSrc1(insn, bti); // descriptor supplied by the register operand (GenContext passes a0)
+ }
}
void GenEncoder::DWORD_GATHER(GenRegister dst, GenRegister src, uint32_t bti) {
@@ -461,8 +541,13 @@ namespace gbe
}
- void GenEncoder::ATOMIC(GenRegister dst, uint32_t function, GenRegister src, uint32_t bti, uint32_t srcNum) {
- GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
+ unsigned GenEncoder::generateAtomicMessageDesc(unsigned function, unsigned bti, unsigned srcNum) { // Returns the atomic message descriptor as a plain value, without emitting an instruction
+ GenNativeInstruction insn; // local scratch instruction, only used to hold the descriptor bits
+ memset(&insn, 0, sizeof(GenNativeInstruction)); // zero it so unrelated fields do not leak into the result
+ return setAtomicMessageDesc(&insn, function, bti, srcNum); // helper fills the descriptor and returns insn.bits3.ud
+ }
+
+ unsigned GenEncoder::setAtomicMessageDesc(GenNativeInstruction *insn, unsigned function, unsigned bti, unsigned srcNum) {
uint32_t msg_length = 0;
uint32_t response_length = 0;
@@ -470,16 +555,11 @@ namespace gbe
msg_length = srcNum;
response_length = 1;
} else if (this->curr.execWidth == 16) {
- msg_length = 2*srcNum;
+ msg_length = 2 * srcNum;
response_length = 2;
} else
NOT_IMPLEMENTED;
- this->setHeader(insn);
- this->setDst(insn, GenRegister::uw16grf(dst.nr, 0));
- this->setSrc0(insn, GenRegister::ud8grf(src.nr, 0));
- this->setSrc1(insn, GenRegister::immud(0));
-
const GenMessageTarget sfid = GEN_SFID_DATAPORT_DATA;
setMessageDescriptor(insn, sfid, msg_length, response_length);
insn->bits3.gen7_atomic_op.msg_type = GEN7_UNTYPED_ATOMIC_READ;
@@ -493,7 +573,23 @@ namespace gbe
insn->bits3.gen7_atomic_op.simd_mode = GEN_ATOMIC_SIMD16;
else
NOT_SUPPORTED;
+ return insn->bits3.ud;
+ }
+
+ void GenEncoder::ATOMIC(GenRegister dst, uint32_t function, GenRegister src, GenRegister bti, uint32_t srcNum) { // Emits an atomic send; bti is either an immediate BTI or a register operand used as src1
+ GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
+ this->setHeader(insn);
+ insn->header.destreg_or_condmod = GEN_SFID_DATAPORT_DATA; // set here for both paths, since the register path never calls the descriptor helper
+
+ this->setDst(insn, GenRegister::uw16grf(dst.nr, 0));
+ this->setSrc0(insn, GenRegister::ud8grf(src.nr, 0));
+ if (bti.file == GEN_IMMEDIATE_VALUE) {
+ this->setSrc1(insn, GenRegister::immud(0));
+ setAtomicMessageDesc(insn, function, bti.value.ud, srcNum); // immediate descriptor encoded with the constant BTI
+ } else {
+ this->setSrc1(insn, bti); // register operand used as src1; NOTE(review): presumably it already holds the full descriptor -- confirm at the call site
+ }
}
GenCompactInstruction *GenEncoder::nextCompact(uint32_t opcode) {
GenCompactInstruction insn;
@@ -893,6 +989,8 @@ namespace gbe
ALU2_BRA(BRD)
ALU2_BRA(BRC)
+ // jip is the distance between jump instruction and jump-target. we have handled
+ // pre/post-increment in patchJMPI() function body
void GenEncoder::patchJMPI(uint32_t insnID, int32_t jip, int32_t uip) {
GenNativeInstruction &insn = *(GenNativeInstruction *)&this->store[insnID];
GBE_ASSERT(insnID < this->store.size());
diff --git a/backend/src/backend/gen_encoder.hpp b/backend/src/backend/gen_encoder.hpp
index 21faabc..79e7b6e 100644
--- a/backend/src/backend/gen_encoder.hpp
+++ b/backend/src/backend/gen_encoder.hpp
@@ -169,15 +169,15 @@ namespace gbe
/*! Wait instruction (used for the barrier) */
void WAIT(void);
/*! Atomic instructions */
- virtual void ATOMIC(GenRegister dst, uint32_t function, GenRegister src, uint32_t bti, uint32_t srcNum);
+ virtual void ATOMIC(GenRegister dst, uint32_t function, GenRegister src, GenRegister bti, uint32_t srcNum);
/*! Untyped read (upto 4 channels) */
- virtual void UNTYPED_READ(GenRegister dst, GenRegister src, uint32_t bti, uint32_t elemNum);
+ virtual void UNTYPED_READ(GenRegister dst, GenRegister src, GenRegister bti, uint32_t elemNum);
/*! Untyped write (upto 4 channels) */
- virtual void UNTYPED_WRITE(GenRegister src, uint32_t bti, uint32_t elemNum);
+ virtual void UNTYPED_WRITE(GenRegister src, GenRegister bti, uint32_t elemNum);
/*! Byte gather (for unaligned bytes, shorts and ints) */
- void BYTE_GATHER(GenRegister dst, GenRegister src, uint32_t bti, uint32_t elemSize);
+ void BYTE_GATHER(GenRegister dst, GenRegister src, GenRegister bti, uint32_t elemSize);
/*! Byte scatter (for unaligned bytes, shorts and ints) */
- void BYTE_SCATTER(GenRegister src, uint32_t bti, uint32_t elemSize);
+ void BYTE_SCATTER(GenRegister src, GenRegister bti, uint32_t elemSize);
/*! DWord gather (for constant cache read) */
void DWORD_GATHER(GenRegister dst, GenRegister src, uint32_t bti);
/*! for scratch memory read */
@@ -230,6 +230,18 @@ namespace gbe
void setMessageDescriptor(GenNativeInstruction *inst, enum GenMessageTarget sfid,
unsigned msg_length, unsigned response_length,
bool header_present = false, bool end_of_thread = false);
+ virtual unsigned setAtomicMessageDesc(GenNativeInstruction *insn, unsigned function, unsigned bti, unsigned srcNum);
+ virtual unsigned setUntypedReadMessageDesc(GenNativeInstruction *insn, unsigned bti, unsigned elemNum);
+ virtual unsigned setUntypedWriteMessageDesc(GenNativeInstruction *insn, unsigned bti, unsigned elemNum);
+ unsigned setByteGatherMessageDesc(GenNativeInstruction *insn, unsigned bti, unsigned elemSize);
+ unsigned setByteScatterMessageDesc(GenNativeInstruction *insn, unsigned bti, unsigned elemSize);
+
+ unsigned generateAtomicMessageDesc(unsigned function, unsigned bti, unsigned srcNum);
+ unsigned generateUntypedReadMessageDesc(unsigned bti, unsigned elemNum);
+ unsigned generateUntypedWriteMessageDesc(unsigned bti, unsigned elemNum);
+ unsigned generateByteGatherMessageDesc(unsigned bti, unsigned elemSize);
+ unsigned generateByteScatterMessageDesc(unsigned bti, unsigned elemSize);
+
virtual void setHeader(GenNativeInstruction *insn) = 0;
virtual void setDst(GenNativeInstruction *insn, GenRegister dest) = 0;
virtual void setSrc0(GenNativeInstruction *insn, GenRegister reg) = 0;
diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp
index 7d4ea00..a68d0ce 100644
--- a/backend/src/backend/gen_insn_selection.cpp
+++ b/backend/src/backend/gen_insn_selection.cpp
@@ -598,19 +598,19 @@ namespace gbe
/*! Wait instruction (used for the barrier) */
void WAIT(void);
/*! Atomic instruction */
- void ATOMIC(Reg dst, uint32_t function, uint32_t srcNum, Reg src0, Reg src1, Reg src2, uint32_t bti);
+ void ATOMIC(Reg dst, uint32_t function, uint32_t srcNum, Reg src0, Reg src1, Reg src2, GenRegister bti, GenRegister *flagTemp);
/*! Read 64 bits float/int array */
- void READ64(Reg addr, const GenRegister *dst, const GenRegister *tmp, uint32_t elemNum, uint32_t bti, bool native_long);
+ void READ64(Reg addr, const GenRegister *dst, const GenRegister *tmp, uint32_t elemNum, const GenRegister bti, bool native_long, GenRegister *flagTemp);
/*! Write 64 bits float/int array */
- void WRITE64(Reg addr, const GenRegister *src, const GenRegister *tmp, uint32_t srcNum, uint32_t bti, bool native_long);
+ void WRITE64(Reg addr, const GenRegister *src, const GenRegister *tmp, uint32_t srcNum, GenRegister bti, bool native_long, GenRegister *flagTemp);
/*! Untyped read (up to 4 elements) */
- void UNTYPED_READ(Reg addr, const GenRegister *dst, uint32_t elemNum, uint32_t bti);
+ void UNTYPED_READ(Reg addr, const GenRegister *dst, uint32_t elemNum, GenRegister bti, GenRegister *flagTemp);
/*! Untyped write (up to 4 elements) */
- void UNTYPED_WRITE(Reg addr, const GenRegister *src, uint32_t elemNum, uint32_t bti);
+ void UNTYPED_WRITE(Reg addr, const GenRegister *src, uint32_t elemNum, GenRegister bti, GenRegister *flagTemp);
/*! Byte gather (for unaligned bytes, shorts and ints) */
- void BYTE_GATHER(Reg dst, Reg addr, uint32_t elemSize, uint32_t bti);
+ void BYTE_GATHER(Reg dst, Reg addr, uint32_t elemSize, GenRegister bti, GenRegister *flagTemp);
/*! Byte scatter (for unaligned bytes, shorts and ints) */
- void BYTE_SCATTER(Reg addr, Reg src, uint32_t elemSize, uint32_t bti);
+ void BYTE_SCATTER(Reg addr, Reg src, uint32_t elemSize, GenRegister bti, GenRegister *flagTemp);
/*! DWord scatter (for constant cache read) */
void DWORD_GATHER(Reg dst, Reg addr, uint32_t bti);
/*! Unpack the uint to charN */
@@ -1204,16 +1204,26 @@ namespace gbe
void Selection::Opaque::ATOMIC(Reg dst, uint32_t function,
uint32_t srcNum, Reg src0,
- Reg src1, Reg src2, uint32_t bti) {
- SelectionInstruction *insn = this->appendInsn(SEL_OP_ATOMIC, 1, srcNum);
+ Reg src1, Reg src2, GenRegister bti, GenRegister *flagTemp) {
+ unsigned dstNum = flagTemp == NULL ? 1 : 2;
+ SelectionInstruction *insn = this->appendInsn(SEL_OP_ATOMIC, dstNum, srcNum + 1);
+
+ if (bti.file != GEN_IMMEDIATE_VALUE) {
+ insn->state.flag = 0;
+ insn->state.subFlag = 1;
+ }
+
insn->dst(0) = dst;
+ if(flagTemp) insn->dst(1) = *flagTemp;
+
insn->src(0) = src0;
if(srcNum > 1) insn->src(1) = src1;
if(srcNum > 2) insn->src(2) = src2;
+ insn->src(srcNum) = bti;
insn->extra.function = function;
- insn->setbti(bti);
- SelectionVector *vector = this->appendVector();
+ insn->extra.elem = srcNum;
+ SelectionVector *vector = this->appendVector();
vector->regNum = srcNum;
vector->reg = &insn->src(0);
vector->isSrc = 1;
@@ -1227,22 +1237,29 @@ namespace gbe
const GenRegister *dst,
const GenRegister *tmp,
uint32_t elemNum,
- uint32_t bti,
- bool native_long)
+ const GenRegister bti,
+ bool native_long,
+ GenRegister *flagTemp)
{
SelectionInstruction *insn = NULL;
SelectionVector *srcVector = NULL;
SelectionVector *dstVector = NULL;
if (!native_long) {
- insn = this->appendInsn(SEL_OP_READ64, elemNum, 1);
+ unsigned dstNum = flagTemp == NULL ? elemNum : elemNum+1;
+ insn = this->appendInsn(SEL_OP_READ64, dstNum, 2);
srcVector = this->appendVector();
dstVector = this->appendVector();
// Regular instruction to encode
for (uint32_t elemID = 0; elemID < elemNum; ++elemID)
insn->dst(elemID) = dst[elemID];
+
+ // flagTemp don't need to be put in SelectionVector
+ if (flagTemp)
+ insn->dst(elemNum) = *flagTemp;
} else {
- insn = this->appendInsn(SEL_OP_READ64, elemNum*2, 1);
+ unsigned dstNum = flagTemp == NULL ? elemNum*2 : elemNum*2+1;
+ insn = this->appendInsn(SEL_OP_READ64, dstNum, 2);
srcVector = this->appendVector();
dstVector = this->appendVector();
@@ -1251,10 +1268,20 @@ namespace gbe
for (uint32_t elemID = 0; elemID < elemNum; ++elemID)
insn->dst(elemID + elemNum) = dst[elemID];
+
+ // flagTemp don't need to be put in SelectionVector
+ if (flagTemp)
+ insn->dst(2*elemNum) = *flagTemp;
+ }
+
+ if (bti.file != GEN_IMMEDIATE_VALUE) {
+ insn->state.flag = 0;
+ insn->state.subFlag = 1;
}
insn->src(0) = addr;
- insn->setbti(bti);
+ insn->src(1) = bti;
+
insn->extra.elem = elemNum;
dstVector->regNum = elemNum;
@@ -1269,9 +1296,11 @@ namespace gbe
void Selection::Opaque::UNTYPED_READ(Reg addr,
const GenRegister *dst,
uint32_t elemNum,
- uint32_t bti)
+ GenRegister bti,
+ GenRegister *flagTemp)
{
- SelectionInstruction *insn = this->appendInsn(SEL_OP_UNTYPED_READ, elemNum, 1);
+ unsigned dstNum = flagTemp == NULL ? elemNum : elemNum+1;
+ SelectionInstruction *insn = this->appendInsn(SEL_OP_UNTYPED_READ, dstNum, 2);
SelectionVector *srcVector = this->appendVector();
SelectionVector *dstVector = this->appendVector();
if (this->isScalarReg(dst[0].reg()))
@@ -1279,8 +1308,16 @@ namespace gbe
// Regular instruction to encode
for (uint32_t elemID = 0; elemID < elemNum; ++elemID)
insn->dst(elemID) = dst[elemID];
+ if (flagTemp)
+ insn->dst(elemNum) = *flagTemp;
+
insn->src(0) = addr;
- insn->setbti(bti);
+ insn->src(1) = bti;
+ if (bti.file != GEN_IMMEDIATE_VALUE) {
+ insn->state.flag = 0;
+ insn->state.subFlag = 1;
+ }
+
insn->extra.elem = elemNum;
// Sends require contiguous allocation
@@ -1297,31 +1334,40 @@ namespace gbe
const GenRegister *src,
const GenRegister *tmp,
uint32_t srcNum,
- uint32_t bti,
- bool native_long)
+ GenRegister bti,
+ bool native_long,
+ GenRegister *flagTemp)
{
SelectionVector *vector = NULL;
SelectionInstruction *insn = NULL;
if (!native_long) {
- insn = this->appendInsn(SEL_OP_WRITE64, 0, srcNum + 1);
+ unsigned dstNum = flagTemp == NULL ? 0 : 1;
+ insn = this->appendInsn(SEL_OP_WRITE64, dstNum, srcNum + 2);
vector = this->appendVector();
- // Regular instruction to encode
+ // Register layout:
+ // dst: (flagTemp)
+ // src: addr, srcNum, bti
insn->src(0) = addr;
for (uint32_t elemID = 0; elemID < srcNum; ++elemID)
insn->src(elemID + 1) = src[elemID];
- insn->setbti(bti);
+ insn->src(srcNum+1) = bti;
+ if (flagTemp)
+ insn->dst(0) = *flagTemp;
insn->extra.elem = srcNum;
vector->regNum = srcNum + 1;
vector->reg = &insn->src(0);
vector->isSrc = 1;
} else { // handle the native long case
- insn = this->appendInsn(SEL_OP_WRITE64, srcNum, srcNum*2 + 1);
+ unsigned dstNum = flagTemp == NULL ? srcNum : srcNum+1;
+ // Register layout:
+ // dst: srcNum, (flagTemp)
+ // src: srcNum, addr, srcNum, bti.
+ insn = this->appendInsn(SEL_OP_WRITE64, dstNum, srcNum*2 + 2);
vector = this->appendVector();
- insn->src(0) = addr;
for (uint32_t elemID = 0; elemID < srcNum; ++elemID)
insn->src(elemID) = src[elemID];
@@ -1329,33 +1375,50 @@ namespace gbe
for (uint32_t elemID = 0; elemID < srcNum; ++elemID)
insn->src(srcNum + 1 + elemID) = tmp[0];
+ insn->src(srcNum*2+1) = bti;
/* We also need to add the tmp reigster to dst, in order
to avoid the post schedule error . */
for (uint32_t elemID = 0; elemID < srcNum; ++elemID)
insn->dst(elemID) = tmp[0];
- insn->setbti(bti);
+ if (flagTemp)
+ insn->dst(srcNum) = *flagTemp;
insn->extra.elem = srcNum;
vector->regNum = srcNum + 1;
vector->reg = &insn->src(srcNum);
vector->isSrc = 1;
}
+
+ if (bti.file != GEN_IMMEDIATE_VALUE) {
+ insn->state.flag = 0;
+ insn->state.subFlag = 1;
+ }
}
void Selection::Opaque::UNTYPED_WRITE(Reg addr,
const GenRegister *src,
uint32_t elemNum,
- uint32_t bti)
+ GenRegister bti,
+ GenRegister *flagTemp)
{
- SelectionInstruction *insn = this->appendInsn(SEL_OP_UNTYPED_WRITE, 0, elemNum+1);
+ unsigned dstNum = flagTemp == NULL ? 0 : 1; // flagTemp, when given, is the only destination
+ SelectionInstruction *insn = this->appendInsn(SEL_OP_UNTYPED_WRITE, dstNum, elemNum+2+dstNum); // sources: addr, elemNum values, bti, plus one slot for flagTemp so src(elemNum+2) below stays in range
SelectionVector *vector = this->appendVector();
+ if (bti.file != GEN_IMMEDIATE_VALUE) {
+ insn->state.flag = 0; // a register BTI needs a flag for the per-BTI loop: f0.1
+ insn->state.subFlag = 1;
+ }
+
+ if (flagTemp) insn->dst(0) = *flagTemp; // flag save slot read back by GenContext as insn.dst(0)
// Regular instruction to encode
insn->src(0) = addr;
for (uint32_t elemID = 0; elemID < elemNum; ++elemID)
insn->src(elemID+1) = src[elemID];
- insn->setbti(bti);
+ insn->src(elemNum+1) = bti; // BTI follows the address and the data sources
+ if (flagTemp)
+ insn->src(elemNum+2) = *flagTemp; // also listed as a source; this slot exists only when flagTemp is given
insn->extra.elem = elemNum;
// Sends require contiguous allocation for the sources
@@ -1364,17 +1427,26 @@ namespace gbe
vector->isSrc = 1;
}
- void Selection::Opaque::BYTE_GATHER(Reg dst, Reg addr, uint32_t elemSize, uint32_t bti) {
- SelectionInstruction *insn = this->appendInsn(SEL_OP_BYTE_GATHER, 1, 1);
+ void Selection::Opaque::BYTE_GATHER(Reg dst, Reg addr, uint32_t elemSize, GenRegister bti, GenRegister *flagTemp) {
+ unsigned dstNum = flagTemp == NULL ? 1 : 2;
+ SelectionInstruction *insn = this->appendInsn(SEL_OP_BYTE_GATHER, dstNum, 2);
SelectionVector *srcVector = this->appendVector();
SelectionVector *dstVector = this->appendVector();
+ if (bti.file != GEN_IMMEDIATE_VALUE) {
+ insn->state.flag = 0;
+ insn->state.subFlag = 1;
+ }
+
if (this->isScalarReg(dst.reg()))
insn->state.noMask = 1;
// Instruction to encode
insn->src(0) = addr;
+ insn->src(1) = bti;
insn->dst(0) = dst;
- insn->setbti(bti);
+ if (flagTemp)
+ insn->dst(1) = *flagTemp;
+
insn->extra.elem = elemSize;
// byte gather requires vector in the sense that scalar are not allowed
@@ -1387,14 +1459,22 @@ namespace gbe
srcVector->reg = &insn->src(0);
}
- void Selection::Opaque::BYTE_SCATTER(Reg addr, Reg src, uint32_t elemSize, uint32_t bti) {
- SelectionInstruction *insn = this->appendInsn(SEL_OP_BYTE_SCATTER, 0, 2);
+ void Selection::Opaque::BYTE_SCATTER(Reg addr, Reg src, uint32_t elemSize, GenRegister bti, GenRegister *flagTemp) {
+ unsigned dstNum = flagTemp == NULL ? 0 : 1;
+ SelectionInstruction *insn = this->appendInsn(SEL_OP_BYTE_SCATTER, dstNum, 3);
SelectionVector *vector = this->appendVector();
+ if (bti.file != GEN_IMMEDIATE_VALUE) {
+ insn->state.flag = 0;
+ insn->state.subFlag = 1;
+ }
+
+ if (flagTemp)
+ insn->dst(0) = *flagTemp;
// Instruction to encode
insn->src(0) = addr;
insn->src(1) = src;
- insn->setbti(bti);
+ insn->src(2) = bti;
insn->extra.elem = elemSize;
// value and address are contiguous in the send
@@ -3122,34 +3202,24 @@ namespace gbe
}
}
- /*! Load instruction pattern */
- DECL_PATTERN(LoadInstruction)
+ class LoadInstructionPattern : public SelectionPattern
{
+ public:
+ /*! Register the pattern for all opcodes of the family */
+ LoadInstructionPattern(void) : SelectionPattern(1, 1) {
+ this->opcodes.push_back(ir::OP_LOAD);
+ }
void readDWord(Selection::Opaque &sel,
vector<GenRegister> &dst,
- vector<GenRegister> &dst2,
GenRegister addr,
uint32_t valueNum,
ir::BTI bti) const
{
- for (uint32_t x = 0; x < bti.count; x++) {
- if(x > 0)
- for (uint32_t dstID = 0; dstID < valueNum; ++dstID)
- dst2[dstID] = sel.selReg(sel.reg(ir::FAMILY_DWORD), ir::TYPE_U32);
-
- GenRegister temp = getRelativeAddress(sel, addr, bti.bti[x]);
- sel.UNTYPED_READ(temp, dst2.data(), valueNum, bti.bti[x]);
- if(x > 0) {
- sel.push();
- if(sel.isScalarReg(dst[0].reg())) {
- sel.curr.noMask = 1;
- sel.curr.execWidth = 1;
- }
- for (uint32_t y = 0; y < valueNum; y++)
- sel.ADD(dst[y], dst[y], dst2[y]);
- sel.pop();
- }
- }
+ //GenRegister temp = getRelativeAddress(sel, addr, sel.selReg(bti.base, ir::TYPE_U32));
+
+ GenRegister b = bti.isConst ? GenRegister::immud(bti.imm) : sel.selReg(bti.reg, ir::TYPE_U32); // immediate when the BTI is constant, otherwise the register that holds it
+ GenRegister tmp = sel.selReg(sel.reg(ir::FAMILY_WORD, true), ir::TYPE_U16); // word-sized temporary passed as flagTemp; only handed over for a register BTI
+ sel.UNTYPED_READ(addr, dst.data(), valueNum, b, bti.isConst ? NULL : &tmp); // one read replaces the former per-BTI loop of reads and ADDs
}
void emitUntypedRead(Selection::Opaque &sel,
@@ -3160,10 +3230,9 @@ namespace gbe
using namespace ir;
const uint32_t valueNum = insn.getValueNum();
vector<GenRegister> dst(valueNum);
- vector<GenRegister> dst2(valueNum);
for (uint32_t dstID = 0; dstID < valueNum; ++dstID)
- dst2[dstID] = dst[dstID] = sel.selReg(insn.getValue(dstID), TYPE_U32);
- readDWord(sel, dst, dst2, addr, valueNum, bti);
+ dst[dstID] = sel.selReg(insn.getValue(dstID), TYPE_U32);
+ readDWord(sel, dst, addr, valueNum, bti);
}
void emitDWordGather(Selection::Opaque &sel,
@@ -3172,15 +3241,15 @@ namespace gbe
ir::BTI bti) const
{
using namespace ir;
- GBE_ASSERT(bti.count == 1);
- const uint32_t isUniform = sel.isScalarReg(insn.getValue(0));
+ GBE_ASSERT(bti.isConst == 1);
GBE_ASSERT(insn.getValueNum() == 1);
+ const uint32_t isUniform = sel.isScalarReg(insn.getValue(0));
if(isUniform) {
GenRegister dst = sel.selReg(insn.getValue(0), ir::TYPE_U32);
sel.push();
sel.curr.noMask = 1;
- sel.SAMPLE(&dst, 1, &addr, 1, bti.bti[0], 0, true, true);
+ sel.SAMPLE(&dst, 1, &addr, 1, bti.imm, 0, true, true);
sel.pop();
return;
}
@@ -3196,7 +3265,7 @@ namespace gbe
sel.SHR(addrDW, GenRegister::retype(addr, GEN_TYPE_UD), GenRegister::immud(2));
sel.pop();
- sel.DWORD_GATHER(dst, addrDW, bti.bti[0]);
+ sel.DWORD_GATHER(dst, addrDW, bti.imm);
}
void emitRead64(Selection::Opaque &sel,
@@ -3208,9 +3277,10 @@ namespace gbe
const uint32_t valueNum = insn.getValueNum();
/* XXX support scalar only right now. */
GBE_ASSERT(valueNum == 1);
- GBE_ASSERT(bti.count == 1);
+ GBE_ASSERT(bti.isConst == 1);
vector<GenRegister> dst(valueNum);
- GenRegister tmpAddr = getRelativeAddress(sel, addr, bti.bti[0]);
+ GenRegister b = bti.isConst ? GenRegister::immud(bti.imm) : sel.selReg(bti.reg, ir::TYPE_U32);
+ GenRegister tmpFlag = sel.selReg(sel.reg(ir::FAMILY_WORD, true), ir::TYPE_U16);
for ( uint32_t dstID = 0; dstID < valueNum; ++dstID)
dst[dstID] = sel.selReg(insn.getValue(dstID), ir::TYPE_U64);
@@ -3220,9 +3290,9 @@ namespace gbe
tmp[valueID] = GenRegister::retype(sel.selReg(sel.reg(ir::FAMILY_QWORD), ir::TYPE_U64), GEN_TYPE_UL);
}
- sel.READ64(tmpAddr, dst.data(), tmp.data(), valueNum, bti.bti[0], true);
+ sel.READ64(addr, dst.data(), tmp.data(), valueNum, b, true, bti.isConst ? NULL : &tmpFlag);
} else {
- sel.READ64(tmpAddr, dst.data(), NULL, valueNum, bti.bti[0], false);
+ sel.READ64(addr, dst.data(), NULL, valueNum, b, false, bti.isConst ? NULL : &tmpFlag);
}
}
@@ -3231,12 +3301,16 @@ namespace gbe
GenRegister address,
GenRegister dst,
bool isUniform,
- uint8_t bti) const
+ ir::BTI bti) const
{
using namespace ir;
Register tmpReg = sel.reg(FAMILY_DWORD, isUniform);
GenRegister tmpAddr = sel.selReg(sel.reg(FAMILY_DWORD, isUniform), ir::TYPE_U32);
GenRegister tmpData = sel.selReg(tmpReg, ir::TYPE_U32);
+
+ GenRegister b = bti.isConst ? GenRegister::immud(bti.imm) : sel.selReg(bti.reg, ir::TYPE_U32);
+ GenRegister tmpFlag = sel.selReg(sel.reg(ir::FAMILY_WORD, true), ir::TYPE_U16);
+
// Get dword aligned addr
sel.push();
if (isUniform) {
@@ -3248,7 +3322,7 @@ namespace gbe
sel.push();
if (isUniform)
sel.curr.noMask = 1;
- sel.UNTYPED_READ(tmpAddr, &tmpData, 1, bti);
+ sel.UNTYPED_READ(tmpAddr, &tmpData, 1, b, bti.isConst ? NULL : &tmpFlag);
if (isUniform)
sel.curr.execWidth = 1;
@@ -3284,14 +3358,11 @@ namespace gbe
uint32_t tmpRegNum = (typeSize*valueNum + 3) / 4;
vector<GenRegister> tmp(tmpRegNum);
- vector<GenRegister> tmp2(tmpRegNum);
- vector<Register> tmpReg(tmpRegNum);
for(uint32_t i = 0; i < tmpRegNum; i++) {
- tmpReg[i] = sel.reg(FAMILY_DWORD, isUniform);
- tmp2[i] = tmp[i] = sel.selReg(tmpReg[i], ir::TYPE_U32);
+ tmp[i] = sel.selReg(sel.reg(FAMILY_DWORD, isUniform), ir::TYPE_U32);
}
- readDWord(sel, tmp, tmp2, address, tmpRegNum, bti);
+ readDWord(sel, tmp, address, tmpRegNum, bti);
for(uint32_t i = 0; i < tmpRegNum; i++) {
unsigned int elemNum = (valueNum - i * (4 / typeSize)) > 4/typeSize ?
@@ -3396,7 +3467,7 @@ namespace gbe
sel.ADD(alignedAddr, alignedAddr, GenRegister::immud(pos * 4));
sel.pop();
}
- readDWord(sel, t1, t2, alignedAddr, width, bti);
+ readDWord(sel, t1, alignedAddr, width, bti);
remainedReg -= width;
pos += width;
} while(remainedReg);
@@ -3415,51 +3486,39 @@ namespace gbe
GBE_ASSERT(insn.getValueNum() == 1);
const GenRegister value = sel.selReg(insn.getValue(0), insn.getValueType());
GBE_ASSERT(elemSize == GEN_BYTE_SCATTER_WORD || elemSize == GEN_BYTE_SCATTER_BYTE);
- GenRegister tmp = value;
-
- for (int x = 0; x < bti.count; x++) {
- if (x > 0)
- tmp = sel.selReg(sel.reg(family, isUniform), insn.getValueType());
- GenRegister addr = getRelativeAddress(sel, address, bti.bti[x]);
- readByteAsDWord(sel, elemSize, addr, tmp, isUniform, bti.bti[x]);
- if (x > 0) {
- sel.push();
- if (isUniform) {
- sel.curr.noMask = 1;
- sel.curr.execWidth = 1;
- }
- sel.ADD(value, value, tmp);
- sel.pop();
- }
- }
+ readByteAsDWord(sel, elemSize, address, value, isUniform, bti);
}
}
- INLINE GenRegister getRelativeAddress(Selection::Opaque &sel, GenRegister address, uint8_t bti) const {
- if (bti == 0xfe || bti == BTI_CONSTANT)
- return address;
-
- sel.push();
- sel.curr.noMask = 1;
- if (GenRegister::hstride_size(address) == 0)
- sel.curr.execWidth = 1;
- GenRegister temp = sel.selReg(sel.reg(ir::FAMILY_DWORD, sel.curr.execWidth == 1), ir::TYPE_U32);
- sel.ADD(temp, address, GenRegister::negate(sel.selReg(sel.ctx.getSurfaceBaseReg(bti), ir::TYPE_U32)));
- sel.pop();
- return temp;
- }
// check whether all binded table index point to constant memory
INLINE bool isAllConstant(const ir::BTI &bti) const {
- for (int x = 0; x < bti.count; x++) {
- if (bti.bti[x] != BTI_CONSTANT)
- return false;
+ if (bti.isConst && bti.imm == BTI_CONSTANT)
+ return true;
+ return false;
+ }
+
+ INLINE ir::BTI getBTI(SelectionDAG &dag, const ir::LoadInstruction &insn) const {
+ using namespace ir;
+ SelectionDAG *child0 = dag.child[0];
+ ir::BTI b;
+ if (insn.isFixedBTI()) {
+ const auto &immInsn = cast<LoadImmInstruction>(child0->insn);
+ const auto imm = immInsn.getImmediate();
+ b.isConst = 1;
+ b.imm = imm.getIntegerValue();
+ } else {
+ b.isConst = 0;
+ b.reg = insn.getBTI();
}
- return true;
+ return b;
}
- INLINE bool emitOne(Selection::Opaque &sel, const ir::LoadInstruction &insn, bool &markChildren) const {
+ /*! Implements base class */
+ virtual bool emit(Selection::Opaque &sel, SelectionDAG &dag) const
+ {
using namespace ir;
+ const ir::LoadInstruction &insn = cast<ir::LoadInstruction>(dag.insn);
GenRegister address = sel.selReg(insn.getAddress(), ir::TYPE_U32);
GBE_ASSERT(insn.getAddressSpace() == MEM_GLOBAL ||
insn.getAddressSpace() == MEM_CONSTANT ||
@@ -3467,9 +3526,11 @@ namespace gbe
insn.getAddressSpace() == MEM_LOCAL ||
insn.getAddressSpace() == MEM_MIXED);
//GBE_ASSERT(sel.isScalarReg(insn.getValue(0)) == false);
+
+ BTI bti = getBTI(dag, insn);
+
const Type type = insn.getValueType();
const uint32_t elemSize = getByteScatterGatherSize(type);
- const BTI &bti = insn.getBTI();
bool allConstant = isAllConstant(bti);
if (allConstant) {
@@ -3494,65 +3555,79 @@ namespace gbe
else
this->emitUnalignedByteGather(sel, insn, elemSize, address, bti);
}
+
+
+ // for fixed bti, don't generate the useless loadi
+ if (insn.isFixedBTI())
+ dag.child[0] = NULL;
+ markAllChildren(dag);
+
return true;
}
- DECL_CTOR(LoadInstruction, 1, 1);
};
-
- /*! Store instruction pattern */
- DECL_PATTERN(StoreInstruction)
+ class StoreInstructionPattern : public SelectionPattern
{
+ public:
+ /*! Register the pattern for all opcodes of the family */
+ StoreInstructionPattern(void) : SelectionPattern(1, 1) {
+ this->opcodes.push_back(ir::OP_STORE);
+ }
void emitUntypedWrite(Selection::Opaque &sel,
const ir::StoreInstruction &insn,
- GenRegister addr,
- uint32_t bti) const
+ GenRegister address,
+ ir::BTI &bti) const
{
using namespace ir;
const uint32_t valueNum = insn.getValueNum();
vector<GenRegister> value(valueNum);
+ GenRegister b = bti.isConst ? GenRegister::immud(bti.imm) : sel.selReg(bti.reg, ir::TYPE_U32);
- addr = GenRegister::retype(addr, GEN_TYPE_F);
for (uint32_t valueID = 0; valueID < valueNum; ++valueID)
- value[valueID] = GenRegister::retype(sel.selReg(insn.getValue(valueID)), GEN_TYPE_F);
- sel.UNTYPED_WRITE(addr, value.data(), valueNum, bti);
+ value[valueID] = GenRegister::retype(sel.selReg(insn.getValue(valueID)), GEN_TYPE_UD);
+ GenRegister tmp = sel.selReg(sel.reg(FAMILY_WORD, true), ir::TYPE_U16);
+ sel.UNTYPED_WRITE(address, value.data(), valueNum, b, bti.isConst? NULL : &tmp);
}
void emitWrite64(Selection::Opaque &sel,
const ir::StoreInstruction &insn,
- GenRegister addr,
- uint32_t bti) const
+ GenRegister address,
+ ir::BTI &bti) const
{
using namespace ir;
const uint32_t valueNum = insn.getValueNum();
/* XXX support scalar only right now. */
GBE_ASSERT(valueNum == 1);
- addr = GenRegister::retype(addr, GEN_TYPE_UD);
+ GenRegister b = bti.isConst ? GenRegister::immud(bti.imm) : sel.selReg(bti.reg, ir::TYPE_U32);
vector<GenRegister> src(valueNum);
for (uint32_t valueID = 0; valueID < valueNum; ++valueID)
src[valueID] = sel.selReg(insn.getValue(valueID), ir::TYPE_U64);
+ GenRegister tmpFlag = sel.selReg(sel.reg(FAMILY_WORD, true), ir::TYPE_U16);
+
if (sel.hasLongType()) {
vector<GenRegister> tmp(valueNum);
for (uint32_t valueID = 0; valueID < valueNum; ++valueID) {
tmp[valueID] = GenRegister::retype(sel.selReg(sel.reg(ir::FAMILY_QWORD), ir::TYPE_U64), GEN_TYPE_UL);
}
- sel.WRITE64(addr, src.data(), tmp.data(), valueNum, bti, true);
+ sel.WRITE64(address, src.data(), tmp.data(), valueNum, b, true, bti.isConst? NULL : &tmpFlag);
} else {
- sel.WRITE64(addr, src.data(), NULL, valueNum, bti, false);
+ sel.WRITE64(address, src.data(), NULL, valueNum, b, false, bti.isConst? NULL : &tmpFlag);
}
}
void emitByteScatter(Selection::Opaque &sel,
const ir::StoreInstruction &insn,
const uint32_t elemSize,
- GenRegister addr,
- uint32_t bti,
+ GenRegister address,
+ ir::BTI &bti,
bool isUniform) const
{
using namespace ir;
uint32_t valueNum = insn.getValueNum();
+ GenRegister b = bti.isConst ? GenRegister::immud(bti.imm) : sel.selReg(bti.reg, ir::TYPE_U32);
+ GenRegister tmpFlag = sel.selReg(sel.reg(FAMILY_WORD, true), ir::TYPE_U16);
if(valueNum > 1) {
const uint32_t typeSize = getFamilySize(getFamily(insn.getValueType()));
vector<GenRegister> value(valueNum);
@@ -3572,11 +3647,12 @@ namespace gbe
sel.PACK_BYTE(tmp[i], value.data() + i * 4/typeSize, typeSize, 4/typeSize);
}
- sel.UNTYPED_WRITE(addr, tmp.data(), tmpRegNum, bti);
+ sel.UNTYPED_WRITE(address, tmp.data(), tmpRegNum, b, bti.isConst ? NULL : &tmpFlag);
} else {
const GenRegister value = sel.selReg(insn.getValue(0));
GBE_ASSERT(insn.getValueNum() == 1);
const GenRegister tmp = sel.selReg(sel.reg(FAMILY_DWORD, isUniform), ir::TYPE_U32);
+
sel.push();
if (isUniform) {
sel.curr.noMask = 1;
@@ -3588,47 +3664,52 @@ namespace gbe
else if (elemSize == GEN_BYTE_SCATTER_BYTE)
sel.MOV(tmp, GenRegister::retype(value, GEN_TYPE_UB));
sel.pop();
- sel.BYTE_SCATTER(addr, tmp, elemSize, bti);
+ sel.BYTE_SCATTER(address, tmp, elemSize, b, bti.isConst ? NULL : &tmpFlag);
}
}
- INLINE GenRegister getRelativeAddress(Selection::Opaque &sel, GenRegister address, uint8_t bti, bool isUniform) const {
- if(bti == 0xfe)
- return address;
- sel.push();
- sel.curr.noMask = 1;
- if (isUniform)
- sel.curr.execWidth = 1;
- GenRegister temp = sel.selReg(sel.reg(ir::FAMILY_DWORD, isUniform), ir::TYPE_U32);
- sel.ADD(temp, address, GenRegister::negate(sel.selReg(sel.ctx.getSurfaceBaseReg(bti), ir::TYPE_U32)));
- sel.pop();
- return temp;
+ INLINE ir::BTI getBTI(SelectionDAG &dag, const ir::StoreInstruction &insn) const {
+ using namespace ir;
+ SelectionDAG *child0 = dag.child[0];
+ ir::BTI b;
+ if (insn.isFixedBTI()) {
+ const auto &immInsn = cast<LoadImmInstruction>(child0->insn);
+ const auto imm = immInsn.getImmediate();
+ b.isConst = 1;
+ b.imm = imm.getIntegerValue();
+ } else {
+ b.isConst = 0;
+ b.reg = insn.getBTI();
+ }
+ return b;
}
-
- INLINE bool emitOne(Selection::Opaque &sel, const ir::StoreInstruction &insn, bool &markChildren) const
+ virtual bool emit(Selection::Opaque &sel, SelectionDAG &dag) const
{
using namespace ir;
+ const ir::StoreInstruction &insn = cast<ir::StoreInstruction>(dag.insn);
+ GenRegister address = sel.selReg(insn.getAddress(), ir::TYPE_U32);
const Type type = insn.getValueType();
const uint32_t elemSize = getByteScatterGatherSize(type);
- GenRegister address = sel.selReg(insn.getAddress(), ir::TYPE_U32);
const bool isUniform = sel.isScalarReg(insn.getAddress()) && sel.isScalarReg(insn.getValue(0));
+ BTI bti = getBTI(dag, insn);
- BTI bti = insn.getBTI();
- for (int x = 0; x < bti.count; x++) {
- GenRegister temp = getRelativeAddress(sel, address, bti.bti[x], isUniform);
- if (insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_QWORD)
- this->emitWrite64(sel, insn, temp, bti.bti[x]);
- else if (insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_DWORD)
- this->emitUntypedWrite(sel, insn, temp, bti.bti[x]);
- else {
- this->emitByteScatter(sel, insn, elemSize, temp, bti.bti[x], isUniform);
- }
+ if (insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_QWORD)
+ this->emitWrite64(sel, insn, address, bti);
+ else if (insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_DWORD)
+ this->emitUntypedWrite(sel, insn, address, bti);
+ else {
+ this->emitByteScatter(sel, insn, elemSize, address, bti, isUniform);
}
+
+ // for fixed bti, don't generate the useless loadi
+ if (insn.isFixedBTI())
+ dag.child[0] = NULL;
+ markAllChildren(dag);
+
return true;
}
- DECL_CTOR(StoreInstruction, 1, 1);
};
/*! Compare instruction pattern */
@@ -4226,38 +4307,61 @@ namespace gbe
DECL_CTOR(ConvertInstruction, 1, 1);
};
- /*! Convert instruction pattern */
- DECL_PATTERN(AtomicInstruction)
+ /*! Atomic instruction pattern */
+ class AtomicInstructionPattern : public SelectionPattern
{
- INLINE bool emitOne(Selection::Opaque &sel, const ir::AtomicInstruction &insn, bool &markChildren) const
- {
+ public:
+ AtomicInstructionPattern(void) : SelectionPattern(1,1) {
+ for (uint32_t op = 0; op < ir::OP_INVALID; ++op)
+ if (ir::isOpcodeFrom<ir::AtomicInstruction>(ir::Opcode(op)) == true)
+ this->opcodes.push_back(ir::Opcode(op));
+ }
+
+ INLINE ir::BTI getBTI(SelectionDAG &dag, const ir::AtomicInstruction &insn) const {
+ using namespace ir;
+ SelectionDAG *child0 = dag.child[0];
+ ir::BTI b;
+ if (insn.isFixedBTI()) {
+ const auto &immInsn = cast<LoadImmInstruction>(child0->insn);
+ const auto imm = immInsn.getImmediate();
+ b.isConst = 1;
+ b.imm = imm.getIntegerValue();
+ } else {
+ b.isConst = 0;
+ b.reg = insn.getBTI();
+ }
+ return b;
+ }
+
+ INLINE bool emit(Selection::Opaque &sel, SelectionDAG &dag) const {
using namespace ir;
+ const ir::AtomicInstruction &insn = cast<ir::AtomicInstruction>(dag.insn);
+
+ ir::BTI b = getBTI(dag, insn);
const AtomicOps atomicOp = insn.getAtomicOpcode();
- const AddressSpace space = insn.getAddressSpace();
- const uint32_t srcNum = insn.getSrcNum();
+ unsigned srcNum = insn.getSrcNum();
+ unsigned opNum = srcNum - 1;
- GenRegister src0 = sel.selReg(insn.getSrc(0), TYPE_U32); //address
- GenRegister src1 = src0, src2 = src0;
- if(srcNum > 1) src1 = sel.selReg(insn.getSrc(1), TYPE_U32);
- if(srcNum > 2) src2 = sel.selReg(insn.getSrc(2), TYPE_U32);
GenRegister dst = sel.selReg(insn.getDst(0), TYPE_U32);
+ GenRegister bti = b.isConst ? GenRegister::immud(b.imm) : sel.selReg(b.reg, ir::TYPE_U32);
+ GenRegister src0 = sel.selReg(insn.getSrc(1), TYPE_U32); //address
+ GenRegister src1 = src0, src2 = src0;
+ if(srcNum > 2) src1 = sel.selReg(insn.getSrc(2), TYPE_U32);
+ if(srcNum > 3) src2 = sel.selReg(insn.getSrc(3), TYPE_U32);
+
+ GenRegister flagTemp = sel.selReg(sel.reg(FAMILY_WORD, true), TYPE_U16);
+
GenAtomicOpCode genAtomicOp = (GenAtomicOpCode)atomicOp;
- if(space == MEM_LOCAL) {
- sel.ATOMIC(dst, genAtomicOp, srcNum, src0, src1, src2, 0xfe);
- } else {
- ir::BTI b = insn.getBTI();
- for (int x = 0; x < b.count; x++) {
- sel.push();
- sel.curr.noMask = 1;
- GenRegister temp = sel.selReg(sel.reg(FAMILY_DWORD), ir::TYPE_U32);
- sel.ADD(temp, src0, GenRegister::negate(sel.selReg(sel.ctx.getSurfaceBaseReg(b.bti[x]), ir::TYPE_U32)));
- sel.pop();
- sel.ATOMIC(dst, genAtomicOp, srcNum, temp, src1, src2, b.bti[x]);
- }
- }
+
+ sel.ATOMIC(dst, genAtomicOp, opNum, src0, src1, src2, bti, b.isConst ? NULL : &flagTemp);
+
+ // for fixed bti, don't generate the useless loadi
+ if (insn.isFixedBTI())
+ dag.child[0] = NULL;
+ markAllChildren(dag);
+
return true;
}
- DECL_CTOR(AtomicInstruction, 1, 1);
};
/*! Select instruction pattern */
diff --git a/backend/src/backend/gen_insn_selection.hpp b/backend/src/backend/gen_insn_selection.hpp
index 2262ef9..8c6caac 100644
--- a/backend/src/backend/gen_insn_selection.hpp
+++ b/backend/src/backend/gen_insn_selection.hpp
@@ -100,7 +100,7 @@ namespace gbe
struct {
/*! Store bti for loads/stores and function for math, atomic and compares */
uint16_t function:8;
- /*! elemSize for byte scatters / gathers, elemNum for untyped msg, bti for atomic */
+ /*! elemSize for byte scatters / gathers, elemNum for untyped msg, operand number for atomic */
uint16_t elem:8;
};
struct {
@@ -150,14 +150,7 @@ namespace gbe
INLINE uint32_t getbti() const {
GBE_ASSERT(isRead() || isWrite());
switch (opcode) {
- case SEL_OP_ATOMIC: return extra.elem;
- case SEL_OP_BYTE_SCATTER:
- case SEL_OP_WRITE64:
- case SEL_OP_DWORD_GATHER:
- case SEL_OP_UNTYPED_WRITE:
- case SEL_OP_UNTYPED_READ:
- case SEL_OP_BYTE_GATHER:
- case SEL_OP_READ64: return extra.function;
+ case SEL_OP_DWORD_GATHER: return extra.function;
case SEL_OP_SAMPLE: return extra.rdbti;
case SEL_OP_TYPED_WRITE: return extra.bti;
default:
@@ -169,14 +162,7 @@ namespace gbe
INLINE void setbti(uint32_t bti) {
GBE_ASSERT(isRead() || isWrite());
switch (opcode) {
- case SEL_OP_ATOMIC: extra.elem = bti; return;
- case SEL_OP_BYTE_SCATTER:
- case SEL_OP_WRITE64:
- case SEL_OP_UNTYPED_WRITE:
- case SEL_OP_DWORD_GATHER:
- case SEL_OP_UNTYPED_READ:
- case SEL_OP_BYTE_GATHER:
- case SEL_OP_READ64: extra.function = bti; return;
+ case SEL_OP_DWORD_GATHER: extra.function = bti; return;
case SEL_OP_SAMPLE: extra.rdbti = bti; return;
case SEL_OP_TYPED_WRITE: extra.bti = bti; return;
default:
diff --git a/backend/src/backend/program.h b/backend/src/backend/program.h
index 8c171f5..3637ebb 100644
--- a/backend/src/backend/program.h
+++ b/backend/src/backend/program.h
@@ -103,6 +103,7 @@ enum gbe_curbe_type {
GBE_CURBE_ONE,
GBE_CURBE_LANE_ID,
GBE_CURBE_SLM_OFFSET,
+ GBE_CURBE_BTI_UTIL,
};
/*! Extra arguments use the negative range of sub-values */
diff --git a/backend/src/ir/context.hpp b/backend/src/ir/context.hpp
index af65ff3..54265d0 100644
--- a/backend/src/ir/context.hpp
+++ b/backend/src/ir/context.hpp
@@ -190,22 +190,22 @@ namespace ir {
/*! LOAD with the destinations directly specified */
template <typename... Args>
- void LOAD(Type type, Register offset, AddressSpace space, bool dwAligned, BTI bti, Args...values)
+ void LOAD(Type type, Register offset, AddressSpace space, bool dwAligned, bool fixedBTI, Register bti, Args...values)
{
const Tuple index = this->tuple(values...);
const uint16_t valueNum = std::tuple_size<std::tuple<Args...>>::value;
GBE_ASSERT(valueNum > 0);
- this->LOAD(type, index, offset, space, valueNum, dwAligned, bti);
+ this->LOAD(type, index, offset, space, valueNum, dwAligned, fixedBTI, bti);
}
/*! STORE with the sources directly specified */
template <typename... Args>
- void STORE(Type type, Register offset, AddressSpace space, bool dwAligned, BTI bti, Args...values)
+ void STORE(Type type, Register offset, AddressSpace space, bool dwAligned, bool fixedBTI, Register bti, Args...values)
{
const Tuple index = this->tuple(values...);
const uint16_t valueNum = std::tuple_size<std::tuple<Args...>>::value;
GBE_ASSERT(valueNum > 0);
- this->STORE(type, index, offset, space, valueNum, dwAligned, bti);
+ this->STORE(type, index, offset, space, valueNum, dwAligned, fixedBTI, bti);
}
void appendSurface(uint8_t bti, Register reg) { fn->appendSurface(bti, reg); }
diff --git a/backend/src/ir/instruction.cpp b/backend/src/ir/instruction.cpp
index 784ae9c..e2c4a14 100644
--- a/backend/src/ir/instruction.cpp
+++ b/backend/src/ir/instruction.cpp
@@ -318,14 +318,14 @@ namespace ir {
class ALIGNED_INSTRUCTION AtomicInstruction :
public BasePolicy,
- public TupleSrcPolicy<AtomicInstruction>,
public NDstPolicy<AtomicInstruction, 1>
{
public:
AtomicInstruction(AtomicOps atomicOp,
Register dst,
AddressSpace addrSpace,
- BTI bti,
+ Register bti,
+ bool fixedBTI,
Tuple src)
{
this->opcode = OP_ATOMIC;
@@ -334,23 +334,43 @@ namespace ir {
this->src = src;
this->addrSpace = addrSpace;
this->bti = bti;
+ this->fixedBTI = fixedBTI ? 1: 0;
srcNum = 2;
if((atomicOp == ATOMIC_OP_INC) ||
(atomicOp == ATOMIC_OP_DEC))
srcNum = 1;
if(atomicOp == ATOMIC_OP_CMPXCHG)
srcNum = 3;
+ srcNum++;
}
+ INLINE Register getSrc(const Function &fn, uint32_t ID) const {
+ GBE_ASSERTM(ID < srcNum, "Out-of-bound source register for atomic");
+ if (ID == 0u)
+ return bti;
+ else
+ return fn.getRegister(src, ID -1);
+ }
+ INLINE void setSrc(Function &fn, uint32_t ID, Register reg) {
+ GBE_ASSERTM(ID < srcNum, "Out-of-bound source register for atomic");
+ if (ID == 0u)
+ bti = reg;
+ else
+ fn.setRegister(src, ID - 1, reg);
+ }
+ INLINE uint32_t getSrcNum(void) const { return srcNum; }
+
INLINE AddressSpace getAddressSpace(void) const { return this->addrSpace; }
- INLINE BTI getBTI(void) const { return bti; }
+ INLINE Register getBTI(void) const { return bti; }
+ INLINE bool isFixedBTI(void) const { return !!fixedBTI; }
INLINE AtomicOps getAtomicOpcode(void) const { return this->atomicOp; }
INLINE bool wellFormed(const Function &fn, std::string &whyNot) const;
INLINE void out(std::ostream &out, const Function &fn) const;
Register dst[1];
Tuple src;
AddressSpace addrSpace; //!< Address space
- BTI bti; //!< bti
- uint8_t srcNum:2; //!<Source Number
+ Register bti; //!< bti
+ uint8_t fixedBTI:1; //!< fixed bti or not
+ uint8_t srcNum:3; //!<Source Number
AtomicOps atomicOp:6; //!<Source Number
};
@@ -410,7 +430,7 @@ namespace ir {
class ALIGNED_INSTRUCTION LoadInstruction :
public BasePolicy,
- public NSrcPolicy<LoadInstruction, 1>
+ public NSrcPolicy<LoadInstruction, 2>
{
public:
LoadInstruction(Type type,
@@ -419,7 +439,8 @@ namespace ir {
AddressSpace addrSpace,
uint32_t valueNum,
bool dwAligned,
- BTI bti)
+ bool fixedBTI,
+ Register bti)
{
GBE_ASSERT(valueNum < 128);
this->opcode = OP_LOAD;
@@ -429,6 +450,7 @@ namespace ir {
this->addrSpace = addrSpace;
this->valueNum = valueNum;
this->dwAligned = dwAligned ? 1 : 0;
+ this->fixedBTI = fixedBTI ? 1 : 0;
this->bti = bti;
}
INLINE Register getDst(const Function &fn, uint32_t ID) const {
@@ -443,16 +465,18 @@ namespace ir {
INLINE Type getValueType(void) const { return type; }
INLINE uint32_t getValueNum(void) const { return valueNum; }
INLINE AddressSpace getAddressSpace(void) const { return addrSpace; }
- INLINE BTI getBTI(void) const { return bti; }
+ INLINE Register getBTI(void) const { return bti; }
INLINE bool wellFormed(const Function &fn, std::string &why) const;
INLINE void out(std::ostream &out, const Function &fn) const;
INLINE bool isAligned(void) const { return !!dwAligned; }
+ INLINE bool isFixedBTI(void) const { return !!fixedBTI; }
Type type; //!< Type to store
Register src[0]; //!< Address where to load from
+ Register bti;
Register offset; //!< Alias to make it similar to store
Tuple values; //!< Values to load
AddressSpace addrSpace; //!< Where to load
- BTI bti;
+ uint8_t fixedBTI:1;
uint8_t valueNum:7; //!< Number of values to load
uint8_t dwAligned:1; //!< DWORD aligned is what matters with GEN
};
@@ -467,7 +491,8 @@ namespace ir {
AddressSpace addrSpace,
uint32_t valueNum,
bool dwAligned,
- BTI bti)
+ bool fixedBTI,
+ Register bti)
{
GBE_ASSERT(valueNum < 255);
this->opcode = OP_STORE;
@@ -477,35 +502,42 @@ namespace ir {
this->addrSpace = addrSpace;
this->valueNum = valueNum;
this->dwAligned = dwAligned ? 1 : 0;
+ this->fixedBTI = fixedBTI ? 1 : 0;
this->bti = bti;
}
INLINE Register getSrc(const Function &fn, uint32_t ID) const {
- GBE_ASSERTM(ID < valueNum + 1u, "Out-of-bound source register for store");
+ GBE_ASSERTM(ID < valueNum + 2u, "Out-of-bound source register for store");
if (ID == 0u)
+ return bti;
+ else if (ID == 1u)
return offset;
else
- return fn.getRegister(values, ID - 1);
+ return fn.getRegister(values, ID - 2);
}
INLINE void setSrc(Function &fn, uint32_t ID, Register reg) {
- GBE_ASSERTM(ID < valueNum + 1u, "Out-of-bound source register for store");
+ GBE_ASSERTM(ID < valueNum + 2u, "Out-of-bound source register for store");
if (ID == 0u)
+ bti = reg;
+ else if (ID == 1u)
offset = reg;
else
- fn.setRegister(values, ID - 1, reg);
+ fn.setRegister(values, ID - 2, reg);
}
- INLINE uint32_t getSrcNum(void) const { return valueNum + 1u; }
+ INLINE uint32_t getSrcNum(void) const { return valueNum + 2u; }
INLINE uint32_t getValueNum(void) const { return valueNum; }
INLINE Type getValueType(void) const { return type; }
INLINE AddressSpace getAddressSpace(void) const { return addrSpace; }
- INLINE BTI getBTI(void) const { return bti; }
+ INLINE Register getBTI(void) const { return bti; }
INLINE bool wellFormed(const Function &fn, std::string &why) const;
INLINE void out(std::ostream &out, const Function &fn) const;
INLINE bool isAligned(void) const { return !!dwAligned; }
+ INLINE bool isFixedBTI(void) const { return !!fixedBTI; }
Type type; //!< Type to store
+ Register bti;
Register offset; //!< First source is the offset where to store
Tuple values; //!< Values to store
AddressSpace addrSpace; //!< Where to store
- BTI bti; //!< Which btis need access
+ uint8_t fixedBTI:1; //!< fixed bti or not
uint8_t valueNum:7; //!< Number of values to store
uint8_t dwAligned:1; //!< DWORD aligned is what matters with GEN
Register dst[0]; //!< No destination
@@ -985,10 +1017,12 @@ namespace ir {
return false;
if (UNLIKELY(checkRegisterData(FAMILY_DWORD, dst[0], fn, whyNot) == false))
return false;
- for (uint32_t srcID = 0; srcID < srcNum; ++srcID)
- if (UNLIKELY(checkRegisterData(FAMILY_DWORD, getSrc(fn, srcID), fn, whyNot) == false))
+ for (uint32_t srcID = 0; srcID < srcNum-1u; ++srcID)
+ if (UNLIKELY(checkRegisterData(FAMILY_DWORD, getSrc(fn, srcID+1u), fn, whyNot) == false))
return false;
+ if (UNLIKELY(checkRegisterData(FAMILY_DWORD, bti, fn, whyNot) == false))
+ return false;
return true;
}
@@ -1199,12 +1233,10 @@ namespace ir {
this->outOpcode(out);
out << "." << addrSpace;
out << " %" << this->getDst(fn, 0);
- out << " {" << "%" << this->getSrc(fn, 0) << "}";
- for (uint32_t i = 1; i < srcNum; ++i)
+ out << " {" << "%" << this->getSrc(fn, 1) << "}";
+ for (uint32_t i = 2; i < srcNum; ++i)
out << " %" << this->getSrc(fn, i);
- out << " bti";
- for (uint32_t i = 0; i < bti.count; ++i)
- out << ": " << (int)bti.bti[i];
+ out << (fixedBTI ? " bti" : " bti(mixed)") << " %" << this->getBTI();
}
@@ -1238,22 +1270,18 @@ namespace ir {
for (uint32_t i = 0; i < valueNum; ++i)
out << "%" << this->getDst(fn, i) << (i != (valueNum-1u) ? " " : "");
out << "}";
- out << " %" << this->getSrc(fn, 0);
- out << " bti";
- for (uint32_t i = 0; i < bti.count; ++i)
- out << ": " << (int)bti.bti[i];
+ out << " %" << this->getSrc(fn, 1);
+ out << (fixedBTI ? " bti" : " bti(mixed)") << " %" << this->getBTI();
}
INLINE void StoreInstruction::out(std::ostream &out, const Function &fn) const {
this->outOpcode(out);
out << "." << type << "." << addrSpace << (dwAligned ? "." : ".un") << "aligned";
- out << " %" << this->getSrc(fn, 0) << " {";
+ out << " %" << this->getSrc(fn, 1) << " {";
for (uint32_t i = 0; i < valueNum; ++i)
- out << "%" << this->getSrc(fn, i+1) << (i != (valueNum-1u) ? " " : "");
+ out << "%" << this->getSrc(fn, i+2) << (i != (valueNum-1u) ? " " : "");
out << "}";
- out << " bti";
- for (uint32_t i = 0; i < bti.count; ++i)
- out << ": " << (int)bti.bti[i];
+ out << (fixedBTI ? " bti" : " bti(mixed)") << " %" << this->getBTI();
}
INLINE void ReadARFInstruction::out(std::ostream &out, const Function &fn) const {
@@ -1604,18 +1632,18 @@ DECL_MEM_FN(BitCastInstruction, Type, getDstType(void), getDstType())
DECL_MEM_FN(ConvertInstruction, Type, getSrcType(void), getSrcType())
DECL_MEM_FN(ConvertInstruction, Type, getDstType(void), getDstType())
DECL_MEM_FN(AtomicInstruction, AddressSpace, getAddressSpace(void), getAddressSpace())
-DECL_MEM_FN(AtomicInstruction, BTI, getBTI(void), getBTI())
DECL_MEM_FN(AtomicInstruction, AtomicOps, getAtomicOpcode(void), getAtomicOpcode())
+DECL_MEM_FN(AtomicInstruction, bool, isFixedBTI(void), isFixedBTI())
DECL_MEM_FN(StoreInstruction, Type, getValueType(void), getValueType())
DECL_MEM_FN(StoreInstruction, uint32_t, getValueNum(void), getValueNum())
DECL_MEM_FN(StoreInstruction, AddressSpace, getAddressSpace(void), getAddressSpace())
-DECL_MEM_FN(StoreInstruction, BTI, getBTI(void), getBTI())
DECL_MEM_FN(StoreInstruction, bool, isAligned(void), isAligned())
+DECL_MEM_FN(StoreInstruction, bool, isFixedBTI(void), isFixedBTI())
DECL_MEM_FN(LoadInstruction, Type, getValueType(void), getValueType())
DECL_MEM_FN(LoadInstruction, uint32_t, getValueNum(void), getValueNum())
DECL_MEM_FN(LoadInstruction, AddressSpace, getAddressSpace(void), getAddressSpace())
-DECL_MEM_FN(LoadInstruction, BTI, getBTI(void), getBTI())
DECL_MEM_FN(LoadInstruction, bool, isAligned(void), isAligned())
+DECL_MEM_FN(LoadInstruction, bool, isFixedBTI(void), isFixedBTI())
DECL_MEM_FN(LoadImmInstruction, Type, getType(void), getType())
DECL_MEM_FN(LabelInstruction, LabelIndex, getLabelIndex(void), getLabelIndex())
DECL_MEM_FN(BranchInstruction, bool, isPredicated(void), isPredicated())
@@ -1782,8 +1810,8 @@ DECL_MEM_FN(GetImageInfoInstruction, uint8_t, getImageIndex(void), getImageIndex
}
// For all unary functions with given opcode
- Instruction ATOMIC(AtomicOps atomicOp, Register dst, AddressSpace space, BTI bti, Tuple src) {
- return internal::AtomicInstruction(atomicOp, dst, space, bti, src).convert();
+ Instruction ATOMIC(AtomicOps atomicOp, Register dst, AddressSpace space, Register bti, bool fixedBTI, Tuple src) {
+ return internal::AtomicInstruction(atomicOp, dst, space, bti, fixedBTI, src).convert();
}
// BRA
@@ -1831,9 +1859,10 @@ DECL_MEM_FN(GetImageInfoInstruction, uint8_t, getImageIndex(void), getImageIndex
AddressSpace space, \
uint32_t valueNum, \
bool dwAligned, \
- BTI bti) \
+ bool fixedBTI, \
+ Register bti) \
{ \
- return internal::CLASS(type,tuple,offset,space,valueNum,dwAligned,bti).convert(); \
+ return internal::CLASS(type,tuple,offset,space,valueNum,dwAligned,fixedBTI,bti).convert(); \
}
DECL_EMIT_FUNCTION(LOAD, LoadInstruction)
diff --git a/backend/src/ir/instruction.hpp b/backend/src/ir/instruction.hpp
index 343d12a..ec4d00d 100644
--- a/backend/src/ir/instruction.hpp
+++ b/backend/src/ir/instruction.hpp
@@ -36,10 +36,13 @@
namespace gbe {
namespace ir {
struct BTI {
- uint8_t bti[MAX_MIXED_POINTER];
- uint8_t count;
- BTI() : count(0) {
- memset(bti, 0, MAX_MIXED_POINTER);
+ uint8_t isConst; // whether fixed bti
+ union {
+ Register reg; // mixed reg
+ unsigned short imm; // fixed bti
+ };
+
+ BTI() : isConst(0) {
}
~BTI() {}
};
@@ -289,10 +292,12 @@ namespace ir {
class AtomicInstruction : public Instruction {
public:
/*! Where the address register goes */
- static const uint32_t addressIndex = 0;
+ static const uint32_t btiIndex = 0;
+ static const uint32_t addressIndex = 1;
/*! Address space that is manipulated here */
AddressSpace getAddressSpace(void) const;
- BTI getBTI(void) const;
+ Register getBTI(void) const { return this->getSrc(btiIndex); }
+ bool isFixedBTI(void) const;
/*! Return the atomic function code */
AtomicOps getAtomicOpcode(void) const;
/*! Return the register that contains the addresses */
@@ -307,12 +312,14 @@ namespace ir {
class StoreInstruction : public Instruction {
public:
/*! Where the address register goes */
- static const uint32_t addressIndex = 0;
+ static const uint32_t btiIndex = 0;
+ static const uint32_t addressIndex = 1;
/*! Return the types of the values to store */
Type getValueType(void) const;
/*! Give the number of values the instruction is storing (srcNum-1) */
uint32_t getValueNum(void) const;
- BTI getBTI(void) const;
+ Register getBTI(void) const { return this->getSrc(btiIndex); }
+ bool isFixedBTI(void) const;
/*! Address space that is manipulated here */
AddressSpace getAddressSpace(void) const;
/*! DWORD aligned means untyped read for Gen. That is what matters */
@@ -322,7 +329,7 @@ namespace ir {
/*! Return the register that contain value valueID */
INLINE Register getValue(uint32_t valueID) const {
GBE_ASSERT(valueID < this->getValueNum());
- return this->getSrc(valueID + 1u);
+ return this->getSrc(valueID + 2u);
}
/*! Return true if the given instruction is an instance of this class */
static bool isClassOf(const Instruction &insn);
@@ -343,8 +350,9 @@ namespace ir {
/*! DWORD aligned means untyped read for Gen. That is what matters */
bool isAligned(void) const;
/*! Return the register that contains the addresses */
- INLINE Register getAddress(void) const { return this->getSrc(0u); }
- BTI getBTI(void) const;
+ INLINE Register getAddress(void) const { return this->getSrc(1u); }
+ Register getBTI(void) const {return this->getSrc(0u);}
+ bool isFixedBTI(void) const;
/*! Return the register that contain value valueID */
INLINE Register getValue(uint32_t valueID) const {
return this->getDst(valueID);
@@ -708,7 +716,7 @@ namespace ir {
/*! F32TO16.{dstType <- srcType} dst src */
Instruction F32TO16(Type dstType, Type srcType, Register dst, Register src);
/*! atomic dst addr.space {src1 {src2}} */
- Instruction ATOMIC(AtomicOps opcode, Register dst, AddressSpace space, BTI bti, Tuple src);
+ Instruction ATOMIC(AtomicOps opcode, Register dst, AddressSpace space, Register bti, bool fixedBTI, Tuple src);
/*! bra labelIndex */
Instruction BRA(LabelIndex labelIndex);
/*! (pred) bra labelIndex */
@@ -724,9 +732,9 @@ namespace ir {
/*! ret */
Instruction RET(void);
/*! load.type.space {dst1,...,dst_valueNum} offset value */
- Instruction LOAD(Type type, Tuple dst, Register offset, AddressSpace space, uint32_t valueNum, bool dwAligned, BTI bti);
+ Instruction LOAD(Type type, Tuple dst, Register offset, AddressSpace space, uint32_t valueNum, bool dwAligned, bool fixedBTI, Register bti);
/*! store.type.space offset {src1,...,src_valueNum} value */
- Instruction STORE(Type type, Tuple src, Register offset, AddressSpace space, uint32_t valueNum, bool dwAligned, BTI bti);
+ Instruction STORE(Type type, Tuple src, Register offset, AddressSpace space, uint32_t valueNum, bool dwAligned, bool fixedBTI, Register bti);
/*! loadi.type dst value */
Instruction LOADI(Type type, Register dst, ImmediateIndex value);
/*! sync.params... (see Sync instruction) */
diff --git a/backend/src/ir/profile.cpp b/backend/src/ir/profile.cpp
index 2f6539a..af9f698 100644
--- a/backend/src/ir/profile.cpp
+++ b/backend/src/ir/profile.cpp
@@ -45,7 +45,8 @@ namespace ir {
"printf_buffer_pointer", "printf_index_buffer_pointer",
"dwblockip",
"lane_id",
- "invalid"
+ "invalid",
+ "bti_utility"
};
#if GBE_DEBUG
@@ -91,6 +92,7 @@ namespace ir {
DECL_NEW_REG(FAMILY_DWORD, dwblockip, 0);
DECL_NEW_REG(FAMILY_DWORD, laneid, 0);
DECL_NEW_REG(FAMILY_DWORD, invalid, 1);
+ DECL_NEW_REG(FAMILY_DWORD, btiUtil, 1);
}
#undef DECL_NEW_REG
diff --git a/backend/src/ir/profile.hpp b/backend/src/ir/profile.hpp
index 4de6fe0..9323824 100644
--- a/backend/src/ir/profile.hpp
+++ b/backend/src/ir/profile.hpp
@@ -74,7 +74,8 @@ namespace ir {
static const Register dwblockip = Register(30); // blockip
static const Register laneid = Register(31); // lane id.
static const Register invalid = Register(32); // used for valid comparation.
- static const uint32_t regNum = 33; // number of special registers
+ static const Register btiUtil = Register(33); // used for mixed pointer as bti utility.
+ static const uint32_t regNum = 34; // number of special registers
extern const char *specialRegMean[]; // special register name.
} /* namespace ocl */
diff --git a/backend/src/llvm/llvm_gen_backend.cpp b/backend/src/llvm/llvm_gen_backend.cpp
index 61b66b6..aec04fb 100644
--- a/backend/src/llvm/llvm_gen_backend.cpp
+++ b/backend/src/llvm/llvm_gen_backend.cpp
@@ -87,6 +87,7 @@
#endif /* LLVM_VERSION_MINOR <= 2 */
#include "llvm/Pass.h"
#include "llvm/PassManager.h"
+#include "llvm/IR/IRBuilder.h"
#if LLVM_VERSION_MINOR <= 2
#include "llvm/Intrinsics.h"
#include "llvm/IntrinsicInst.h"
@@ -290,11 +291,8 @@ namespace gbe
return ir::MEM_GLOBAL;
}
- static INLINE ir::AddressSpace btiToGen(const ir::BTI &bti) {
- if (bti.count > 1)
- return ir::MEM_MIXED;
- uint8_t singleBti = bti.bti[0];
- switch (singleBti) {
+ static INLINE ir::AddressSpace btiToGen(const unsigned bti) {
+ switch (bti) {
case BTI_CONSTANT: return ir::MEM_CONSTANT;
case BTI_PRIVATE: return ir::MEM_PRIVATE;
case BTI_LOCAL: return ir::MEM_LOCAL;
@@ -485,7 +483,14 @@ namespace gbe
map<Value *, SmallVector<Value *, 4>> pointerOrigMap;
typedef map<Value *, SmallVector<Value *, 4>>::iterator PtrOrigMapIter;
-
+ // map pointer source to bti
+ map<Value *, unsigned> BtiMap;
+ // map ptr to its bti register
+ map<Value *, Value *> BtiValueMap;
+ // map ptr to its base
+ map<Value *, Value *> pointerBaseMap;
+
+ typedef map<Value *, Value *>::iterator PtrBaseMapIter;
/*! We visit each function twice. Once to allocate the registers and once to
* emit the Gen IR instructions
*/
@@ -501,6 +506,7 @@ namespace gbe
} ConstTypeId;
LoopInfo *LI;
+ Function *Func;
const Module *TheModule;
int btiBase;
public:
@@ -547,23 +553,35 @@ namespace gbe
bool bKernel = isKernelFunction(F);
if(!bKernel) return false;
+ Func = &F;
+ assignBti(F);
analyzePointerOrigin(F);
+
LI = &getAnalysis<LoopInfo>();
emitFunction(F);
phiMap.clear();
globalPointer.clear();
pointerOrigMap.clear();
+ BtiMap.clear();
+ BtiValueMap.clear();
+ pointerBaseMap.clear();
// Reset for next function
btiBase = BTI_RESERVED_NUM;
return false;
}
/*! Given a possible pointer value, find out the interested escape like
load/store or atomic instruction */
- void findPointerEscape(Value *ptr);
+ void findPointerEscape(Value *ptr, std::set<Value *> &mixedPtr, bool recordMixed);
/*! For all possible pointers, GlobalVariable, function pointer argument,
alloca instruction, find their pointer escape points */
void analyzePointerOrigin(Function &F);
+ unsigned getNewBti(Value *origin);
+ void assignBti(Function &F);
+ bool isSingleBti(Value *Val);
+ Value *getBtiRegister(Value *v);
+ Value *getPointerBase(Value *ptr);
+ MDNode *getKernelFunctionMetadata(Function *F);
virtual bool doFinalization(Module &M) { return false; }
/*! handle global variable register allocation (local, constant space) */
void allocateGlobalVariableRegister(Function &F);
@@ -660,10 +678,10 @@ namespace gbe
// batch vec4/8/16 load/store
INLINE void emitBatchLoadOrStore(const ir::Type type, const uint32_t elemNum,
Value *llvmValue, const ir::Register ptr,
- const ir::AddressSpace addrSpace, Type * elemType, bool isLoad, ir::BTI bti,
- bool dwAligned);
+ const ir::AddressSpace addrSpace, Type * elemType, bool isLoad, ir::Register bti,
+ bool dwAligned, bool fixedBTI);
// handle load of dword/qword with unaligned address
- void emitUnalignedDQLoadStore(Value *llvmPtr, Value *llvmValues, ir::AddressSpace addrSpace, ir::BTI &binding, bool isLoad, bool dwAligned);
+ void emitUnalignedDQLoadStore(ir::Register ptr, Value *llvmValues, ir::AddressSpace addrSpace, ir::Register bti, bool isLoad, bool dwAligned, bool fixedBTI);
void visitInstruction(Instruction &I) {NOT_SUPPORTED;}
private:
ir::ImmediateIndex processConstantImmIndexImpl(Constant *CPV, int32_t index = 0u);
@@ -675,7 +693,44 @@ namespace gbe
char GenWriter::ID = 0;
- void GenWriter::findPointerEscape(Value *ptr) {
+ static void updatePointerSource(Value *parent, Value *theUser, Value *source, SmallVector<Value *, 4> &pointers) {
+ if (isa<SelectInst>(theUser)) {
+ SelectInst *si = dyn_cast<SelectInst>(theUser);
+ if (si->getTrueValue() == parent)
+ pointers[0] = source;
+ else
+ pointers[1] = source;
+ } else if (isa<PHINode>(theUser)) {
+ PHINode *phi = dyn_cast<PHINode>(theUser);
+ unsigned opNum = phi->getNumIncomingValues();
+ for (unsigned j = 0; j < opNum; j++) {
+ if (phi->getIncomingValue(j) == parent) {
+ pointers[j] = source;
+ }
+ }
+ } else {
+ pointers[0] = source;
+ }
+ }
+
+ bool isMixedPoint(Value *val, SmallVector<Value *, 4> &pointers) {
+ Value *validSrc = NULL;
+ unsigned i = 0;
+ if (pointers.size() < 2) return false;
+ while(i < pointers.size()) {
+ if (pointers[i] != NULL && validSrc != NULL && pointers[i] != validSrc)
+ return true;
+ // when source is same as itself, we don't treat it as a new source
+ // this often occurs for PHINode
+ if (pointers[i] != NULL && validSrc == NULL && pointers[i] != val) {
+ validSrc = pointers[i];
+ }
+ i++;
+ }
+ return false;
+ }
+
+ void GenWriter::findPointerEscape(Value *ptr, std::set<Value *> &mixedPtr, bool bFirstPass) {
std::vector<Value*> workList;
std::set<Value *> visited;
@@ -695,7 +750,52 @@ namespace gbe
#else
User *theUser = iter->getUser();
#endif
- if (visited.find(theUser) != visited.end()) continue;
+ bool visitedInThisSource = visited.find(theUser) != visited.end();
+
+ if (isa<SelectInst>(theUser) || isa<PHINode>(theUser))
+ {
+ // reached from another source, update pointer source
+ PtrOrigMapIter ptrIter = pointerOrigMap.find(theUser);
+ if (ptrIter == pointerOrigMap.end()) {
+ // create new one
+ unsigned capacity = 1;
+ if (isa<SelectInst>(theUser)) capacity = 2;
+ if (isa<PHINode>(theUser)) {
+ PHINode *phi = dyn_cast<PHINode>(theUser);
+ capacity = phi->getNumIncomingValues();
+ }
+
+ SmallVector<Value *, 4> pointers;
+
+ unsigned k = 0;
+ while (k++ < capacity) {
+ pointers.push_back(NULL);
+ }
+
+ updatePointerSource(work, theUser, ptr, pointers);
+ pointerOrigMap.insert(std::make_pair(theUser, pointers));
+ } else {
+ // update pointer source
+ updatePointerSource(work, theUser, ptr, (*ptrIter).second);
+ }
+ ptrIter = pointerOrigMap.find(theUser);
+
+ if (isMixedPoint(theUser, (*ptrIter).second)) {
+ // for the first pass, we need to record the mixed point instruction.
+ // for the second pass, we don't need to go further, the reason is:
+ // we always use its 'direct mixed pointer parent' as origin, if we don't
+ // stop here, we may set the wrong pointer origin.
+ if (bFirstPass)
+ mixedPtr.insert(theUser);
+ else
+ continue;
+ }
+ // don't fall into an infinite loop
+ if (visitedInThisSource || theUser == ptr) {
+ continue;
+ }
+ }
+
// pointer address is used as the ValueOperand in store instruction, should be skipped
if (StoreInst *load = dyn_cast<StoreInst>(theUser)) {
if (load->getValueOperand() == work) {
@@ -710,16 +810,33 @@ namespace gbe
Function *F = dyn_cast<CallInst>(theUser)->getCalledFunction();
if (!F || F->getIntrinsicID() != 0) continue;
}
+ Value *pointer = NULL;
+ if (isa<LoadInst>(theUser)) {
+ pointer = dyn_cast<LoadInst>(theUser)->getPointerOperand();
+ } else if (isa<StoreInst>(theUser)) {
+ pointer = dyn_cast<StoreInst>(theUser)->getPointerOperand();
+ } else if (isa<CallInst>(theUser)) {
+ // atomic/read(write)image
+ CallInst *ci = dyn_cast<CallInst>(theUser);
+ pointer = ci->getArgOperand(0);
+ } else {
+ theUser->dump();
+ GBE_ASSERT(0 && "Unknown instruction operating on pointers\n");
+ }
- PtrOrigMapIter ptrIter = pointerOrigMap.find(theUser);
+ // the pointer operand is same as pointer origin, don't add to pointerOrigMap
+ if (ptr == pointer) continue;
+
+ // load/store/atomic instruction, we have reached the end, stop further traversing
+ PtrOrigMapIter ptrIter = pointerOrigMap.find(pointer);
if (ptrIter == pointerOrigMap.end()) {
// create new one
SmallVector<Value *, 4> pointers;
pointers.push_back(ptr);
- pointerOrigMap.insert(std::make_pair(theUser, pointers));
+ pointerOrigMap.insert(std::make_pair(pointer, pointers));
} else {
- // append it
- (*ptrIter).second.push_back(ptr);
+ // update the pointer source here,
+ (*ptrIter).second[0] = ptr;
}
} else {
workList.push_back(theUser);
@@ -727,28 +844,307 @@ namespace gbe
}
}
}
+ bool GenWriter::isSingleBti(Value *Val) {
+ // self + others same --> single
+ // all same ---> single
+ if (!isa<SelectInst>(Val) && !isa<PHINode>(Val)) {
+ return true;
+ } else {
+ PtrOrigMapIter iter = pointerOrigMap.find(Val);
+ SmallVector<Value *, 4> &pointers = (*iter).second;
+ unsigned srcNum = pointers.size();
+ Value *source = NULL;
+ for (unsigned x = 0; x < srcNum; x++) {
+ // often happens in a PHINode where one source is the PHINode itself; skip it
+ if (pointers[x] == Val) continue;
+
+ if (source == NULL) source = pointers[x];
+ else {
+ if (source != pointers[x])
+ return false;
+ }
+ }
+ return true;
+ }
+ }
+ Value *GenWriter::getPointerBase(Value *ptr) {
+ PtrBaseMapIter baseIter = pointerBaseMap.find(ptr);
+ if (baseIter != pointerBaseMap.end()) {
+ return baseIter->second;
+ }
+ typedef std::map<Value *, unsigned>::iterator BtiIter;
+ // a pointer that has already been assigned a bti is itself the base pointer
+ BtiIter found = BtiMap.find(ptr);
+ if (found != BtiMap.end()) {
+ if (isa<PointerType>(ptr->getType())) {
+ PointerType *ty = cast<PointerType>(ptr->getType());
+ // only global pointer will have starting address
+ if (ty->getAddressSpace() == 1) {
+ return ptr;
+ } else {
+ return ConstantPointerNull::get(ty);
+ }
+ } else {
+ PointerType *ty = PointerType::get(ptr->getType(), 0);
+ return ConstantPointerNull::get(ty);
+ }
+ }
+
+ PtrOrigMapIter iter = pointerOrigMap.find(ptr);
+ SmallVector<Value *, 4> &pointers = (*iter).second;
+ if (isSingleBti(ptr)) {
+ Value *base = getPointerBase(pointers[0]);
+ pointerBaseMap.insert(std::make_pair(ptr, base));
+ return base;
+ } else {
+ if (isa<SelectInst>(ptr)) {
+ SelectInst *si = dyn_cast<SelectInst>(ptr);
+ IRBuilder<> Builder(si->getParent());
+
+ Value *trueVal = getPointerBase((*iter).second[0]);
+ Value *falseVal = getPointerBase((*iter).second[1]);
+ Builder.SetInsertPoint(si);
+ Value *base = Builder.CreateSelect(si->getCondition(), trueVal, falseVal);
+ pointerBaseMap.insert(std::make_pair(ptr, base));
+ return base;
+ } else if (isa<PHINode>(ptr)) {
+ PHINode *phi = dyn_cast<PHINode>(ptr);
+ IRBuilder<> Builder(phi->getParent());
+ Builder.SetInsertPoint(phi);
+
+ PHINode *basePhi = Builder.CreatePHI(ptr->getType(), phi->getNumIncomingValues());
+ unsigned srcNum = pointers.size();
+ for (unsigned x = 0; x < srcNum; x++) {
+ Value *base = NULL;
+ if (pointers[x] != ptr) {
+ base = getPointerBase(pointers[x]);
+ } else {
+ base = basePhi;
+ }
+ IRBuilder<> Builder2(phi->getIncomingBlock(x));
+ BasicBlock *predBB = phi->getIncomingBlock(x);
+ if (predBB->getTerminator())
+ Builder2.SetInsertPoint(predBB->getTerminator());
+
+#if (LLVM_VERSION_MAJOR== 3 && LLVM_VERSION_MINOR < 6)
+ // LLVM 3.5 and older versions don't define CreateBitOrPointerCast()
+ Type *srcTy = base->getType();
+ Type *dstTy = ptr->getType();
+ if (srcTy->isPointerTy() && dstTy->isIntegerTy())
+ base = Builder2.CreatePtrToInt(base, dstTy);
+ else if (srcTy->isIntegerTy() && dstTy->isPointerTy())
+ base = Builder2.CreateIntToPtr(base, dstTy);
+ else if (srcTy != dstTy)
+ base = Builder2.CreateBitCast(base, dstTy);
+#else
+ base = Builder2.CreateBitOrPointerCast(base, ptr->getType());
+#endif
+ basePhi->addIncoming(base, phi->getIncomingBlock(x));
+ }
+ pointerBaseMap.insert(std::make_pair(ptr, basePhi));
+ return basePhi;
+ } else {
+ ptr->dump();
+ GBE_ASSERT(0 && "Unhandled instruction in getBtiRegister\n");
+ return ptr;
+ }
+ }
+ }
+
+ Value *GenWriter::getBtiRegister(Value *Val) {
+ typedef std::map<Value *, unsigned>::iterator BtiIter;
+ typedef std::map<Value *, Value *>::iterator BtiValueIter;
+ BtiIter found = BtiMap.find(Val);
+ BtiValueIter valueIter = BtiValueMap.find(Val);
+ if (valueIter != BtiValueMap.end())
+ return valueIter->second;
+
+ if (found != BtiMap.end()) {
+ // Val has already been assigned a BTI, return it
+ Value *bti = ConstantInt::get(IntegerType::get(Val->getContext(), 32), found->second);
+ BtiValueMap.insert(std::make_pair(Val, bti));
+ return bti;
+ } else {
+ if (isSingleBti(Val)) {
+ PtrOrigMapIter iter = pointerOrigMap.find(Val);
+ Value * bti = getBtiRegister((*iter).second[0]);
+ BtiValueMap.insert(std::make_pair(Val, bti));
+ return bti;
+ } else {
+ if (isa<SelectInst>(Val)) {
+ SelectInst *si = dyn_cast<SelectInst>(Val);
+
+ IRBuilder<> Builder(si->getParent());
+ PtrOrigMapIter iter = pointerOrigMap.find(Val);
+ Value *trueVal = getBtiRegister((*iter).second[0]);
+ Value *falseVal = getBtiRegister((*iter).second[1]);
+ Builder.SetInsertPoint(si);
+ Value *bti = Builder.CreateSelect(si->getCondition(), trueVal, falseVal);
+ BtiValueMap.insert(std::make_pair(Val, bti));
+ return bti;
+ } else if (isa<PHINode>(Val)) {
+ PHINode *phi = dyn_cast<PHINode>(Val);
+ IRBuilder<> Builder(phi->getParent());
+ Builder.SetInsertPoint(phi);
+
+ PHINode *btiPhi = Builder.CreatePHI(IntegerType::get(Val->getContext(), 32), phi->getNumIncomingValues());
+ PtrOrigMapIter iter = pointerOrigMap.find(Val);
+ SmallVector<Value *, 4> &pointers = (*iter).second;
+ unsigned srcNum = pointers.size();
+ for (unsigned x = 0; x < srcNum; x++) {
+ Value *bti = NULL;
+ if (pointers[x] != Val) {
+ bti = getBtiRegister(pointers[x]);
+ } else {
+ bti = btiPhi;
+ }
+ btiPhi->addIncoming(bti, phi->getIncomingBlock(x));
+ }
+ BtiValueMap.insert(std::make_pair(Val, btiPhi));
+ return btiPhi;
+ } else {
+ Val->dump();
+ GBE_ASSERT(0 && "Unhandled instruction in getBtiRegister\n");
+ return Val;
+ }
+ }
+ }
+ }
+
+ unsigned GenWriter::getNewBti(Value *origin) {
+ unsigned new_bti = 0;
+ if(origin->getName().equals(StringRef("__gen_ocl_printf_buf"))) {
+ new_bti = btiBase;
+ incBtiBase();
+ } else if (origin->getName().equals(StringRef("__gen_ocl_printf_index_buf"))) {
+ new_bti = btiBase;
+ incBtiBase();
+ }
+ else if (isa<GlobalVariable>(origin)
+ && dyn_cast<GlobalVariable>(origin)->isConstant()) {
+ new_bti = BTI_CONSTANT;
+ } else {
+ unsigned space = origin->getType()->getPointerAddressSpace();
+ switch (space) {
+ case 0:
+ new_bti = BTI_PRIVATE;
+ break;
+ case 1:
+ {
+ new_bti = btiBase;
+ incBtiBase();
+ break;
+ }
+ case 2:
+ new_bti = BTI_CONSTANT;
+
+ break;
+ case 3:
+ new_bti = BTI_LOCAL;
+ break;
+ default:
+ GBE_ASSERT(0);
+ break;
+ }
+ }
+ return new_bti;
+ }
+
+ MDNode *GenWriter::getKernelFunctionMetadata(Function *F) {
+ NamedMDNode *clKernels = TheModule->getNamedMetadata("opencl.kernels");
+ uint32_t ops = clKernels->getNumOperands();
+ for(uint32_t x = 0; x < ops; x++) {
+ MDNode* node = clKernels->getOperand(x);
+#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR <= 5
+ Value * op = node->getOperand(0);
+#else
+ auto *V = cast<ValueAsMetadata>(node->getOperand(0));
+ Value *op = V ? V->getValue() : NULL;
+#endif
+ if(op == F) {
+ return node;
+ }
+ }
+ return NULL;
+ }
+
+ void GenWriter::assignBti(Function &F) {
+ Module::GlobalListType &globalList = const_cast<Module::GlobalListType &> (TheModule->getGlobalList());
+ for(auto i = globalList.begin(); i != globalList.end(); i ++) {
+ GlobalVariable &v = *i;
+ if(!v.isConstantUsed()) continue;
+
+ BtiMap.insert(std::make_pair(&v, getNewBti(&v)));
+ }
+ MDNode *typeNameNode = NULL;
+ MDNode *node = getKernelFunctionMetadata(&F);
+ for(uint j = 0; j < node->getNumOperands() - 1; j++) {
+ MDNode *attrNode = dyn_cast_or_null<MDNode>(node->getOperand(1 + j));
+ if (attrNode == NULL) break;
+ MDString *attrName = dyn_cast_or_null<MDString>(attrNode->getOperand(0));
+ if (!attrName) continue;
+ if (attrName->getString() == "kernel_arg_type") {
+ typeNameNode = attrNode;
+ }
+ }
+
+ unsigned argID = 0;
+ ir::FunctionArgument::InfoFromLLVM llvmInfo;
+ for (Function::arg_iterator I = F.arg_begin(), E = F.arg_end(); I != E; ++I, argID++) {
+ llvmInfo.typeName= (cast<MDString>(typeNameNode->getOperand(1 + argID)))->getString();
+ if (I->getType()->isPointerTy() || llvmInfo.isImageType()) {
+ BtiMap.insert(std::make_pair(I, getNewBti(I)));
+ }
+ }
+
+ BasicBlock &bb = F.getEntryBlock();
+ for (BasicBlock::iterator iter = bb.begin(), iterE = bb.end(); iter != iterE; ++iter) {
+ if (AllocaInst *ai = dyn_cast<AllocaInst>(iter)) {
+ BtiMap.insert(std::make_pair(ai, BTI_PRIVATE));
+ }
+ }
+ }
void GenWriter::analyzePointerOrigin(Function &F) {
+ // used to record where the pointers get mixed (i.e. select or phi instruction)
+ std::set<Value *> mixedPtr;
+ // This is a two-pass algorithm, the 1st pass will try to update the pointer sources for
+ // every instruction reachable from pointers and record mix-point in this pass.
+ // The second pass will start from really mixed-pointer instruction like select or phinode.
+ // and update the sources correctly. For pointers reachable from mixed-pointer, we will set
+ // its direct mixed-pointer parent as its pointer origin.
+
// GlobalVariable
Module::GlobalListType &globalList = const_cast<Module::GlobalListType &> (TheModule->getGlobalList());
for(auto i = globalList.begin(); i != globalList.end(); i ++) {
GlobalVariable &v = *i;
if(!v.isConstantUsed()) continue;
- findPointerEscape(&v);
+ findPointerEscape(&v, mixedPtr, true);
}
// function argument
for (Function::arg_iterator I = F.arg_begin(), E = F.arg_end(); I != E; ++I) {
if (I->getType()->isPointerTy()) {
- findPointerEscape(I);
+ findPointerEscape(I, mixedPtr, true);
}
}
// alloca
BasicBlock &bb = F.getEntryBlock();
for (BasicBlock::iterator iter = bb.begin(), iterE = bb.end(); iter != iterE; ++iter) {
if (AllocaInst *ai = dyn_cast<AllocaInst>(iter)) {
- findPointerEscape(ai);
+ findPointerEscape(ai, mixedPtr, true);
}
}
+ // the second pass starts from mixed pointer
+ for (std::set<Value *>::iterator iter = mixedPtr.begin(); iter != mixedPtr.end(); ++iter) {
+ findPointerEscape(*iter, mixedPtr, false);
+ }
+
+ for (std::set<Value *>::iterator iter = mixedPtr.begin(); iter != mixedPtr.end(); ++iter) {
+ getBtiRegister(*iter);
+ }
+ for (std::set<Value *>::iterator iter = mixedPtr.begin(); iter != mixedPtr.end(); ++iter) {
+ getPointerBase(*iter);
+ }
}
void getSequentialData(const ConstantDataSequential *cda, void *ptr, uint32_t &offset) {
@@ -1253,11 +1649,9 @@ namespace gbe
"Returned value for kernel functions is forbidden");
// Loop over the kernel metadatas to set the required work group size.
- NamedMDNode *clKernelMetaDatas = TheModule->getNamedMetadata("opencl.kernels");
size_t reqd_wg_sz[3] = {0, 0, 0};
size_t hint_wg_sz[3] = {0, 0, 0};
ir::FunctionArgument::InfoFromLLVM llvmInfo;
- MDNode *node = NULL;
MDNode *addrSpaceNode = NULL;
MDNode *typeNameNode = NULL;
MDNode *accessQualNode = NULL;
@@ -1267,16 +1661,7 @@ namespace gbe
std::string functionAttributes;
/* First find the meta data belong to this function. */
- for(uint i = 0; i < clKernelMetaDatas->getNumOperands(); i++) {
- node = clKernelMetaDatas->getOperand(i);
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR <= 5
- if (node->getOperand(0) == &F) break;
-#else
- auto *V = cast<ValueAsMetadata>(node->getOperand(0));
- if (V && V->getValue() == &F) break;
-#endif
- node = NULL;
- }
+ MDNode *node = getKernelFunctionMetadata(&F);
/* because "-cl-kernel-arg-info", should always have meta data. */
if (!F.arg_empty())
@@ -1362,7 +1747,6 @@ namespace gbe
functionAttributes += " ";
}
}
- ctx.appendSurface(1, ir::ocl::stackbuffer);
ctx.getFunction().setCompileWorkGroupSize(reqd_wg_sz[0], reqd_wg_sz[1], reqd_wg_sz[2]);
@@ -1419,7 +1803,7 @@ namespace gbe
const ir::Register reg = getRegister(I);
if (llvmInfo.isImageType()) {
ctx.input(argName, ir::FunctionArgument::IMAGE, reg, llvmInfo, 4, 4, 0);
- ctx.getFunction().getImageSet()->append(reg, &ctx, incBtiBase());
+ ctx.getFunction().getImageSet()->append(reg, &ctx, BtiMap.find(I)->second);
collectImageArgs(llvmInfo.accessQual, imageArgsInfo);
continue;
}
@@ -1452,10 +1836,7 @@ namespace gbe
const uint32_t align = getAlignmentByte(unit, pointed);
switch (addrSpace) {
case ir::MEM_GLOBAL:
- globalPointer.insert(std::make_pair(I, btiBase));
- ctx.appendSurface(btiBase, reg);
- ctx.input(argName, ir::FunctionArgument::GLOBAL_POINTER, reg, llvmInfo, ptrSize, align, btiBase);
- incBtiBase();
+ ctx.input(argName, ir::FunctionArgument::GLOBAL_POINTER, reg, llvmInfo, ptrSize, align, BtiMap.find(I)->second);
break;
case ir::MEM_LOCAL:
ctx.input(argName, ir::FunctionArgument::LOCAL_POINTER, reg, llvmInfo, ptrSize, align, BTI_LOCAL);
@@ -1806,14 +2187,10 @@ namespace gbe
ctx.LOADI(ir::TYPE_S32, reg, ctx.newIntegerImmediate(con.getOffset(), ir::TYPE_S32));
} else {
if(v.getName().equals(StringRef("__gen_ocl_printf_buf"))) {
- ctx.appendSurface(btiBase, ir::ocl::printfbptr);
- ctx.getFunction().getPrintfSet()->setBufBTI(btiBase);
- globalPointer.insert(std::make_pair(&v, incBtiBase()));
+ ctx.getFunction().getPrintfSet()->setBufBTI(BtiMap.find(const_cast<GlobalVariable*>(&v))->second);
regTranslator.newScalarProxy(ir::ocl::printfbptr, const_cast<GlobalVariable*>(&v));
} else if(v.getName().equals(StringRef("__gen_ocl_printf_index_buf"))) {
- ctx.appendSurface(btiBase, ir::ocl::printfiptr);
- ctx.getFunction().getPrintfSet()->setIndexBufBTI(btiBase);
- globalPointer.insert(std::make_pair(&v, incBtiBase()));
+ ctx.getFunction().getPrintfSet()->setIndexBufBTI(BtiMap.find(const_cast<GlobalVariable*>(&v))->second);
regTranslator.newScalarProxy(ir::ocl::printfiptr, const_cast<GlobalVariable*>(&v));
} else if(v.getName().str().substr(0, 4) == ".str") {
/* When there are multi printf statements in multi kernel fucntions within the same
@@ -2045,6 +2422,7 @@ namespace gbe
}
ctx.startFunction(F.getName());
+
ir::Function &fn = ctx.getFunction();
this->regTranslator.clear();
this->labelMap.clear();
@@ -2838,19 +3216,46 @@ namespace gbe
CallSite::arg_iterator AE = CS.arg_end();
GBE_ASSERT(AI != AE);
+ ir::AddressSpace addrSpace;
+
+ Value *llvmPtr = *AI;
+ Value *bti = getBtiRegister(llvmPtr);
+ Value *ptrBase = getPointerBase(llvmPtr);
+ ir::Register pointer = this->getRegister(llvmPtr);
+ ir::Register baseReg = this->getRegister(ptrBase);
+
+ ir::Register btiReg;
+ bool fixedBTI = false;
+ if (isa<ConstantInt>(bti)) {
+ fixedBTI = true;
+ unsigned index = cast<ConstantInt>(bti)->getZExtValue();
+ addrSpace = btiToGen(index);
+ ir::ImmediateIndex immIndex = ctx.newImmediate((uint32_t)index);
+ btiReg = ctx.reg(ir::FAMILY_DWORD);
+ ctx.LOADI(ir::TYPE_U32, btiReg, immIndex);
+ } else {
+ addrSpace = ir::MEM_MIXED;
+ btiReg = this->getRegister(bti);
+ }
+
+ const ir::RegisterFamily pointerFamily = ctx.getPointerFamily();
+ const ir::Register ptr = ctx.reg(pointerFamily);
+ ctx.SUB(ir::TYPE_U32, ptr, pointer, baseReg);
+
const ir::Register dst = this->getRegister(&I);
- ir::BTI bti;
- gatherBTI(&I, bti);
- const ir::AddressSpace addrSpace = btiToGen(bti);
- vector<ir::Register> src;
uint32_t srcNum = 0;
+ vector<ir::Register> src;
+ src.push_back(ptr);
+ srcNum++;
+ AI++;
+
while(AI != AE) {
src.push_back(this->getRegister(*(AI++)));
srcNum++;
}
const ir::Tuple srcTuple = ctx.arrayTuple(&src[0], srcNum);
- ctx.ATOMIC(opcode, dst, addrSpace, bti, srcTuple);
+ ctx.ATOMIC(opcode, dst, addrSpace, btiReg, fixedBTI, srcTuple);
}
/* append a new sampler. should be called before any reference to
@@ -3555,8 +3960,8 @@ namespace gbe
void GenWriter::emitBatchLoadOrStore(const ir::Type type, const uint32_t elemNum,
Value *llvmValues, const ir::Register ptr,
const ir::AddressSpace addrSpace,
- Type * elemType, bool isLoad, ir::BTI bti,
- bool dwAligned) {
+ Type * elemType, bool isLoad, ir::Register bti,
+ bool dwAligned, bool fixedBTI) {
const ir::RegisterFamily pointerFamily = ctx.getPointerFamily();
uint32_t totalSize = elemNum * getFamilySize(getFamily(type));
uint32_t msgNum = totalSize > 16 ? totalSize / 16 : 1;
@@ -3602,79 +4007,18 @@ namespace gbe
// Emit the instruction
if (isLoad)
- ctx.LOAD(type, tuple, addr, addrSpace, perMsgNum, dwAligned, bti);
+ ctx.LOAD(type, tuple, addr, addrSpace, perMsgNum, dwAligned, fixedBTI, bti);
else
- ctx.STORE(type, tuple, addr, addrSpace, perMsgNum, dwAligned, bti);
- }
- }
-
- // The idea behind is to search along the use-def chain, and find out all
- // possible sources of the pointer. Then in later codeGen, we can emit
- // read/store instructions to these BTIs gathered.
- void GenWriter::gatherBTI(Value *insn, ir::BTI &bti) {
- PtrOrigMapIter iter = pointerOrigMap.find(insn);
- if (iter != pointerOrigMap.end()) {
- SmallVectorImpl<Value *> &origins = iter->second;
- uint8_t nBTI = 0;
- for (unsigned i = 0; i < origins.size(); i++) {
- uint8_t new_bti = 0;
- Value *origin = origins[i];
- // all constant put into constant cache, including __constant & const __private
- if (isa<GlobalVariable>(origin)
- && dyn_cast<GlobalVariable>(origin)->isConstant()) {
- new_bti = BTI_CONSTANT;
- } else {
- unsigned space = origin->getType()->getPointerAddressSpace();
- switch (space) {
- case 0:
- new_bti = BTI_PRIVATE;
- break;
- case 1:
- {
- GlobalPtrIter iter = globalPointer.find(origin);
- GBE_ASSERT(iter != globalPointer.end());
- new_bti = iter->second;
- break;
- }
- case 2:
- new_bti = BTI_CONSTANT;
- break;
- case 3:
- new_bti = BTI_LOCAL;
- break;
- default:
- GBE_ASSERT(0 && "address space not unhandled in gatherBTI()\n");
- break;
- }
- }
-
- // avoid duplicate
- bool bFound = false;
- for (int j = 0; j < nBTI; j++) {
- if (bti.bti[j] == new_bti) {
- bFound = true; break;
- }
- }
- if (bFound == false) {
- bti.bti[nBTI++] = new_bti;
- bti.count = nBTI;
- }
- }
- } else {
- insn->dump();
- std::cerr << "Illegal pointer which is not from a valid memory space." << std::endl;
- std::cerr << "Aborting..." << std::endl;
- exit(-1);
+ ctx.STORE(type, tuple, addr, addrSpace, perMsgNum, dwAligned, fixedBTI, bti);
}
- GBE_ASSERT(bti.count <= MAX_MIXED_POINTER);
}
+
// handle load of dword/qword with unaligned address
- void GenWriter::emitUnalignedDQLoadStore(Value *llvmPtr, Value *llvmValues, ir::AddressSpace addrSpace, ir::BTI &binding, bool isLoad, bool dwAligned)
+ void GenWriter::emitUnalignedDQLoadStore(ir::Register ptr, Value *llvmValues, ir::AddressSpace addrSpace, ir::Register bti, bool isLoad, bool dwAligned, bool fixedBTI)
{
Type *llvmType = llvmValues->getType();
const ir::Type type = getType(ctx, llvmType);
unsigned byteSize = getTypeByteSize(unit, llvmType);
- const ir::Register ptr = this->getRegister(llvmPtr);
Type *elemType = llvmType;
unsigned elemNum = 1;
@@ -3704,13 +4048,13 @@ namespace gbe
const ir::Tuple byteTuple = ctx.arrayTuple(&byteTupleData[0], byteSize);
if (isLoad) {
- ctx.LOAD(ir::TYPE_U8, byteTuple, ptr, addrSpace, byteSize, dwAligned, binding);
+ ctx.LOAD(ir::TYPE_U8, byteTuple, ptr, addrSpace, byteSize, dwAligned, fixedBTI, bti);
ctx.BITCAST(type, ir::TYPE_U8, tuple, byteTuple, elemNum, byteSize);
} else {
ctx.BITCAST(ir::TYPE_U8, type, byteTuple, tuple, byteSize, elemNum);
// FIXME: byte scatter does not handle correctly vector store, after fix that,
// we can directly use on store instruction like:
- // ctx.STORE(ir::TYPE_U8, byteTuple, ptr, addrSpace, byteSize, dwAligned, binding);
+ // ctx.STORE(ir::TYPE_U8, byteTuple, ptr, addrSpace, byteSize, dwAligned, fixedBTI, bti);
const ir::RegisterFamily pointerFamily = ctx.getPointerFamily();
for (uint32_t elemID = 0; elemID < byteSize; elemID++) {
const ir::Register reg = byteTupleData[elemID];
@@ -3725,7 +4069,7 @@ namespace gbe
ctx.LOADI(ir::TYPE_S32, offset, immIndex);
ctx.ADD(ir::TYPE_S32, addr, ptr, offset);
}
- ctx.STORE(type, addr, addrSpace, dwAligned, binding, reg);
+ ctx.STORE(type, addr, addrSpace, dwAligned, fixedBTI, bti, reg);
}
}
}
@@ -3738,10 +4082,31 @@ namespace gbe
Value *llvmValues = getLoadOrStoreValue(I);
Type *llvmType = llvmValues->getType();
const bool dwAligned = (I.getAlignment() % 4) == 0;
- const ir::Register ptr = this->getRegister(llvmPtr);
- ir::BTI binding;
- gatherBTI(&I, binding);
- const ir::AddressSpace addrSpace = btiToGen(binding);
+ ir::AddressSpace addrSpace;
+ const ir::Register pointer = this->getRegister(llvmPtr);
+ const ir::RegisterFamily pointerFamily = ctx.getPointerFamily();
+
+ Value *bti = getBtiRegister(llvmPtr);
+ Value *ptrBase = getPointerBase(llvmPtr);
+ ir::Register baseReg = this->getRegister(ptrBase);
+ bool zeroBase = false;
+ if (isa<ConstantPointerNull>(ptrBase)) {
+ zeroBase = true;
+ }
+
+ ir::Register btiReg;
+ bool fixedBTI = false;
+ if (isa<ConstantInt>(bti)) {
+ fixedBTI = true;
+ unsigned index = cast<ConstantInt>(bti)->getZExtValue();
+ addrSpace = btiToGen(index);
+ ir::ImmediateIndex immIndex = ctx.newImmediate((uint32_t)index);
+ btiReg = ctx.reg(ir::FAMILY_DWORD);
+ ctx.LOADI(ir::TYPE_U32, btiReg, immIndex);
+ } else {
+ addrSpace = ir::MEM_MIXED;
+ btiReg = this->getRegister(bti);
+ }
Type *scalarType = llvmType;
if (!isScalarType(llvmType)) {
@@ -3749,11 +4114,20 @@ namespace gbe
scalarType = vectorType->getElementType();
}
+ ir::Register ptr = ctx.reg(pointerFamily);
+ // FIXME: avoiding the subtraction of zero at this stage is not a good idea,
+ // but the later ArgumentLower pass needs to match the exact load/addImm pattern,
+ // so I avoid subtracting a zero base to satisfy the ArgumentLower pass.
+ if (!zeroBase)
+ ctx.SUB(ir::TYPE_U32, ptr, pointer, baseReg);
+ else
+ ptr = pointer;
+
if (!dwAligned
&& (scalarType == IntegerType::get(I.getContext(), 64)
|| scalarType == IntegerType::get(I.getContext(), 32))
) {
- emitUnalignedDQLoadStore(llvmPtr, llvmValues, addrSpace, binding, isLoad, dwAligned);
+ emitUnalignedDQLoadStore(ptr, llvmValues, addrSpace, btiReg, isLoad, dwAligned, fixedBTI);
return;
}
// Scalar is easy. We neednot build register tuples
@@ -3761,9 +4135,9 @@ namespace gbe
const ir::Type type = getType(ctx, llvmType);
const ir::Register values = this->getRegister(llvmValues);
if (isLoad)
- ctx.LOAD(type, ptr, addrSpace, dwAligned, binding, values);
+ ctx.LOAD(type, ptr, addrSpace, dwAligned, fixedBTI, btiReg, values);
else
- ctx.STORE(type, ptr, addrSpace, dwAligned, binding, values);
+ ctx.STORE(type, ptr, addrSpace, dwAligned, fixedBTI, btiReg, values);
}
// A vector type requires to build a tuple
else {
@@ -3785,10 +4159,9 @@ namespace gbe
// The code is going to be fairly different from types to types (based on
// size of each vector element)
const ir::Type type = getType(ctx, elemType);
- const ir::RegisterFamily pointerFamily = ctx.getPointerFamily();
const ir::RegisterFamily dataFamily = getFamily(type);
- if(dataFamily == ir::FAMILY_DWORD && addrSpace != ir::MEM_CONSTANT && addrSpace != ir::MEM_MIXED) {
+ if(dataFamily == ir::FAMILY_DWORD && addrSpace != ir::MEM_CONSTANT) {
// One message is enough here. Nothing special to do
if (elemNum <= 4) {
// Build the tuple data in the vector
@@ -3807,19 +4180,19 @@ namespace gbe
// Emit the instruction
if (isLoad)
- ctx.LOAD(type, tuple, ptr, addrSpace, elemNum, dwAligned, binding);
+ ctx.LOAD(type, tuple, ptr, addrSpace, elemNum, dwAligned, fixedBTI, btiReg);
else
- ctx.STORE(type, tuple, ptr, addrSpace, elemNum, dwAligned, binding);
+ ctx.STORE(type, tuple, ptr, addrSpace, elemNum, dwAligned, fixedBTI, btiReg);
}
// Not supported by the hardware. So, we split the message and we use
// strided loads and stores
else {
- emitBatchLoadOrStore(type, elemNum, llvmValues, ptr, addrSpace, elemType, isLoad, binding, dwAligned);
+ emitBatchLoadOrStore(type, elemNum, llvmValues, ptr, addrSpace, elemType, isLoad, btiReg, dwAligned, fixedBTI);
}
}
else if((dataFamily == ir::FAMILY_WORD && (isLoad || elemNum % 2 == 0)) ||
(dataFamily == ir::FAMILY_BYTE && (isLoad || elemNum % 4 == 0))) {
- emitBatchLoadOrStore(type, elemNum, llvmValues, ptr, addrSpace, elemType, isLoad, binding, dwAligned);
+ emitBatchLoadOrStore(type, elemNum, llvmValues, ptr, addrSpace, elemType, isLoad, btiReg, dwAligned, fixedBTI);
} else {
for (uint32_t elemID = 0; elemID < elemNum; elemID++) {
if(regTranslator.isUndefConst(llvmValues, elemID))
@@ -3839,9 +4212,9 @@ namespace gbe
ctx.ADD(ir::TYPE_S32, addr, ptr, offset);
}
if (isLoad)
- ctx.LOAD(type, addr, addrSpace, dwAligned, binding, reg);
+ ctx.LOAD(type, addr, addrSpace, dwAligned, fixedBTI, btiReg, reg);
else
- ctx.STORE(type, addr, addrSpace, dwAligned, binding, reg);
+ ctx.STORE(type, addr, addrSpace, dwAligned, fixedBTI, btiReg, reg);
}
}
}
--
2.3.6
More information about the Beignet
mailing list