[Beignet] [PATCH 13/14] Backend: Add subgroup short block read/write
Xiuli Pan
xiuli.pan at intel.com
Wed Oct 12 08:56:43 UTC 2016
From: Pan Xiuli <xiuli.pan at intel.com>
Add Intel subgroup short mem block read/write and image block read/write;
also fix some old block read/write bugs.
Refine old uint block read/write with _ui suffix.
Signed-off-by: Pan Xiuli <xiuli.pan at intel.com>
---
backend/src/backend/gen_context.cpp | 190 +++++++++++++++++--------
backend/src/backend/gen_encoder.cpp | 26 +++-
backend/src/backend/gen_insn_selection.cpp | 37 +++--
backend/src/ir/instruction.cpp | 26 ++--
backend/src/ir/instruction.hpp | 6 +-
backend/src/libocl/tmpl/ocl_simd.tmpl.cl | 221 ++++++++++++++++++++++++-----
backend/src/libocl/tmpl/ocl_simd.tmpl.h | 48 ++++++-
backend/src/llvm/llvm_gen_backend.cpp | 125 +++++++++++-----
backend/src/llvm/llvm_gen_ocl_function.hxx | 50 ++++---
backend/src/llvm/llvm_scalarize.cpp | 42 ++++--
10 files changed, 573 insertions(+), 198 deletions(-)
diff --git a/backend/src/backend/gen_context.cpp b/backend/src/backend/gen_context.cpp
index a1ae5ea..6bb0f22 100644
--- a/backend/src/backend/gen_context.cpp
+++ b/backend/src/backend/gen_context.cpp
@@ -3501,12 +3501,14 @@ namespace gbe
}
void GenContext::emitOBReadInstruction(const SelectionInstruction &insn) {
- const GenRegister dst= GenRegister::retype(ra->genReg(insn.dst(1)), GEN_TYPE_UD);
+ const GenRegister dst= ra->genReg(insn.dst(1));
+ uint32_t type = dst.type;
+ uint32_t typesize = typeSize(type);
const GenRegister addr = GenRegister::toUniform(ra->genReg(insn.src(0)), GEN_TYPE_UD);
const GenRegister header = GenRegister::retype(ra->genReg(insn.dst(0)), GEN_TYPE_UD);
const GenRegister headeraddr = GenRegister::offset(header, 0, 2*4);
const uint32_t vec_size = insn.extra.elem;
- const GenRegister tmp = GenRegister::retype(ra->genReg(insn.dst(1 + vec_size)), GEN_TYPE_UD);
+ const GenRegister tmp = GenRegister::retype(ra->genReg(insn.dst(1 + vec_size)), type);
const uint32_t simdWidth = p->curr.execWidth;
// Make header
@@ -3532,7 +3534,7 @@ namespace gbe
{
p->curr.execWidth = 16;
p->curr.noMask = 1;
- p->OBREAD(dst, header, insn.getbti(), simdWidth / 4);
+ p->OBREAD(dst, header, insn.getbti(), simdWidth * typesize / 16);
}
p->pop();
} else if (vec_size == 2) {
@@ -3540,14 +3542,41 @@ namespace gbe
{
p->curr.execWidth = 16;
p->curr.noMask = 1;
- p->OBREAD(tmp, header, insn.getbti(), simdWidth / 2);
+ p->OBREAD(tmp, header, insn.getbti(), simdWidth * typesize / 8);
}
p->pop();
p->MOV(ra->genReg(insn.dst(1)), GenRegister::offset(tmp, 0));
- p->MOV(ra->genReg(insn.dst(2)), GenRegister::offset(tmp, simdWidth / 8));
- } else if (vec_size == 4 || vec_size == 8) {
+ p->MOV(ra->genReg(insn.dst(2)), GenRegister::offset(tmp, 0, simdWidth * typesize ));
+ } else if (vec_size == 4) {
if (simdWidth == 8) {
- for (uint32_t i = 0; i < vec_size / 4; i++) {
+ p->push();
+ {
+ p->curr.execWidth = 16;
+ p->curr.noMask = 1;
+ p->OBREAD(tmp, header, insn.getbti(), 2 * typesize);
+ }
+ p->pop();
+ for (uint32_t j = 0; j < 4; j++)
+ p->MOV(ra->genReg(insn.dst(1 + j)), GenRegister::offset(tmp, 0, j * simdWidth * typesize ));
+ } else {
+ for (uint32_t i = 0; i < typesize / 2; i++) {
+ if (i > 0) {
+ p->push();
+ {
+ // Update the address in header
+ p->curr.execWidth = 1;
+ p->ADD(headeraddr, headeraddr, GenRegister::immud(128));
+ }
+ p->pop();
+ }
+ p->OBREAD(tmp, header, insn.getbti(), 8);
+ for (uint32_t j = 0; j < 8 / typesize ; j++)
+ p->MOV(ra->genReg(insn.dst(1 + j + i * 2)), GenRegister::offset(tmp, 0 ,j * simdWidth * typesize ));
+ }
+ }
+ } else if (vec_size == 8) {
+ if (simdWidth == 8) {
+ for (uint32_t i = 0; i < typesize / 2; i++) {
if (i > 0) {
p->push();
{
@@ -3564,11 +3593,11 @@ namespace gbe
p->OBREAD(tmp, header, insn.getbti(), 8);
}
p->pop();
- for (uint32_t j = 0; j < 4; j++)
- p->MOV(ra->genReg(insn.dst(1 + j + i * 4)), GenRegister::offset(tmp, j));
+ for (uint32_t j = 0; j < 16 / typesize; j++)
+ p->MOV(ra->genReg(insn.dst(1 + j + i * 4)), GenRegister::offset(tmp, 0, j * simdWidth * typesize ));
}
} else {
- for (uint32_t i = 0; i < vec_size / 2; i++) {
+ for (uint32_t i = 0; i < typesize ; i++) {
if (i > 0) {
p->push();
{
@@ -3579,8 +3608,8 @@ namespace gbe
p->pop();
}
p->OBREAD(tmp, header, insn.getbti(), 8);
- for (uint32_t j = 0; j < 2; j++)
- p->MOV(ra->genReg(insn.dst(1 + j + i * 2)), GenRegister::offset(tmp, j*2));
+ for (uint32_t j = 0; j < 8 / typesize; j++)
+ p->MOV(ra->genReg(insn.dst(1 + j + i * 8 / typesize)), GenRegister::offset(tmp, 0 ,j * simdWidth * typesize ));
}
}
} else NOT_SUPPORTED;
@@ -3590,6 +3619,8 @@ namespace gbe
const GenRegister addr = GenRegister::toUniform(ra->genReg(insn.src(0)), GEN_TYPE_UD);
const GenRegister header = GenRegister::retype(ra->genReg(insn.dst(0)), GEN_TYPE_UD);
const GenRegister headeraddr = GenRegister::offset(header, 0, 2*4);
+ uint32_t type = ra->genReg(insn.src(1)).type;
+ uint32_t typesize = typeSize(type);
const uint32_t vec_size = insn.extra.elem;
const GenRegister tmp = GenRegister::offset(header, 1);
const uint32_t simdWidth = p->curr.execWidth;
@@ -3613,29 +3644,56 @@ namespace gbe
p->pop();
// Now write the data, oword block write can only work with simd16 and no mask
if (vec_size == 1) {
- p->MOV(tmp, ra->genReg(insn.src(1)));
+ p->MOV(GenRegister::retype(tmp, type), ra->genReg(insn.src(1)));
p->push();
{
p->curr.execWidth = 16;
p->curr.noMask = 1;
- p->OBWRITE(header, insn.getbti(), simdWidth / 4);
+ p->OBWRITE(header, insn.getbti(), simdWidth * typesize / 16);
}
p->pop();
} else if (vec_size == 2) {
- p->MOV(GenRegister::offset(tmp, 0), ra->genReg(insn.src(1))) ;
- p->MOV(GenRegister::offset(tmp, simdWidth / 8), ra->genReg(insn.src(2))) ;
+ p->MOV(GenRegister::retype(GenRegister::offset(tmp, 0, 0), type), ra->genReg(insn.src(1)));
+ p->MOV(GenRegister::retype(GenRegister::offset(tmp, 0, simdWidth * typesize), type), ra->genReg(insn.src(2)));
p->push();
{
p->curr.execWidth = 16;
p->curr.noMask = 1;
- p->OBWRITE(header, insn.getbti(), simdWidth / 2);
+ p->OBWRITE(header, insn.getbti(), simdWidth * typesize / 8);
}
p->pop();
- } else if (vec_size == 4 || vec_size == 8) {
+ } else if (vec_size == 4) {
if (simdWidth == 8) {
- for (uint32_t i = 0; i < vec_size / 4; i++) {
- for (uint32_t j = 0; j < 4; j++)
- p->MOV(GenRegister::offset(tmp, j), ra->genReg(insn.src(1 + j + i*4))) ;
+ for (uint32_t i = 0; i < 4; i++)
+ p->MOV(GenRegister::retype(GenRegister::offset(tmp, 0, i * simdWidth * typesize), type), ra->genReg(insn.src(1 + i)));
+ p->push();
+ {
+ p->curr.execWidth = 16;
+ p->curr.noMask = 1;
+ p->OBWRITE(header, insn.getbti(), 2 * typesize);
+ }
+ p->pop();
+ } else {
+ for (uint32_t i = 0; i < typesize / 2; i++) {
+ for (uint32_t j = 0; j < 8 / typesize; j++)
+ p->MOV(GenRegister::retype(GenRegister::offset(tmp, 0, j * simdWidth * typesize), type), ra->genReg(insn.src(1 + j + i * 8 / typesize)));
+ if (i > 0) {
+ p->push();
+ {
+ // Update the address in header
+ p->curr.execWidth = 1;
+ p->ADD(headeraddr, headeraddr, GenRegister::immud(8));
+ }
+ p->pop();
+ }
+ p->OBWRITE(header, insn.getbti(), 8);
+ }
+ }
+ } else if (vec_size == 8) {
+ if (simdWidth == 8) {
+ for (uint32_t i = 0; i < typesize / 2; i++) {
+ for (uint32_t j = 0; j < 16 / typesize; j++)
+ p->MOV(GenRegister::retype(GenRegister::offset(tmp, 0, j * simdWidth * typesize), type), ra->genReg(insn.src(1 + j + i * 16 / typesize)));
if (i > 0) {
p->push();
{
@@ -3654,9 +3712,9 @@ namespace gbe
p->pop();
}
} else {
- for (uint32_t i = 0; i < vec_size / 2; i++) {
- for (uint32_t j = 0; j < 2; j++)
- p->MOV(GenRegister::offset(tmp, j * 2), ra->genReg(insn.src(1 + j + i*2))) ;
+ for (uint32_t i = 0; i < typesize; i++) {
+ for (uint32_t j = 0; j < 8 / typesize; j++)
+ p->MOV(GenRegister::retype(GenRegister::offset(tmp, 0, j * simdWidth * typesize), type), ra->genReg(insn.src(1 + j + i * 8 / typesize)));
if (i > 0) {
p->push();
{
@@ -3682,7 +3740,10 @@ namespace gbe
const GenRegister offsety = GenRegister::offset(header, 0, 1*4);
const GenRegister blocksizereg = GenRegister::offset(header, 0, 2*4);
size_t vec_size = insn.extra.elem;
- uint32_t blocksize = 0x1F | (vec_size-1) << 16;
+ uint32_t type = dst.type;
+ uint32_t typesize = typeSize(type);
+ uint32_t block_width = typesize * simdWidth;
+ uint32_t blocksize = (block_width - 1) % 32 | (vec_size - 1) << 16;
if (simdWidth == 8)
{
@@ -3699,9 +3760,12 @@ namespace gbe
p->MOV(offsety, coordy);
// Update block width and height
p->MOV(blocksizereg, GenRegister::immud(blocksize));
- // Now read the data
p->curr.execWidth = 8;
- p->MBREAD(dst, header, insn.getbti(), vec_size);
+ // ushort in simd8 will have half reg, but response length is still 1
+ uint32_t rsize = vec_size * typesize / 4;
+ rsize = rsize ? rsize : 1;
+ // Now read the data
+ p->MBREAD(dst, header, insn.getbti(), rsize);
p->pop();
}
@@ -3726,21 +3790,24 @@ namespace gbe
p->curr.execWidth = 8;
p->MBREAD(tmp, header, insn.getbti(), vec_size);
for (uint32_t i = 0; i < vec_size; i++)
- p->MOV(ra->genReg(insn.dst(i + 1)), GenRegister::offset(tmp, i));
-
- // Second half
- // Update the header with the coord
- p->curr.execWidth = 1;
- p->ADD(offsetx, offsetx, GenRegister::immud(32));
-
- // Now read the data
- p->curr.execWidth = 8;
- p->MBREAD(tmp, header, insn.getbti(), vec_size);
+ p->MOV(GenRegister::retype(ra->genReg(insn.dst(i + 1)),GEN_TYPE_UD), GenRegister::offset(tmp, i));
- // Move the reg to fit vector rule.
- for (uint32_t i = 0; i < vec_size; i++)
- p->MOV(GenRegister::offset(ra->genReg(insn.dst(i + 1)), 1),
- GenRegister::offset(tmp, i));
+ if (typesize == 4)
+ {
+ // Second half
+ // Update the header with the coord
+ p->curr.execWidth = 1;
+ p->ADD(offsetx, offsetx, GenRegister::immud(32));
+
+ // Now read the data
+ p->curr.execWidth = 8;
+ p->MBREAD(tmp, header, insn.getbti(), vec_size);
+
+ // Move the reg to fit vector rule.
+ for (uint32_t i = 0; i < vec_size; i++)
+ p->MOV(GenRegister::offset(ra->genReg(insn.dst(i + 1)), 1),
+ GenRegister::offset(tmp, i));
+ }
p->pop();
} else NOT_IMPLEMENTED;
}
@@ -3749,9 +3816,13 @@ namespace gbe
const GenRegister coordx = GenRegister::toUniform(ra->genReg(insn.src(0)), GEN_TYPE_D);
const GenRegister coordy = GenRegister::toUniform(ra->genReg(insn.src(1)), GEN_TYPE_D);
const GenRegister header = GenRegister::retype(ra->genReg(insn.dst(0)), GEN_TYPE_UD);
+ const GenRegister tmp = GenRegister::offset(header, 1);
GenRegister offsetx, offsety, blocksizereg;
size_t vec_size = insn.extra.elem;
- uint32_t blocksize = 0x1F | (vec_size-1) << 16;
+ uint32_t type = ra->genReg(insn.src(2)).type;
+ uint32_t typesize = typeSize(type);
+ uint32_t block_width = typesize * simdWidth;
+ uint32_t blocksize = (block_width - 1) % 32 | (vec_size-1) << 16;
offsetx = GenRegister::offset(header, 0, 0*4);
offsety = GenRegister::offset(header, 0, 1*4);
@@ -3775,9 +3846,13 @@ namespace gbe
p->curr.execWidth = 8;
// Mov what we need into msgs
for(uint32_t i = 0; i < vec_size; i++)
- p->MOV(GenRegister::offset(header, 1 + i), ra->genReg(insn.src(2 + i)));
+ p->MOV(GenRegister::retype(GenRegister::offset(tmp, 0, i * block_width), type),
+ ra->genReg(insn.src(2 + i)));
+ // ushort in simd8 will have half reg, but response length is still 1
+ uint32_t rsize = vec_size * typesize / 4;
+ rsize = rsize ? rsize : 1;
// Now read the data
- p->MBWRITE(header, insn.getbti(), vec_size);
+ p->MBWRITE(header, insn.getbti(), rsize);
p->pop();
}
@@ -3801,20 +3876,23 @@ namespace gbe
p->curr.execWidth = 8;
// Mov what we need into msgs
for(uint32_t i = 0; i < vec_size; i++)
- p->MOV(GenRegister::offset(header, 1 + i), ra->genReg(insn.src(2 + i)));
+ p->MOV(GenRegister::offset(tmp, i), GenRegister::retype(ra->genReg(insn.src(2 + i)), GEN_TYPE_UD));
p->MBWRITE(header, insn.getbti(), vec_size);
- // Second half
- // Update the header with the coord
- p->curr.execWidth = 1;
- p->ADD(offsetx, offsetx, GenRegister::immud(32));
-
- p->curr.execWidth = 8;
- // Mov what we need into msgs
- for(uint32_t i = 0; i < vec_size; i++)
- p->MOV(GenRegister::offset(header, 1 + i), GenRegister::Qn(ra->genReg(insn.src(2 + i)), 1));
- // Now write the data
- p->MBWRITE(header, insn.getbti(), vec_size);
+ if (typesize == 4)
+ {
+ // Second half
+ // Update the header with the coord
+ p->curr.execWidth = 1;
+ p->ADD(offsetx, offsetx, GenRegister::immud(32));
+
+ p->curr.execWidth = 8;
+ // Mov what we need into msgs
+ for(uint32_t i = 0; i < vec_size; i++)
+ p->MOV(GenRegister::offset(header, 1 + i), GenRegister::Qn(ra->genReg(insn.src(2 + i)), 1));
+ // Now write the data
+ p->MBWRITE(header, insn.getbti(), vec_size);
+ }
p->pop();
}
diff --git a/backend/src/backend/gen_encoder.cpp b/backend/src/backend/gen_encoder.cpp
index 975e1c7..a6f8db8 100644
--- a/backend/src/backend/gen_encoder.cpp
+++ b/backend/src/backend/gen_encoder.cpp
@@ -269,10 +269,10 @@ namespace gbe
{
const GenMessageTarget sfid = GEN_SFID_DATAPORT_DATA;
p->setMessageDescriptor(insn, sfid, msg_length, response_length);
- assert(size == 2 || size == 4 || size == 8);
+ assert(size == 0 || size == 1 || size == 2 || size == 4 || size == 8);
insn->bits3.gen7_oblock_rw.msg_type = msg_type;
insn->bits3.gen7_oblock_rw.bti = bti;
- insn->bits3.gen7_oblock_rw.block_size = size == 2 ? 2 : (size == 4 ? 3 : 4);
+ insn->bits3.gen7_oblock_rw.block_size = size <= 2 ? size : (size == 4 ? 3 : 4);
insn->bits3.gen7_oblock_rw.header_present = 1;
}
@@ -1261,7 +1261,17 @@ namespace gbe
void GenEncoder::OBREAD(GenRegister dst, GenRegister header, uint32_t bti, uint32_t size) {
GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
const uint32_t msg_length = 1;
- const uint32_t response_length = size / 2; // Size is in regs
+ uint32_t rsize = size / 2;
+ uint32_t msgsize = size;
+ // When size is 1 OWord, which means half a reg, we need to know which half to use
+ if (size == 1) {
+ if (dst.subnr == 0)
+ msgsize = 0;
+ else
+ msgsize = 1;
+ }
+ rsize = rsize == 0 ? 1 : rsize;
+ const uint32_t response_length = rsize; // Size is in regs
this->setHeader(insn);
this->setDst(insn, GenRegister::uw16grf(dst.nr, 0));
this->setSrc0(insn, GenRegister::ud8grf(header.nr, 0));
@@ -1269,7 +1279,7 @@ namespace gbe
setOBlockRW(this,
insn,
bti,
- size,
+ msgsize,
GEN7_UNALIGNED_OBLOCK_READ,
msg_length,
response_length);
@@ -1277,8 +1287,12 @@ namespace gbe
void GenEncoder::OBWRITE(GenRegister header, uint32_t bti, uint32_t size) {
GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
- const uint32_t msg_length = 1 + size / 2; // Size is in owords
+ uint32_t rsize = size / 2;
+ rsize = rsize == 0 ? 1 : rsize;
+ const uint32_t msg_length = 1 + rsize; // Size is in owords
const uint32_t response_length = 0;
+ uint32_t msgsize = size;
+ msgsize = msgsize == 1 ? 0 : msgsize;
this->setHeader(insn);
this->setSrc0(insn, GenRegister::ud8grf(header.nr, 0));
this->setSrc1(insn, GenRegister::immud(0));
@@ -1286,7 +1300,7 @@ namespace gbe
setOBlockRW(this,
insn,
bti,
- size,
+ msgsize,
GEN7_OBLOCK_WRITE,
msg_length,
response_length);
diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp
index d506d96..475cad8 100644
--- a/backend/src/backend/gen_insn_selection.cpp
+++ b/backend/src/backend/gen_insn_selection.cpp
@@ -2089,7 +2089,6 @@ namespace gbe
uint32_t simdWidth = curr.execWidth;
SelectionInstruction *insn = this->appendInsn(SEL_OP_MBREAD, vec_size * simdWidth / 8 + 1, 2);
-
insn->dst(0) = header;
for (uint32_t i = 0; i < vec_size; ++i) {
insn->dst(i + 1) = dsts[i];
@@ -4147,16 +4146,19 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp
using namespace ir;
const uint32_t vec_size = insn.getValueNum();
const uint32_t simdWidth = sel.ctx.getSimdWidth();
- const GenRegister header = GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY_DWORD)), TYPE_U32);
+ const Type type = insn.getValueType();
+ const uint32_t typeSize = type == TYPE_U32 ? 4 : 2;
+ const GenRegister header = GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY_DWORD)), GEN_TYPE_UD);
vector<GenRegister> valuesVec;
for(uint32_t i = 0; i < vec_size; i++)
- valuesVec.push_back(sel.selReg(insn.getValue(i), TYPE_U32));
+ valuesVec.push_back(sel.selReg(insn.getValue(i), type));
// check tmp_size for OWORD read need, max 8 OWROD thus 4 regs
- uint32_t tmp_size = simdWidth * vec_size / 8;
+ uint32_t tmp_size = simdWidth * vec_size * typeSize / 32;
+ tmp_size = tmp_size == 0 ? 1 : tmp_size;
tmp_size = tmp_size > 4 ? 4 : tmp_size;
vector<GenRegister> tmpVec;
for(uint32_t i = 0; i < tmp_size; i++)
- tmpVec.push_back(GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY_DWORD)), TYPE_U32));
+ tmpVec.push_back(GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY_DWORD)), GEN_TYPE_UD));
sel.OBREAD(&valuesVec[0], vec_size, address, header, bti.imm, &tmpVec[0], tmp_size);
}
@@ -4332,16 +4334,19 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp
using namespace ir;
const uint32_t vec_size = insn.getValueNum();
const uint32_t simdWidth = sel.ctx.getSimdWidth();
- const GenRegister header = GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY_DWORD)), TYPE_U32);
+ const Type type = insn.getValueType();
+ const uint32_t typeSize = type == TYPE_U32 ? 4 : 2;
+ const GenRegister header = GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY_DWORD)), GEN_TYPE_UD);
vector<GenRegister> valuesVec;
for(uint32_t i = 0; i < vec_size; i++)
- valuesVec.push_back(sel.selReg(insn.getValue(i), TYPE_U32));
+ valuesVec.push_back(sel.selReg(insn.getValue(i), type));
// check tmp_size for OWORD write need, max 8 OWROD thus 4 regs
- uint32_t tmp_size = simdWidth * vec_size / 8;
+ uint32_t tmp_size = simdWidth * vec_size * typeSize / 32;
+ tmp_size = tmp_size == 0 ? 1 : tmp_size;
tmp_size = tmp_size > 4 ? 4 : tmp_size;
vector<GenRegister> tmpVec;
for(uint32_t i = 0; i < tmp_size; i++)
- tmpVec.push_back(GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY_DWORD)), TYPE_U32));
+ tmpVec.push_back(GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY_DWORD)), GEN_TYPE_UD));
sel.OBWRITE(address, &valuesVec[0], vec_size, header, bti.imm, &tmpVec[0], tmp_size);
}
@@ -6703,16 +6708,17 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp
using namespace ir;
uint32_t vec_size = insn.getVectorSize();
uint32_t simdWidth = sel.curr.execWidth;
+ const Type type = insn.getType();
vector<GenRegister> valuesVec;
vector<GenRegister> tmpVec;
for (uint32_t i = 0; i < vec_size; ++i) {
- valuesVec.push_back(sel.selReg(insn.getDst(i), TYPE_U32));
+ valuesVec.push_back(sel.selReg(insn.getDst(i), type));
if(simdWidth == 16)
- tmpVec.push_back(GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY_DWORD)), TYPE_U32));
+ tmpVec.push_back(GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY_DWORD)), GEN_TYPE_UD));
}
const GenRegister coordx = sel.selReg(insn.getSrc(0), TYPE_U32);
const GenRegister coordy = sel.selReg(insn.getSrc(1), TYPE_U32);
- const GenRegister header = GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY_DWORD)), TYPE_U32);
+ const GenRegister header = GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY_DWORD)), GEN_TYPE_UD);
GenRegister *tmp = NULL;
if(simdWidth == 16)
tmp = &tmpVec[0];
@@ -6729,16 +6735,17 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp
{
using namespace ir;
uint32_t vec_size = insn.getVectorSize();
+ const Type type = insn.getType();
const GenRegister coordx = sel.selReg(insn.getSrc(0), TYPE_U32);
const GenRegister coordy = sel.selReg(insn.getSrc(1), TYPE_U32);
vector<GenRegister> valuesVec;
vector<GenRegister> tmpVec;
for(uint32_t i = 0; i < vec_size; i++)
{
- valuesVec.push_back(sel.selReg(insn.getSrc(2 + i), TYPE_U32));
- tmpVec.push_back(GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY_DWORD)), TYPE_U32));
+ valuesVec.push_back(sel.selReg(insn.getSrc(2 + i), type));
+ tmpVec.push_back(GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY_DWORD)), GEN_TYPE_UD));
}
- const GenRegister header = GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY_DWORD)), TYPE_U32);
+ const GenRegister header = GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY_DWORD)), GEN_TYPE_UD);
sel.MBWRITE(coordx, coordy, &valuesVec[0], header, &tmpVec[0], insn.getImageIndex(), vec_size);
return true;
}
diff --git a/backend/src/ir/instruction.cpp b/backend/src/ir/instruction.cpp
index 08a94cd..512055c 100644
--- a/backend/src/ir/instruction.cpp
+++ b/backend/src/ir/instruction.cpp
@@ -1070,18 +1070,20 @@ namespace ir {
public TupleDstPolicy<MediaBlockReadInstruction>
{
public:
- INLINE MediaBlockReadInstruction(uint8_t imageIdx, Tuple dst, uint8_t vec_size, Tuple srcTuple, uint8_t srcNum) {
+ INLINE MediaBlockReadInstruction(uint8_t imageIdx, Tuple dst, uint8_t vec_size, Tuple srcTuple, uint8_t srcNum, Type type) {
this->opcode = OP_MBREAD;
this->dst = dst;
this->dstNum = vec_size;
this->src = srcTuple;
this->srcNum = srcNum;
this->imageIdx = imageIdx;
+ this->type = type;
}
INLINE bool wellFormed(const Function &fn, std::string &why) const;
INLINE void out(std::ostream &out, const Function &fn) const {
this->outOpcode(out);
- out << (int)this->getVectorSize();
+ out << "." << type << "."
+ << (int)this->getVectorSize();
out << " {";
for (uint32_t i = 0; i < dstNum; ++i)
out << "%" << this->getDst(fn, i) << (i != (dstNum-1u) ? " " : "");
@@ -1092,12 +1094,14 @@ namespace ir {
}
INLINE uint8_t getImageIndex(void) const { return this->imageIdx; }
INLINE uint8_t getVectorSize(void) const { return this->dstNum; }
+ INLINE Type getType(void) const { return this->type; }
Tuple src;
Tuple dst;
uint8_t imageIdx;
uint8_t srcNum;
uint8_t dstNum;
+ Type type;
};
class ALIGNED_INSTRUCTION MediaBlockWriteInstruction :
@@ -1107,17 +1111,19 @@ namespace ir {
{
public:
- INLINE MediaBlockWriteInstruction(uint8_t imageIdx, Tuple srcTuple, uint8_t srcNum, uint8_t vec_size) {
+ INLINE MediaBlockWriteInstruction(uint8_t imageIdx, Tuple srcTuple, uint8_t srcNum, uint8_t vec_size, Type type) {
this->opcode = OP_MBWRITE;
this->src = srcTuple;
this->srcNum = srcNum;
this->imageIdx = imageIdx;
this->vec_size = vec_size;
+ this->type = type;
}
INLINE bool wellFormed(const Function &fn, std::string &why) const;
INLINE void out(std::ostream &out, const Function &fn) const {
this->outOpcode(out);
- out << (int)this->getVectorSize()
+ out << "." << type << "."
+ << (int)this->getVectorSize()
<< " 2D surface id " << (int)this->getImageIndex()
<< " byte coord x %" << this->getSrc(fn, 0)
<< " row coord y %" << this->getSrc(fn, 1);
@@ -1128,12 +1134,14 @@ namespace ir {
}
INLINE uint8_t getImageIndex(void) const { return this->imageIdx; }
INLINE uint8_t getVectorSize(void) const { return this->vec_size; }
+ INLINE Type getType(void) const { return this->type; }
Tuple src;
Register dst[0];
uint8_t imageIdx;
uint8_t srcNum;
uint8_t vec_size;
+ Type type;
};
#undef ALIGNED_INSTRUCTION
@@ -2375,8 +2383,10 @@ DECL_MEM_FN(PrintfInstruction, uint32_t, getBti(void), getBti())
DECL_MEM_FN(PrintfInstruction, Type, getType(const Function& fn, uint32_t ID), getType(fn, ID))
DECL_MEM_FN(MediaBlockReadInstruction, uint8_t, getImageIndex(void), getImageIndex())
DECL_MEM_FN(MediaBlockReadInstruction, uint8_t, getVectorSize(void), getVectorSize())
+DECL_MEM_FN(MediaBlockReadInstruction, Type, getType(void), getType())
DECL_MEM_FN(MediaBlockWriteInstruction, uint8_t, getImageIndex(void), getImageIndex())
DECL_MEM_FN(MediaBlockWriteInstruction, uint8_t, getVectorSize(void), getVectorSize())
+DECL_MEM_FN(MediaBlockWriteInstruction, Type, getType(void), getType())
#undef DECL_MEM_FN
@@ -2684,12 +2694,12 @@ DECL_MEM_FN(MemInstruction, void, setBtiReg(Register reg), setBtiReg(reg))
return internal::PrintfInstruction(dst, srcTuple, typeTuple, srcNum, bti, num).convert();
}
- Instruction MBREAD(uint8_t imageIndex, Tuple dst, uint8_t vec_size, Tuple coord, uint8_t srcNum) {
- return internal::MediaBlockReadInstruction(imageIndex, dst, vec_size, coord, srcNum).convert();
+ Instruction MBREAD(uint8_t imageIndex, Tuple dst, uint8_t vec_size, Tuple coord, uint8_t srcNum, Type type) {
+ return internal::MediaBlockReadInstruction(imageIndex, dst, vec_size, coord, srcNum, type).convert();
}
- Instruction MBWRITE(uint8_t imageIndex, Tuple srcTuple, uint8_t srcNum, uint8_t vec_size) {
- return internal::MediaBlockWriteInstruction(imageIndex, srcTuple, srcNum, vec_size).convert();
+ Instruction MBWRITE(uint8_t imageIndex, Tuple srcTuple, uint8_t srcNum, uint8_t vec_size, Type type) {
+ return internal::MediaBlockWriteInstruction(imageIndex, srcTuple, srcNum, vec_size, type).convert();
}
diff --git a/backend/src/ir/instruction.hpp b/backend/src/ir/instruction.hpp
index b2b0b49..98cead1 100644
--- a/backend/src/ir/instruction.hpp
+++ b/backend/src/ir/instruction.hpp
@@ -642,6 +642,7 @@ namespace ir {
static bool isClassOf(const Instruction &insn);
uint8_t getImageIndex() const;
uint8_t getVectorSize() const;
+ Type getType(void) const;
};
/*! Media Block Write. */
@@ -651,6 +652,7 @@ namespace ir {
static bool isClassOf(const Instruction &insn);
uint8_t getImageIndex() const;
uint8_t getVectorSize() const;
+ Type getType(void) const;
};
/*! Specialize the instruction. Also performs typechecking first based on the
@@ -886,9 +888,9 @@ namespace ir {
/*! printf */
Instruction PRINTF(Register dst, Tuple srcTuple, Tuple typeTuple, uint8_t srcNum, uint8_t bti, uint16_t num);
/*! media block read */
- Instruction MBREAD(uint8_t imageIndex, Tuple dst, uint8_t vec_size, Tuple coord, uint8_t srcNum);
+ Instruction MBREAD(uint8_t imageIndex, Tuple dst, uint8_t vec_size, Tuple coord, uint8_t srcNum, Type type);
/*! media block write */
- Instruction MBWRITE(uint8_t imageIndex, Tuple srcTuple, uint8_t srcNum, uint8_t vec_size);
+ Instruction MBWRITE(uint8_t imageIndex, Tuple srcTuple, uint8_t srcNum, uint8_t vec_size, Type type);
} /* namespace ir */
} /* namespace gbe */
diff --git a/backend/src/libocl/tmpl/ocl_simd.tmpl.cl b/backend/src/libocl/tmpl/ocl_simd.tmpl.cl
index 9023107..97e33fe 100644
--- a/backend/src/libocl/tmpl/ocl_simd.tmpl.cl
+++ b/backend/src/libocl/tmpl/ocl_simd.tmpl.cl
@@ -188,90 +188,237 @@ INTEL_RANGE_OP(scan_exclusive, max, short, true)
INTEL_RANGE_OP(scan_exclusive, max, ushort, false)
#undef INTEL_RANGE_OP
-PURE CONST uint __gen_ocl_sub_group_block_read_mem(const global uint* p);
-PURE CONST uint2 __gen_ocl_sub_group_block_read_mem2(const global uint* p);
-PURE CONST uint4 __gen_ocl_sub_group_block_read_mem4(const global uint* p);
-PURE CONST uint8 __gen_ocl_sub_group_block_read_mem8(const global uint* p);
+PURE CONST uint __gen_ocl_sub_group_block_read_ui_mem(const global uint* p);
+PURE CONST uint2 __gen_ocl_sub_group_block_read_ui_mem2(const global uint* p);
+PURE CONST uint4 __gen_ocl_sub_group_block_read_ui_mem4(const global uint* p);
+PURE CONST uint8 __gen_ocl_sub_group_block_read_ui_mem8(const global uint* p);
OVERLOADABLE uint intel_sub_group_block_read(const global uint* p)
{
- return __gen_ocl_sub_group_block_read_mem(p);
+ return __gen_ocl_sub_group_block_read_ui_mem(p);
}
OVERLOADABLE uint2 intel_sub_group_block_read2(const global uint* p)
{
- return __gen_ocl_sub_group_block_read_mem2(p);
+ return __gen_ocl_sub_group_block_read_ui_mem2(p);
}
OVERLOADABLE uint4 intel_sub_group_block_read4(const global uint* p)
{
- return __gen_ocl_sub_group_block_read_mem4(p);
-
+ return __gen_ocl_sub_group_block_read_ui_mem4(p);
}
OVERLOADABLE uint8 intel_sub_group_block_read8(const global uint* p)
{
- return __gen_ocl_sub_group_block_read_mem8(p);
+ return __gen_ocl_sub_group_block_read_ui_mem8(p);
+}
+OVERLOADABLE uint intel_sub_group_block_read_ui(const global uint* p)
+{
+ return __gen_ocl_sub_group_block_read_ui_mem(p);
}
-void __gen_ocl_sub_group_block_write_mem(const global uint* p, uint data);
-void __gen_ocl_sub_group_block_write_mem2(const global uint* p, uint2 data);
-void __gen_ocl_sub_group_block_write_mem4(const global uint* p, uint4 data);
-void __gen_ocl_sub_group_block_write_mem8(const global uint* p, uint8 data);
-OVERLOADABLE void intel_sub_group_block_write(const global uint* p, uint data)
+OVERLOADABLE uint2 intel_sub_group_block_read_ui2(const global uint* p)
{
- __gen_ocl_sub_group_block_write_mem(p, data);
+ return __gen_ocl_sub_group_block_read_ui_mem2(p);
}
-OVERLOADABLE void intel_sub_group_block_write2(const global uint* p, uint2 data)
+OVERLOADABLE uint4 intel_sub_group_block_read_ui4(const global uint* p)
{
- __gen_ocl_sub_group_block_write_mem2(p, data);
+ return __gen_ocl_sub_group_block_read_ui_mem4(p);
}
-OVERLOADABLE void intel_sub_group_block_write4(const global uint* p,uint4 data)
+OVERLOADABLE uint8 intel_sub_group_block_read_ui8(const global uint* p)
{
- __gen_ocl_sub_group_block_write_mem4(p, data);
+ return __gen_ocl_sub_group_block_read_ui_mem8(p);
+}
+void __gen_ocl_sub_group_block_write_ui_mem(global uint* p, uint data);
+void __gen_ocl_sub_group_block_write_ui_mem2(global uint* p, uint2 data);
+void __gen_ocl_sub_group_block_write_ui_mem4(global uint* p, uint4 data);
+void __gen_ocl_sub_group_block_write_ui_mem8(global uint* p, uint8 data);
+OVERLOADABLE void intel_sub_group_block_write(global uint* p, uint data)
+{
+ __gen_ocl_sub_group_block_write_ui_mem(p, data);
+}
+OVERLOADABLE void intel_sub_group_block_write2(global uint* p, uint2 data)
+{
+ __gen_ocl_sub_group_block_write_ui_mem2(p, data);
+}
+OVERLOADABLE void intel_sub_group_block_write4(global uint* p,uint4 data)
+{
+ __gen_ocl_sub_group_block_write_ui_mem4(p, data);
}
-OVERLOADABLE void intel_sub_group_block_write8(const global uint* p,uint8 data)
+OVERLOADABLE void intel_sub_group_block_write8(global uint* p,uint8 data)
{
- __gen_ocl_sub_group_block_write_mem8(p, data);
+ __gen_ocl_sub_group_block_write_ui_mem8(p, data);
+}
+OVERLOADABLE void intel_sub_group_block_write_ui(global uint* p, uint data)
+{
+ __gen_ocl_sub_group_block_write_ui_mem(p, data);
+}
+OVERLOADABLE void intel_sub_group_block_write_ui2(global uint* p, uint2 data)
+{
+ __gen_ocl_sub_group_block_write_ui_mem2(p, data);
+}
+OVERLOADABLE void intel_sub_group_block_write_ui4(global uint* p,uint4 data)
+{
+ __gen_ocl_sub_group_block_write_ui_mem4(p, data);
+}
+OVERLOADABLE void intel_sub_group_block_write_ui8(global uint* p,uint8 data)
+{
+ __gen_ocl_sub_group_block_write_ui_mem8(p, data);
}
-PURE CONST uint __gen_ocl_sub_group_block_read_image(image2d_t p, int x, int y);
-PURE CONST uint2 __gen_ocl_sub_group_block_read_image2(image2d_t p, int x, int y);
-PURE CONST uint4 __gen_ocl_sub_group_block_read_image4(image2d_t p, int x, int y);
-PURE CONST uint8 __gen_ocl_sub_group_block_read_image8(image2d_t p, int x, int y);
+PURE CONST uint __gen_ocl_sub_group_block_read_ui_image(image2d_t p, int x, int y);
+PURE CONST uint2 __gen_ocl_sub_group_block_read_ui_image2(image2d_t p, int x, int y);
+PURE CONST uint4 __gen_ocl_sub_group_block_read_ui_image4(image2d_t p, int x, int y);
+PURE CONST uint8 __gen_ocl_sub_group_block_read_ui_image8(image2d_t p, int x, int y);
OVERLOADABLE uint intel_sub_group_block_read(image2d_t p, int2 cord)
{
- return __gen_ocl_sub_group_block_read_image(p, cord.x, cord.y);
+ return __gen_ocl_sub_group_block_read_ui_image(p, cord.x, cord.y);
}
OVERLOADABLE uint2 intel_sub_group_block_read2(image2d_t p, int2 cord)
{
- return __gen_ocl_sub_group_block_read_image2(p, cord.x, cord.y);
+ return __gen_ocl_sub_group_block_read_ui_image2(p, cord.x, cord.y);
}
OVERLOADABLE uint4 intel_sub_group_block_read4(image2d_t p, int2 cord)
{
- return __gen_ocl_sub_group_block_read_image4(p, cord.x, cord.y);
+ return __gen_ocl_sub_group_block_read_ui_image4(p, cord.x, cord.y);
}
OVERLOADABLE uint8 intel_sub_group_block_read8(image2d_t p, int2 cord)
{
- return __gen_ocl_sub_group_block_read_image8(p, cord.x, cord.y);
+ return __gen_ocl_sub_group_block_read_ui_image8(p, cord.x, cord.y);
}
-void __gen_ocl_sub_group_block_write_image(image2d_t p, int x, int y, uint data);
-void __gen_ocl_sub_group_block_write_image2(image2d_t p, int x, int y, uint2 data);
-void __gen_ocl_sub_group_block_write_image4(image2d_t p, int x, int y, uint4 data);
-void __gen_ocl_sub_group_block_write_image8(image2d_t p, int x, int y, uint8 data);
+OVERLOADABLE uint intel_sub_group_block_read_ui(image2d_t p, int2 cord)
+{
+ return __gen_ocl_sub_group_block_read_ui_image(p, cord.x, cord.y);
+}
+OVERLOADABLE uint2 intel_sub_group_block_read_ui2(image2d_t p, int2 cord)
+{
+ return __gen_ocl_sub_group_block_read_ui_image2(p, cord.x, cord.y);
+}
+OVERLOADABLE uint4 intel_sub_group_block_read_ui4(image2d_t p, int2 cord)
+{
+ return __gen_ocl_sub_group_block_read_ui_image4(p, cord.x, cord.y);
+}
+OVERLOADABLE uint8 intel_sub_group_block_read_ui8(image2d_t p, int2 cord)
+{
+ return __gen_ocl_sub_group_block_read_ui_image8(p, cord.x, cord.y);
+}
+
+void __gen_ocl_sub_group_block_write_ui_image(image2d_t p, int x, int y, uint data);
+void __gen_ocl_sub_group_block_write_ui_image2(image2d_t p, int x, int y, uint2 data);
+void __gen_ocl_sub_group_block_write_ui_image4(image2d_t p, int x, int y, uint4 data);
+void __gen_ocl_sub_group_block_write_ui_image8(image2d_t p, int x, int y, uint8 data);
OVERLOADABLE void intel_sub_group_block_write(image2d_t p, int2 cord, uint data)
{
- __gen_ocl_sub_group_block_write_image(p, cord.x, cord.y, data);
+ __gen_ocl_sub_group_block_write_ui_image(p, cord.x, cord.y, data);
}
OVERLOADABLE void intel_sub_group_block_write2(image2d_t p, int2 cord, uint2 data)
{
- __gen_ocl_sub_group_block_write_image2(p, cord.x, cord.y, data);
+ __gen_ocl_sub_group_block_write_ui_image2(p, cord.x, cord.y, data);
}
OVERLOADABLE void intel_sub_group_block_write4(image2d_t p, int2 cord, uint4 data)
{
- __gen_ocl_sub_group_block_write_image4(p, cord.x, cord.y, data);
+ __gen_ocl_sub_group_block_write_ui_image4(p, cord.x, cord.y, data);
}
OVERLOADABLE void intel_sub_group_block_write8(image2d_t p, int2 cord, uint8 data)
{
- __gen_ocl_sub_group_block_write_image8(p, cord.x, cord.y, data);
+ __gen_ocl_sub_group_block_write_ui_image8(p, cord.x, cord.y, data);
+}
+OVERLOADABLE void intel_sub_group_block_write_ui(image2d_t p, int2 cord, uint data)
+{
+ __gen_ocl_sub_group_block_write_ui_image(p, cord.x, cord.y, data);
+}
+OVERLOADABLE void intel_sub_group_block_write_ui2(image2d_t p, int2 cord, uint2 data)
+{
+ __gen_ocl_sub_group_block_write_ui_image2(p, cord.x, cord.y, data);
+}
+OVERLOADABLE void intel_sub_group_block_write_ui4(image2d_t p, int2 cord, uint4 data)
+{
+ __gen_ocl_sub_group_block_write_ui_image4(p, cord.x, cord.y, data);
+}
+OVERLOADABLE void intel_sub_group_block_write_ui8(image2d_t p, int2 cord, uint8 data)
+{
+ __gen_ocl_sub_group_block_write_ui_image8(p, cord.x, cord.y, data);
}
+PURE CONST ushort __gen_ocl_sub_group_block_read_us_mem(const global ushort* p);
+PURE CONST ushort2 __gen_ocl_sub_group_block_read_us_mem2(const global ushort* p);
+PURE CONST ushort4 __gen_ocl_sub_group_block_read_us_mem4(const global ushort* p);
+PURE CONST ushort8 __gen_ocl_sub_group_block_read_us_mem8(const global ushort* p);
+OVERLOADABLE ushort intel_sub_group_block_read_us(const global ushort* p)
+{
+ return __gen_ocl_sub_group_block_read_us_mem(p);
+}
+OVERLOADABLE ushort2 intel_sub_group_block_read_us2(const global ushort* p)
+{
+ return __gen_ocl_sub_group_block_read_us_mem2(p);
+}
+OVERLOADABLE ushort4 intel_sub_group_block_read_us4(const global ushort* p)
+{
+ return __gen_ocl_sub_group_block_read_us_mem4(p);
+}
+OVERLOADABLE ushort8 intel_sub_group_block_read_us8(const global ushort* p)
+{
+ return __gen_ocl_sub_group_block_read_us_mem8(p);
+}
+
+void __gen_ocl_sub_group_block_write_us_mem(global ushort* p, ushort data);
+void __gen_ocl_sub_group_block_write_us_mem2(global ushort* p, ushort2 data);
+void __gen_ocl_sub_group_block_write_us_mem4(global ushort* p, ushort4 data);
+void __gen_ocl_sub_group_block_write_us_mem8(global ushort* p, ushort8 data);
+OVERLOADABLE void intel_sub_group_block_write_us(global ushort* p, ushort data)
+{
+ __gen_ocl_sub_group_block_write_us_mem(p, data);
+}
+OVERLOADABLE void intel_sub_group_block_write_us2(global ushort* p, ushort2 data)
+{
+ __gen_ocl_sub_group_block_write_us_mem2(p, data);
+}
+OVERLOADABLE void intel_sub_group_block_write_us4(global ushort* p,ushort4 data)
+{
+ __gen_ocl_sub_group_block_write_us_mem4(p, data);
+}
+OVERLOADABLE void intel_sub_group_block_write_us8(global ushort* p,ushort8 data)
+{
+ __gen_ocl_sub_group_block_write_us_mem8(p, data);
+}
+
+PURE CONST ushort __gen_ocl_sub_group_block_read_us_image(image2d_t p, int x, int y);
+PURE CONST ushort2 __gen_ocl_sub_group_block_read_us_image2(image2d_t p, int x, int y);
+PURE CONST ushort4 __gen_ocl_sub_group_block_read_us_image4(image2d_t p, int x, int y);
+PURE CONST ushort8 __gen_ocl_sub_group_block_read_us_image8(image2d_t p, int x, int y);
+OVERLOADABLE ushort intel_sub_group_block_read_us(image2d_t p, int2 cord)
+{
+ return __gen_ocl_sub_group_block_read_us_image(p, cord.x, cord.y);
+}
+OVERLOADABLE ushort2 intel_sub_group_block_read_us2(image2d_t p, int2 cord)
+{
+ return __gen_ocl_sub_group_block_read_us_image2(p, cord.x, cord.y);
+}
+OVERLOADABLE ushort4 intel_sub_group_block_read_us4(image2d_t p, int2 cord)
+{
+ return __gen_ocl_sub_group_block_read_us_image4(p, cord.x, cord.y);
+}
+OVERLOADABLE ushort8 intel_sub_group_block_read_us8(image2d_t p, int2 cord)
+{
+ return __gen_ocl_sub_group_block_read_us_image8(p, cord.x, cord.y);
+}
+
+void __gen_ocl_sub_group_block_write_us_image(image2d_t p, int x, int y, ushort data);
+void __gen_ocl_sub_group_block_write_us_image2(image2d_t p, int x, int y, ushort2 data);
+void __gen_ocl_sub_group_block_write_us_image4(image2d_t p, int x, int y, ushort4 data);
+void __gen_ocl_sub_group_block_write_us_image8(image2d_t p, int x, int y, ushort8 data);
+OVERLOADABLE void intel_sub_group_block_write_us(image2d_t p, int2 cord, ushort data)
+{
+ __gen_ocl_sub_group_block_write_us_image(p, cord.x, cord.y, data);
+}
+OVERLOADABLE void intel_sub_group_block_write_us2(image2d_t p, int2 cord, ushort2 data)
+{
+ __gen_ocl_sub_group_block_write_us_image2(p, cord.x, cord.y, data);
+}
+OVERLOADABLE void intel_sub_group_block_write_us4(image2d_t p, int2 cord, ushort4 data)
+{
+ __gen_ocl_sub_group_block_write_us_image4(p, cord.x, cord.y, data);
+}
+OVERLOADABLE void intel_sub_group_block_write_us8(image2d_t p, int2 cord, ushort8 data)
+{
+ __gen_ocl_sub_group_block_write_us_image8(p, cord.x, cord.y, data);
+}
#define SHUFFLE_DOWN(TYPE) \
OVERLOADABLE TYPE intel_sub_group_shuffle_down(TYPE x, TYPE y, uint c) { \
TYPE res0, res1; \
diff --git a/backend/src/libocl/tmpl/ocl_simd.tmpl.h b/backend/src/libocl/tmpl/ocl_simd.tmpl.h
index 158c8e1..608551b 100644
--- a/backend/src/libocl/tmpl/ocl_simd.tmpl.h
+++ b/backend/src/libocl/tmpl/ocl_simd.tmpl.h
@@ -196,10 +196,10 @@ OVERLOADABLE uint2 intel_sub_group_block_read2(const global uint* p);
OVERLOADABLE uint4 intel_sub_group_block_read4(const global uint* p);
OVERLOADABLE uint8 intel_sub_group_block_read8(const global uint* p);
-OVERLOADABLE void intel_sub_group_block_write(const __global uint* p, uint data);
-OVERLOADABLE void intel_sub_group_block_write2(const __global uint* p, uint2 data);
-OVERLOADABLE void intel_sub_group_block_write4(const __global uint* p, uint4 data);
-OVERLOADABLE void intel_sub_group_block_write8(const __global uint* p, uint8 data);
+OVERLOADABLE void intel_sub_group_block_write(__global uint* p, uint data);
+OVERLOADABLE void intel_sub_group_block_write2(__global uint* p, uint2 data);
+OVERLOADABLE void intel_sub_group_block_write4(__global uint* p, uint4 data);
+OVERLOADABLE void intel_sub_group_block_write8(__global uint* p, uint8 data);
OVERLOADABLE uint intel_sub_group_block_read(image2d_t image, int2 byte_coord);
OVERLOADABLE uint2 intel_sub_group_block_read2(image2d_t image, int2 byte_coord);
@@ -210,3 +210,43 @@ OVERLOADABLE void intel_sub_group_block_write(image2d_t image, int2 byte_coord,
OVERLOADABLE void intel_sub_group_block_write2(image2d_t image, int2 byte_coord, uint2 data);
OVERLOADABLE void intel_sub_group_block_write4(image2d_t image, int2 byte_coord, uint4 data);
OVERLOADABLE void intel_sub_group_block_write8(image2d_t image, int2 byte_coord, uint8 data);
+
+OVERLOADABLE uint intel_sub_group_block_read_ui(const global uint* p);
+OVERLOADABLE uint2 intel_sub_group_block_read_ui2(const global uint* p);
+OVERLOADABLE uint4 intel_sub_group_block_read_ui4(const global uint* p);
+OVERLOADABLE uint8 intel_sub_group_block_read_ui8(const global uint* p);
+
+OVERLOADABLE void intel_sub_group_block_write_ui(__global uint* p, uint data);
+OVERLOADABLE void intel_sub_group_block_write_ui2(__global uint* p, uint2 data);
+OVERLOADABLE void intel_sub_group_block_write_ui4(__global uint* p, uint4 data);
+OVERLOADABLE void intel_sub_group_block_write_ui8(__global uint* p, uint8 data);
+
+OVERLOADABLE uint intel_sub_group_block_read_ui(image2d_t image, int2 byte_coord);
+OVERLOADABLE uint2 intel_sub_group_block_read_ui2(image2d_t image, int2 byte_coord);
+OVERLOADABLE uint4 intel_sub_group_block_read_ui4(image2d_t image, int2 byte_coord);
+OVERLOADABLE uint8 intel_sub_group_block_read_ui8(image2d_t image, int2 byte_coord);
+
+OVERLOADABLE void intel_sub_group_block_write_ui(image2d_t image, int2 byte_coord, uint data);
+OVERLOADABLE void intel_sub_group_block_write_ui2(image2d_t image, int2 byte_coord, uint2 data);
+OVERLOADABLE void intel_sub_group_block_write_ui4(image2d_t image, int2 byte_coord, uint4 data);
+OVERLOADABLE void intel_sub_group_block_write_ui8(image2d_t image, int2 byte_coord, uint8 data);
+
+OVERLOADABLE ushort intel_sub_group_block_read_us(const global ushort* p);
+OVERLOADABLE ushort2 intel_sub_group_block_read_us2(const global ushort* p);
+OVERLOADABLE ushort4 intel_sub_group_block_read_us4(const global ushort* p);
+OVERLOADABLE ushort8 intel_sub_group_block_read_us8(const global ushort* p);
+
+OVERLOADABLE void intel_sub_group_block_write_us(__global ushort* p, ushort data);
+OVERLOADABLE void intel_sub_group_block_write_us2(__global ushort* p, ushort2 data);
+OVERLOADABLE void intel_sub_group_block_write_us4(__global ushort* p, ushort4 data);
+OVERLOADABLE void intel_sub_group_block_write_us8(__global ushort* p, ushort8 data);
+
+OVERLOADABLE ushort intel_sub_group_block_read_us(image2d_t image, int2 byte_coord);
+OVERLOADABLE ushort2 intel_sub_group_block_read_us2(image2d_t image, int2 byte_coord);
+OVERLOADABLE ushort4 intel_sub_group_block_read_us4(image2d_t image, int2 byte_coord);
+OVERLOADABLE ushort8 intel_sub_group_block_read_us8(image2d_t image, int2 byte_coord);
+
+OVERLOADABLE void intel_sub_group_block_write_us(image2d_t image, int2 byte_coord, ushort data);
+OVERLOADABLE void intel_sub_group_block_write_us2(image2d_t image, int2 byte_coord, ushort2 data);
+OVERLOADABLE void intel_sub_group_block_write_us4(image2d_t image, int2 byte_coord, ushort4 data);
+OVERLOADABLE void intel_sub_group_block_write_us8(image2d_t image, int2 byte_coord, ushort8 data);
diff --git a/backend/src/llvm/llvm_gen_backend.cpp b/backend/src/llvm/llvm_gen_backend.cpp
index 43c7c4c..a6a249d 100644
--- a/backend/src/llvm/llvm_gen_backend.cpp
+++ b/backend/src/llvm/llvm_gen_backend.cpp
@@ -700,8 +700,8 @@ namespace gbe
// Emit subgroup instructions
void emitSubGroupInst(CallInst &I, CallSite &CS, ir::WorkGroupOps opcode);
// Emit subgroup instructions
- void emitBlockReadWriteMemInst(CallInst &I, CallSite &CS, bool isWrite, uint8_t vec_size);
- void emitBlockReadWriteImageInst(CallInst &I, CallSite &CS, bool isWrite, uint8_t vec_size);
+ void emitBlockReadWriteMemInst(CallInst &I, CallSite &CS, bool isWrite, uint8_t vec_size, ir::Type = ir::TYPE_U32);
+ void emitBlockReadWriteImageInst(CallInst &I, CallSite &CS, bool isWrite, uint8_t vec_size, ir::Type = ir::TYPE_U32);
uint8_t appendSampler(CallSite::arg_iterator AI);
uint8_t getImageID(CallInst &I);
@@ -3853,14 +3853,22 @@ namespace gbe
case GEN_OCL_SUB_GROUP_SCAN_INCLUSIVE_MAX:
case GEN_OCL_SUB_GROUP_SCAN_INCLUSIVE_MIN:
case GEN_OCL_LRP:
- case GEN_OCL_SUB_GROUP_BLOCK_READ_MEM:
- case GEN_OCL_SUB_GROUP_BLOCK_READ_MEM2:
- case GEN_OCL_SUB_GROUP_BLOCK_READ_MEM4:
- case GEN_OCL_SUB_GROUP_BLOCK_READ_MEM8:
- case GEN_OCL_SUB_GROUP_BLOCK_READ_IMAGE:
- case GEN_OCL_SUB_GROUP_BLOCK_READ_IMAGE2:
- case GEN_OCL_SUB_GROUP_BLOCK_READ_IMAGE4:
- case GEN_OCL_SUB_GROUP_BLOCK_READ_IMAGE8:
+ case GEN_OCL_SUB_GROUP_BLOCK_READ_UI_MEM:
+ case GEN_OCL_SUB_GROUP_BLOCK_READ_UI_MEM2:
+ case GEN_OCL_SUB_GROUP_BLOCK_READ_UI_MEM4:
+ case GEN_OCL_SUB_GROUP_BLOCK_READ_UI_MEM8:
+ case GEN_OCL_SUB_GROUP_BLOCK_READ_UI_IMAGE:
+ case GEN_OCL_SUB_GROUP_BLOCK_READ_UI_IMAGE2:
+ case GEN_OCL_SUB_GROUP_BLOCK_READ_UI_IMAGE4:
+ case GEN_OCL_SUB_GROUP_BLOCK_READ_UI_IMAGE8:
+ case GEN_OCL_SUB_GROUP_BLOCK_READ_US_MEM:
+ case GEN_OCL_SUB_GROUP_BLOCK_READ_US_MEM2:
+ case GEN_OCL_SUB_GROUP_BLOCK_READ_US_MEM4:
+ case GEN_OCL_SUB_GROUP_BLOCK_READ_US_MEM8:
+ case GEN_OCL_SUB_GROUP_BLOCK_READ_US_IMAGE:
+ case GEN_OCL_SUB_GROUP_BLOCK_READ_US_IMAGE2:
+ case GEN_OCL_SUB_GROUP_BLOCK_READ_US_IMAGE4:
+ case GEN_OCL_SUB_GROUP_BLOCK_READ_US_IMAGE8:
this->newRegister(&I);
break;
case GEN_OCL_PRINTF:
@@ -3877,14 +3885,22 @@ namespace gbe
case GEN_OCL_CALC_TIMESTAMP:
case GEN_OCL_STORE_PROFILING:
case GEN_OCL_DEBUGWAIT:
- case GEN_OCL_SUB_GROUP_BLOCK_WRITE_MEM:
- case GEN_OCL_SUB_GROUP_BLOCK_WRITE_MEM2:
- case GEN_OCL_SUB_GROUP_BLOCK_WRITE_MEM4:
- case GEN_OCL_SUB_GROUP_BLOCK_WRITE_MEM8:
- case GEN_OCL_SUB_GROUP_BLOCK_WRITE_IMAGE:
- case GEN_OCL_SUB_GROUP_BLOCK_WRITE_IMAGE2:
- case GEN_OCL_SUB_GROUP_BLOCK_WRITE_IMAGE4:
- case GEN_OCL_SUB_GROUP_BLOCK_WRITE_IMAGE8:
+ case GEN_OCL_SUB_GROUP_BLOCK_WRITE_UI_MEM:
+ case GEN_OCL_SUB_GROUP_BLOCK_WRITE_UI_MEM2:
+ case GEN_OCL_SUB_GROUP_BLOCK_WRITE_UI_MEM4:
+ case GEN_OCL_SUB_GROUP_BLOCK_WRITE_UI_MEM8:
+ case GEN_OCL_SUB_GROUP_BLOCK_WRITE_UI_IMAGE:
+ case GEN_OCL_SUB_GROUP_BLOCK_WRITE_UI_IMAGE2:
+ case GEN_OCL_SUB_GROUP_BLOCK_WRITE_UI_IMAGE4:
+ case GEN_OCL_SUB_GROUP_BLOCK_WRITE_UI_IMAGE8:
+ case GEN_OCL_SUB_GROUP_BLOCK_WRITE_US_MEM:
+ case GEN_OCL_SUB_GROUP_BLOCK_WRITE_US_MEM2:
+ case GEN_OCL_SUB_GROUP_BLOCK_WRITE_US_MEM4:
+ case GEN_OCL_SUB_GROUP_BLOCK_WRITE_US_MEM8:
+ case GEN_OCL_SUB_GROUP_BLOCK_WRITE_US_IMAGE:
+ case GEN_OCL_SUB_GROUP_BLOCK_WRITE_US_IMAGE2:
+ case GEN_OCL_SUB_GROUP_BLOCK_WRITE_US_IMAGE4:
+ case GEN_OCL_SUB_GROUP_BLOCK_WRITE_US_IMAGE8:
break;
case GEN_OCL_NOT_FOUND:
default:
@@ -4077,7 +4093,7 @@ namespace gbe
GBE_ASSERT(AI == AE);
}
- void GenWriter::emitBlockReadWriteMemInst(CallInst &I, CallSite &CS, bool isWrite, uint8_t vec_size) {
+ void GenWriter::emitBlockReadWriteMemInst(CallInst &I, CallSite &CS, bool isWrite, uint8_t vec_size, ir::Type type) {
CallSite::arg_iterator AI = CS.arg_begin();
CallSite::arg_iterator AE = CS.arg_end();
GBE_ASSERT(AI != AE);
@@ -4113,7 +4129,6 @@ namespace gbe
ptr = pointer;
}
- ir::Type type = ir::TYPE_U32;
GBE_ASSERT(AM != ir::AM_DynamicBti);
if(isWrite){
@@ -4134,7 +4149,7 @@ namespace gbe
GBE_ASSERT(AI == AE);
}
- void GenWriter::emitBlockReadWriteImageInst(CallInst &I, CallSite &CS, bool isWrite, uint8_t vec_size) {
+ void GenWriter::emitBlockReadWriteImageInst(CallInst &I, CallSite &CS, bool isWrite, uint8_t vec_size, ir::Type type) {
CallSite::arg_iterator AI = CS.arg_begin();
CallSite::arg_iterator AE = CS.arg_end();
GBE_ASSERT(AI != AE);
@@ -4150,7 +4165,7 @@ namespace gbe
srcTupleData.push_back(getRegister(*(AI), i));
AI++;
const ir::Tuple srctuple = ctx.arrayTuple(&srcTupleData[0], 2 + vec_size);
- ctx.MBWRITE(imageID, srctuple, 2 + vec_size, vec_size);
+ ctx.MBWRITE(imageID, srctuple, 2 + vec_size, vec_size, type);
} else {
ir::Register src[2];
src[0] = getRegister(*(AI++));
@@ -4160,7 +4175,7 @@ namespace gbe
dstTupleData.push_back(getRegister(&I, i));
const ir::Tuple srctuple = ctx.arrayTuple(src, 2);
const ir::Tuple dsttuple = ctx.arrayTuple(&dstTupleData[0], vec_size);
- ctx.MBREAD(imageID, dsttuple, vec_size, srctuple, 2);
+ ctx.MBREAD(imageID, dsttuple, vec_size, srctuple, 2, type);
}
GBE_ASSERT(AI == AE);
@@ -4993,38 +5008,70 @@ namespace gbe
ctx.LRP(ir::TYPE_FLOAT, dst, src0, src1, src2);
break;
}
- case GEN_OCL_SUB_GROUP_BLOCK_READ_MEM:
+ case GEN_OCL_SUB_GROUP_BLOCK_READ_UI_MEM:
this->emitBlockReadWriteMemInst(I, CS, false, 1); break;
- case GEN_OCL_SUB_GROUP_BLOCK_READ_MEM2:
+ case GEN_OCL_SUB_GROUP_BLOCK_READ_UI_MEM2:
this->emitBlockReadWriteMemInst(I, CS, false, 2); break;
- case GEN_OCL_SUB_GROUP_BLOCK_READ_MEM4:
+ case GEN_OCL_SUB_GROUP_BLOCK_READ_UI_MEM4:
this->emitBlockReadWriteMemInst(I, CS, false, 4); break;
- case GEN_OCL_SUB_GROUP_BLOCK_READ_MEM8:
+ case GEN_OCL_SUB_GROUP_BLOCK_READ_UI_MEM8:
this->emitBlockReadWriteMemInst(I, CS, false, 8); break;
- case GEN_OCL_SUB_GROUP_BLOCK_WRITE_MEM:
+ case GEN_OCL_SUB_GROUP_BLOCK_WRITE_UI_MEM:
this->emitBlockReadWriteMemInst(I, CS, true, 1); break;
- case GEN_OCL_SUB_GROUP_BLOCK_WRITE_MEM2:
+ case GEN_OCL_SUB_GROUP_BLOCK_WRITE_UI_MEM2:
this->emitBlockReadWriteMemInst(I, CS, true, 2); break;
- case GEN_OCL_SUB_GROUP_BLOCK_WRITE_MEM4:
+ case GEN_OCL_SUB_GROUP_BLOCK_WRITE_UI_MEM4:
this->emitBlockReadWriteMemInst(I, CS, true, 4); break;
- case GEN_OCL_SUB_GROUP_BLOCK_WRITE_MEM8:
+ case GEN_OCL_SUB_GROUP_BLOCK_WRITE_UI_MEM8:
this->emitBlockReadWriteMemInst(I, CS, true, 8); break;
- case GEN_OCL_SUB_GROUP_BLOCK_READ_IMAGE:
+ case GEN_OCL_SUB_GROUP_BLOCK_READ_UI_IMAGE:
this->emitBlockReadWriteImageInst(I, CS, false, 1); break;
- case GEN_OCL_SUB_GROUP_BLOCK_READ_IMAGE2:
+ case GEN_OCL_SUB_GROUP_BLOCK_READ_UI_IMAGE2:
this->emitBlockReadWriteImageInst(I, CS, false, 2); break;
- case GEN_OCL_SUB_GROUP_BLOCK_READ_IMAGE4:
+ case GEN_OCL_SUB_GROUP_BLOCK_READ_UI_IMAGE4:
this->emitBlockReadWriteImageInst(I, CS, false, 4); break;
- case GEN_OCL_SUB_GROUP_BLOCK_READ_IMAGE8:
+ case GEN_OCL_SUB_GROUP_BLOCK_READ_UI_IMAGE8:
this->emitBlockReadWriteImageInst(I, CS, false, 8); break;
- case GEN_OCL_SUB_GROUP_BLOCK_WRITE_IMAGE:
+ case GEN_OCL_SUB_GROUP_BLOCK_WRITE_UI_IMAGE:
this->emitBlockReadWriteImageInst(I, CS, true, 1); break;
- case GEN_OCL_SUB_GROUP_BLOCK_WRITE_IMAGE2:
+ case GEN_OCL_SUB_GROUP_BLOCK_WRITE_UI_IMAGE2:
this->emitBlockReadWriteImageInst(I, CS, true, 2); break;
- case GEN_OCL_SUB_GROUP_BLOCK_WRITE_IMAGE4:
+ case GEN_OCL_SUB_GROUP_BLOCK_WRITE_UI_IMAGE4:
this->emitBlockReadWriteImageInst(I, CS, true, 4); break;
- case GEN_OCL_SUB_GROUP_BLOCK_WRITE_IMAGE8:
+ case GEN_OCL_SUB_GROUP_BLOCK_WRITE_UI_IMAGE8:
this->emitBlockReadWriteImageInst(I, CS, true, 8); break;
+ case GEN_OCL_SUB_GROUP_BLOCK_READ_US_MEM:
+ this->emitBlockReadWriteMemInst(I, CS, false, 1, ir::TYPE_U16); break;
+ case GEN_OCL_SUB_GROUP_BLOCK_READ_US_MEM2:
+ this->emitBlockReadWriteMemInst(I, CS, false, 2, ir::TYPE_U16); break;
+ case GEN_OCL_SUB_GROUP_BLOCK_READ_US_MEM4:
+ this->emitBlockReadWriteMemInst(I, CS, false, 4, ir::TYPE_U16); break;
+ case GEN_OCL_SUB_GROUP_BLOCK_READ_US_MEM8:
+ this->emitBlockReadWriteMemInst(I, CS, false, 8, ir::TYPE_U16); break;
+ case GEN_OCL_SUB_GROUP_BLOCK_WRITE_US_MEM:
+ this->emitBlockReadWriteMemInst(I, CS, true, 1, ir::TYPE_U16); break;
+ case GEN_OCL_SUB_GROUP_BLOCK_WRITE_US_MEM2:
+ this->emitBlockReadWriteMemInst(I, CS, true, 2, ir::TYPE_U16); break;
+ case GEN_OCL_SUB_GROUP_BLOCK_WRITE_US_MEM4:
+ this->emitBlockReadWriteMemInst(I, CS, true, 4, ir::TYPE_U16); break;
+ case GEN_OCL_SUB_GROUP_BLOCK_WRITE_US_MEM8:
+ this->emitBlockReadWriteMemInst(I, CS, true, 8, ir::TYPE_U16); break;
+ case GEN_OCL_SUB_GROUP_BLOCK_READ_US_IMAGE:
+ this->emitBlockReadWriteImageInst(I, CS, false, 1, ir::TYPE_U16); break;
+ case GEN_OCL_SUB_GROUP_BLOCK_READ_US_IMAGE2:
+ this->emitBlockReadWriteImageInst(I, CS, false, 2, ir::TYPE_U16); break;
+ case GEN_OCL_SUB_GROUP_BLOCK_READ_US_IMAGE4:
+ this->emitBlockReadWriteImageInst(I, CS, false, 4, ir::TYPE_U16); break;
+ case GEN_OCL_SUB_GROUP_BLOCK_READ_US_IMAGE8:
+ this->emitBlockReadWriteImageInst(I, CS, false, 8, ir::TYPE_U16); break;
+ case GEN_OCL_SUB_GROUP_BLOCK_WRITE_US_IMAGE:
+ this->emitBlockReadWriteImageInst(I, CS, true, 1, ir::TYPE_U16); break;
+ case GEN_OCL_SUB_GROUP_BLOCK_WRITE_US_IMAGE2:
+ this->emitBlockReadWriteImageInst(I, CS, true, 2, ir::TYPE_U16); break;
+ case GEN_OCL_SUB_GROUP_BLOCK_WRITE_US_IMAGE4:
+ this->emitBlockReadWriteImageInst(I, CS, true, 4, ir::TYPE_U16); break;
+ case GEN_OCL_SUB_GROUP_BLOCK_WRITE_US_IMAGE8:
+ this->emitBlockReadWriteImageInst(I, CS, true, 8, ir::TYPE_U16); break;
default: break;
}
}
diff --git a/backend/src/llvm/llvm_gen_ocl_function.hxx b/backend/src/llvm/llvm_gen_ocl_function.hxx
index 48a72d1..8ab4373 100644
--- a/backend/src/llvm/llvm_gen_ocl_function.hxx
+++ b/backend/src/llvm/llvm_gen_ocl_function.hxx
@@ -217,22 +217,38 @@ DECL_LLVM_GEN_FUNCTION(SUB_GROUP_SCAN_INCLUSIVE_ADD, __gen_ocl_sub_group_scan_in
DECL_LLVM_GEN_FUNCTION(SUB_GROUP_SCAN_INCLUSIVE_MAX, __gen_ocl_sub_group_scan_inclusive_max)
DECL_LLVM_GEN_FUNCTION(SUB_GROUP_SCAN_INCLUSIVE_MIN, __gen_ocl_sub_group_scan_inclusive_min)
-DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_MEM, __gen_ocl_sub_group_block_read_mem)
-DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_MEM2, __gen_ocl_sub_group_block_read_mem2)
-DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_MEM4, __gen_ocl_sub_group_block_read_mem4)
-DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_MEM8, __gen_ocl_sub_group_block_read_mem8)
-DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_MEM, __gen_ocl_sub_group_block_write_mem)
-DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_MEM2, __gen_ocl_sub_group_block_write_mem2)
-DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_MEM4, __gen_ocl_sub_group_block_write_mem4)
-DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_MEM8, __gen_ocl_sub_group_block_write_mem8)
-DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_IMAGE, __gen_ocl_sub_group_block_read_image)
-DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_IMAGE2, __gen_ocl_sub_group_block_read_image2)
-DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_IMAGE4, __gen_ocl_sub_group_block_read_image4)
-DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_IMAGE8, __gen_ocl_sub_group_block_read_image8)
-DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_IMAGE, __gen_ocl_sub_group_block_write_image)
-DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_IMAGE2, __gen_ocl_sub_group_block_write_image2)
-DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_IMAGE4, __gen_ocl_sub_group_block_write_image4)
-DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_IMAGE8, __gen_ocl_sub_group_block_write_image8)
-
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_UI_MEM, __gen_ocl_sub_group_block_read_ui_mem)
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_UI_MEM2, __gen_ocl_sub_group_block_read_ui_mem2)
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_UI_MEM4, __gen_ocl_sub_group_block_read_ui_mem4)
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_UI_MEM8, __gen_ocl_sub_group_block_read_ui_mem8)
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_UI_MEM, __gen_ocl_sub_group_block_write_ui_mem)
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_UI_MEM2, __gen_ocl_sub_group_block_write_ui_mem2)
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_UI_MEM4, __gen_ocl_sub_group_block_write_ui_mem4)
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_UI_MEM8, __gen_ocl_sub_group_block_write_ui_mem8)
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_UI_IMAGE, __gen_ocl_sub_group_block_read_ui_image)
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_UI_IMAGE2, __gen_ocl_sub_group_block_read_ui_image2)
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_UI_IMAGE4, __gen_ocl_sub_group_block_read_ui_image4)
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_UI_IMAGE8, __gen_ocl_sub_group_block_read_ui_image8)
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_UI_IMAGE, __gen_ocl_sub_group_block_write_ui_image)
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_UI_IMAGE2, __gen_ocl_sub_group_block_write_ui_image2)
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_UI_IMAGE4, __gen_ocl_sub_group_block_write_ui_image4)
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_UI_IMAGE8, __gen_ocl_sub_group_block_write_ui_image8)
+
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_US_MEM, __gen_ocl_sub_group_block_read_us_mem)
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_US_MEM2, __gen_ocl_sub_group_block_read_us_mem2)
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_US_MEM4, __gen_ocl_sub_group_block_read_us_mem4)
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_US_MEM8, __gen_ocl_sub_group_block_read_us_mem8)
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_US_MEM, __gen_ocl_sub_group_block_write_us_mem)
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_US_MEM2, __gen_ocl_sub_group_block_write_us_mem2)
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_US_MEM4, __gen_ocl_sub_group_block_write_us_mem4)
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_US_MEM8, __gen_ocl_sub_group_block_write_us_mem8)
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_US_IMAGE, __gen_ocl_sub_group_block_read_us_image)
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_US_IMAGE2, __gen_ocl_sub_group_block_read_us_image2)
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_US_IMAGE4, __gen_ocl_sub_group_block_read_us_image4)
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_US_IMAGE8, __gen_ocl_sub_group_block_read_us_image8)
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_US_IMAGE, __gen_ocl_sub_group_block_write_us_image)
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_US_IMAGE2, __gen_ocl_sub_group_block_write_us_image2)
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_US_IMAGE4, __gen_ocl_sub_group_block_write_us_image4)
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_US_IMAGE8, __gen_ocl_sub_group_block_write_us_image8)
// common function
DECL_LLVM_GEN_FUNCTION(LRP, __gen_ocl_lrp)
diff --git a/backend/src/llvm/llvm_scalarize.cpp b/backend/src/llvm/llvm_scalarize.cpp
index 615fb50..8850abb 100644
--- a/backend/src/llvm/llvm_scalarize.cpp
+++ b/backend/src/llvm/llvm_scalarize.cpp
@@ -682,10 +682,14 @@ namespace gbe {
*CI = InsertToVector(call, *CI);
break;
}
- case GEN_OCL_SUB_GROUP_BLOCK_WRITE_IMAGE:
- case GEN_OCL_SUB_GROUP_BLOCK_WRITE_IMAGE2:
- case GEN_OCL_SUB_GROUP_BLOCK_WRITE_IMAGE4:
- case GEN_OCL_SUB_GROUP_BLOCK_WRITE_IMAGE8:
+ case GEN_OCL_SUB_GROUP_BLOCK_WRITE_UI_IMAGE:
+ case GEN_OCL_SUB_GROUP_BLOCK_WRITE_UI_IMAGE2:
+ case GEN_OCL_SUB_GROUP_BLOCK_WRITE_UI_IMAGE4:
+ case GEN_OCL_SUB_GROUP_BLOCK_WRITE_UI_IMAGE8:
+ case GEN_OCL_SUB_GROUP_BLOCK_WRITE_US_IMAGE:
+ case GEN_OCL_SUB_GROUP_BLOCK_WRITE_US_IMAGE2:
+ case GEN_OCL_SUB_GROUP_BLOCK_WRITE_US_IMAGE4:
+ case GEN_OCL_SUB_GROUP_BLOCK_WRITE_US_IMAGE8:
{
++CI;
++CI;
@@ -693,22 +697,32 @@ namespace gbe {
*CI = InsertToVector(call, *CI);
break;
}
- case GEN_OCL_SUB_GROUP_BLOCK_WRITE_MEM:
- case GEN_OCL_SUB_GROUP_BLOCK_WRITE_MEM2:
- case GEN_OCL_SUB_GROUP_BLOCK_WRITE_MEM4:
- case GEN_OCL_SUB_GROUP_BLOCK_WRITE_MEM8:
+ case GEN_OCL_SUB_GROUP_BLOCK_WRITE_UI_MEM:
+ case GEN_OCL_SUB_GROUP_BLOCK_WRITE_UI_MEM2:
+ case GEN_OCL_SUB_GROUP_BLOCK_WRITE_UI_MEM4:
+ case GEN_OCL_SUB_GROUP_BLOCK_WRITE_UI_MEM8:
+ case GEN_OCL_SUB_GROUP_BLOCK_WRITE_US_MEM:
+ case GEN_OCL_SUB_GROUP_BLOCK_WRITE_US_MEM2:
+ case GEN_OCL_SUB_GROUP_BLOCK_WRITE_US_MEM4:
+ case GEN_OCL_SUB_GROUP_BLOCK_WRITE_US_MEM8:
{
if ((*CI)->getType()->isVectorTy())
*CI = InsertToVector(call, *CI);
break;
}
case GEN_OCL_VME:
- case GEN_OCL_SUB_GROUP_BLOCK_READ_MEM2:
- case GEN_OCL_SUB_GROUP_BLOCK_READ_MEM4:
- case GEN_OCL_SUB_GROUP_BLOCK_READ_MEM8:
- case GEN_OCL_SUB_GROUP_BLOCK_READ_IMAGE2:
- case GEN_OCL_SUB_GROUP_BLOCK_READ_IMAGE4:
- case GEN_OCL_SUB_GROUP_BLOCK_READ_IMAGE8:
+ case GEN_OCL_SUB_GROUP_BLOCK_READ_UI_MEM2:
+ case GEN_OCL_SUB_GROUP_BLOCK_READ_UI_MEM4:
+ case GEN_OCL_SUB_GROUP_BLOCK_READ_UI_MEM8:
+ case GEN_OCL_SUB_GROUP_BLOCK_READ_UI_IMAGE2:
+ case GEN_OCL_SUB_GROUP_BLOCK_READ_UI_IMAGE4:
+ case GEN_OCL_SUB_GROUP_BLOCK_READ_UI_IMAGE8:
+ case GEN_OCL_SUB_GROUP_BLOCK_READ_US_MEM2:
+ case GEN_OCL_SUB_GROUP_BLOCK_READ_US_MEM4:
+ case GEN_OCL_SUB_GROUP_BLOCK_READ_US_MEM8:
+ case GEN_OCL_SUB_GROUP_BLOCK_READ_US_IMAGE2:
+ case GEN_OCL_SUB_GROUP_BLOCK_READ_US_IMAGE4:
+ case GEN_OCL_SUB_GROUP_BLOCK_READ_US_IMAGE8:
setAppendPoint(call);
extractFromVector(call);
break;
--
2.7.4
More information about the Beignet
mailing list