[Beignet] [PATCH 2/3] GBE: Fix alignment according to OCL spec
Ruiling Song
ruiling.song at intel.com
Thu Nov 7 22:20:15 PST 2013
The patch simply store a 'align' for each kernel argument.
Then the runtime could align the kernel argument address to 'align'.
This patch works for constant and local address space.
Signed-off-by: Ruiling Song <ruiling.song at intel.com>
---
backend/src/backend/context.cpp | 2 ++
backend/src/backend/program.cpp | 10 ++++++++
backend/src/backend/program.h | 4 +++
backend/src/backend/program.hpp | 4 +++
backend/src/ir/context.cpp | 4 +--
backend/src/ir/context.hpp | 2 +-
backend/src/ir/function.hpp | 5 ++--
backend/src/llvm/llvm_gen_backend.cpp | 20 ++++++++-------
src/cl_command_queue_gen7.c | 43 +++++++++++++++++++++------------
9 files changed, 65 insertions(+), 29 deletions(-)
diff --git a/backend/src/backend/context.cpp b/backend/src/backend/context.cpp
index cbeef24..2609e7a 100644
--- a/backend/src/backend/context.cpp
+++ b/backend/src/backend/context.cpp
@@ -534,6 +534,8 @@ namespace gbe
kernel->args = NULL;
for (uint32_t argID = 0; argID < kernel->argNum; ++argID) {
const auto &arg = fn.getArg(argID);
+
+ kernel->args[argID].align = arg.align;
switch (arg.type) {
case ir::FunctionArgument::VALUE:
case ir::FunctionArgument::STRUCTURE:
diff --git a/backend/src/backend/program.cpp b/backend/src/backend/program.cpp
index 25be484..e692186 100644
--- a/backend/src/backend/program.cpp
+++ b/backend/src/backend/program.cpp
@@ -229,6 +229,7 @@ namespace gbe {
KernelArgument& arg = args[i];
OUT_UPDATE_SZ(arg.type);
OUT_UPDATE_SZ(arg.size);
+ OUT_UPDATE_SZ(arg.align);
OUT_UPDATE_SZ(arg.bufSize);
}
@@ -315,6 +316,7 @@ namespace gbe {
KernelArgument& arg = args[i];
IN_UPDATE_SZ(arg.type);
IN_UPDATE_SZ(arg.size);
+ IN_UPDATE_SZ(arg.align);
IN_UPDATE_SZ(arg.bufSize);
}
@@ -422,6 +424,7 @@ namespace gbe {
outs << spaces_nl << " Arg " << i << ":\n";
outs << spaces_nl << " type value: "<< arg.type << "\n";
outs << spaces_nl << " size: "<< arg.size << "\n";
+ outs << spaces_nl << " align: "<< arg.align << "\n";
outs << spaces_nl << " bufSize: "<< arg.bufSize << "\n";
}
@@ -696,6 +699,11 @@ namespace gbe {
return kernel->getArgSize(argID);
}
+ static uint32_t kernelGetArgAlign(gbe_kernel genKernel, uint32_t argID) {
+ if (genKernel == NULL) return 0u;
+ const gbe::Kernel *kernel = (const gbe::Kernel*) genKernel;
+ return kernel->getArgAlign(argID);
+ }
static gbe_arg_type kernelGetArgType(gbe_kernel genKernel, uint32_t argID) {
if (genKernel == NULL) return GBE_ARG_INVALID;
const gbe::Kernel *kernel = (const gbe::Kernel*) genKernel;
@@ -803,6 +811,7 @@ GBE_EXPORT_SYMBOL gbe_kernel_get_code_size_cb *gbe_kernel_get_code_size = NULL;
GBE_EXPORT_SYMBOL gbe_kernel_get_arg_num_cb *gbe_kernel_get_arg_num = NULL;
GBE_EXPORT_SYMBOL gbe_kernel_get_arg_size_cb *gbe_kernel_get_arg_size = NULL;
GBE_EXPORT_SYMBOL gbe_kernel_get_arg_type_cb *gbe_kernel_get_arg_type = NULL;
+GBE_EXPORT_SYMBOL gbe_kernel_get_arg_align_cb *gbe_kernel_get_arg_align = NULL;
GBE_EXPORT_SYMBOL gbe_kernel_get_simd_width_cb *gbe_kernel_get_simd_width = NULL;
GBE_EXPORT_SYMBOL gbe_kernel_get_curbe_offset_cb *gbe_kernel_get_curbe_offset = NULL;
GBE_EXPORT_SYMBOL gbe_kernel_get_curbe_size_cb *gbe_kernel_get_curbe_size = NULL;
@@ -838,6 +847,7 @@ namespace gbe
gbe_kernel_get_arg_num = gbe::kernelGetArgNum;
gbe_kernel_get_arg_size = gbe::kernelGetArgSize;
gbe_kernel_get_arg_type = gbe::kernelGetArgType;
+ gbe_kernel_get_arg_align = gbe::kernelGetArgAlign;
gbe_kernel_get_simd_width = gbe::kernelGetSIMDWidth;
gbe_kernel_get_curbe_offset = gbe::kernelGetCurbeOffset;
gbe_kernel_get_curbe_size = gbe::kernelGetCurbeSize;
diff --git a/backend/src/backend/program.h b/backend/src/backend/program.h
index 10fcc49..b0149da 100644
--- a/backend/src/backend/program.h
+++ b/backend/src/backend/program.h
@@ -185,6 +185,10 @@ extern gbe_kernel_get_arg_size_cb *gbe_kernel_get_arg_size;
typedef enum gbe_arg_type (gbe_kernel_get_arg_type_cb)(gbe_kernel, uint32_t argID);
extern gbe_kernel_get_arg_type_cb *gbe_kernel_get_arg_type;
+/*! Get the align of the given argument */
+typedef uint32_t (gbe_kernel_get_arg_align_cb)(gbe_kernel, uint32_t argID);
+extern gbe_kernel_get_arg_align_cb *gbe_kernel_get_arg_align;
+
/*! Get the simd width for the kernel */
typedef uint32_t (gbe_kernel_get_simd_width_cb)(gbe_kernel);
extern gbe_kernel_get_simd_width_cb *gbe_kernel_get_simd_width;
diff --git a/backend/src/backend/program.hpp b/backend/src/backend/program.hpp
index 9b33b7c..dd76210 100644
--- a/backend/src/backend/program.hpp
+++ b/backend/src/backend/program.hpp
@@ -47,6 +47,7 @@ namespace gbe {
struct KernelArgument {
gbe_arg_type type; //!< Pointer, structure, image, regular value?
uint32_t size; //!< Size of the argument
+ uint32_t align; //!< addr alignment of the argument
uint32_t bufSize; //!< Contant buffer size
};
@@ -88,6 +89,9 @@ namespace gbe {
INLINE uint32_t getArgSize(uint32_t argID) const {
return argID >= argNum ? 0u : args[argID].size;
}
+ INLINE uint32_t getArgAlign(uint32_t argID) const {
+ return argID >= argNum ? 0u : args[argID].align;
+ }
/*! Return the type of the given argument */
INLINE gbe_arg_type getArgType(uint32_t argID) const {
return argID >= argNum ? GBE_ARG_INVALID : args[argID].type;
diff --git a/backend/src/ir/context.cpp b/backend/src/ir/context.cpp
index 400a2a0..d6815e1 100644
--- a/backend/src/ir/context.cpp
+++ b/backend/src/ir/context.cpp
@@ -105,10 +105,10 @@ namespace ir {
return index;
}
- void Context::input(const std::string &name, FunctionArgument::Type type, Register reg, uint32_t elementSize) {
+ void Context::input(const std::string &name, FunctionArgument::Type type, Register reg, uint32_t elementSize, uint32_t align) {
GBE_ASSERTM(fn != NULL, "No function currently defined");
GBE_ASSERTM(reg < fn->file.regNum(), "Out-of-bound register");
- FunctionArgument *arg = GBE_NEW(FunctionArgument, type, reg, elementSize, name);
+ FunctionArgument *arg = GBE_NEW(FunctionArgument, type, reg, elementSize, name, align);
fn->args.push_back(arg);
}
diff --git a/backend/src/ir/context.hpp b/backend/src/ir/context.hpp
index 242beaa..adeaf6f 100644
--- a/backend/src/ir/context.hpp
+++ b/backend/src/ir/context.hpp
@@ -107,7 +107,7 @@ namespace ir {
/*! Create a new label for the current function */
LabelIndex label(void);
/*! Append a new input register for the function */
- void input(const std::string &name, FunctionArgument::Type type, Register reg, uint32_t elemSz = 0u);
+ void input(const std::string &name, FunctionArgument::Type type, Register reg, uint32_t elemSz = 0u, uint32_t align = 0);
/*! Append a new output register for the function */
void output(Register reg);
/*! Get the immediate value */
diff --git a/backend/src/ir/function.hpp b/backend/src/ir/function.hpp
index 3d4733d..84d2504 100644
--- a/backend/src/ir/function.hpp
+++ b/backend/src/ir/function.hpp
@@ -106,11 +106,12 @@ namespace ir {
SAMPLER = 6
};
/*! Create a function input argument */
- INLINE FunctionArgument(Type type, Register reg, uint32_t size, const std::string &name) :
- type(type), reg(reg), size(size), name(name) {}
+ INLINE FunctionArgument(Type type, Register reg, uint32_t size, const std::string &name, uint32_t align) :
+ type(type), reg(reg), size(size), align(align), name(name) {}
Type type; //!< Gives the type of argument we have
Register reg; //!< Holds the argument
uint32_t size; //!< == sizeof(void*) for ptr, sizeof(elem) for the rest
+ uint32_t align; //!< address alignment for the argument
const std::string name; //!< Holds the function name for IR output
GBE_STRUCT(FunctionArgument); // Use custom allocator
};
diff --git a/backend/src/llvm/llvm_gen_backend.cpp b/backend/src/llvm/llvm_gen_backend.cpp
index a104df4..cb3511f 100644
--- a/backend/src/llvm/llvm_gen_backend.cpp
+++ b/backend/src/llvm/llvm_gen_backend.cpp
@@ -692,7 +692,8 @@ namespace gbe
void* mem = malloc(size);
uint32_t offset = 0;
getConstantData(c, mem, offset);
- unit.newConstant((char *)mem, name, size, sizeof(unsigned));
+ uint32_t alignment = getAlignmentByte(unit, type);
+ unit.newConstant((char *)mem, name, size, alignment);
free(mem);
}
}
@@ -1079,7 +1080,7 @@ namespace gbe
const uint32_t elemSize = getTypeByteSize(unit, elemType);
const uint32_t elemNum = vectorType->getNumElements();
//vector's elemType always scalar type
- ctx.input(argName, ir::FunctionArgument::VALUE, reg, elemNum*elemSize);
+ ctx.input(argName, ir::FunctionArgument::VALUE, reg, elemNum*elemSize, getAlignmentByte(unit, type));
ir::Function& fn = ctx.getFunction();
for(uint32_t i=1; i < elemNum; i++) {
@@ -1094,37 +1095,38 @@ namespace gbe
"vector type in the function argument is not supported yet");
const ir::Register reg = regTranslator.newScalar(I);
if (type->isPointerTy() == false)
- ctx.input(argName, ir::FunctionArgument::VALUE, reg, getTypeByteSize(unit, type));
+ ctx.input(argName, ir::FunctionArgument::VALUE, reg, getTypeByteSize(unit, type), getAlignmentByte(unit, type));
else {
PointerType *pointerType = dyn_cast<PointerType>(type);
+ Type *pointed = pointerType->getElementType();
// By value structure
#if LLVM_VERSION_MINOR <= 1
if (PAL.paramHasAttr(argID+1, Attribute::ByVal)) {
#else
if (I->hasByValAttr()) {
#endif /* LLVM_VERSION_MINOR <= 1 */
- Type *pointed = pointerType->getElementType();
const size_t structSize = getTypeByteSize(unit, pointed);
- ctx.input(argName, ir::FunctionArgument::STRUCTURE, reg, structSize);
+ ctx.input(argName, ir::FunctionArgument::STRUCTURE, reg, structSize, getAlignmentByte(unit, type));
}
// Regular user provided pointer (global, local or constant)
else {
const uint32_t addr = pointerType->getAddressSpace();
const ir::AddressSpace addrSpace = addressSpaceLLVMToGen(addr);
const uint32_t ptrSize = getTypeByteSize(unit, type);
+ const uint32_t align = getAlignmentByte(unit, pointed);
switch (addrSpace) {
case ir::MEM_GLOBAL:
- ctx.input(argName, ir::FunctionArgument::GLOBAL_POINTER, reg, ptrSize);
+ ctx.input(argName, ir::FunctionArgument::GLOBAL_POINTER, reg, ptrSize, align);
break;
case ir::MEM_LOCAL:
- ctx.input(argName, ir::FunctionArgument::LOCAL_POINTER, reg, ptrSize);
+ ctx.input(argName, ir::FunctionArgument::LOCAL_POINTER, reg, ptrSize, align);
ctx.getFunction().setUseSLM(true);
break;
case ir::MEM_CONSTANT:
- ctx.input(argName, ir::FunctionArgument::CONSTANT_POINTER, reg, ptrSize);
+ ctx.input(argName, ir::FunctionArgument::CONSTANT_POINTER, reg, ptrSize, align);
break;
case ir::IMAGE:
- ctx.input(argName, ir::FunctionArgument::IMAGE, reg, ptrSize);
+ ctx.input(argName, ir::FunctionArgument::IMAGE, reg, ptrSize, align);
ctx.getFunction().getImageSet()->append(reg, &ctx);
break;
default: GBE_ASSERT(addrSpace != ir::MEM_PRIVATE);
diff --git a/src/cl_command_queue_gen7.c b/src/cl_command_queue_gen7.c
index fb7a8a3..0788975 100644
--- a/src/cl_command_queue_gen7.c
+++ b/src/cl_command_queue_gen7.c
@@ -98,37 +98,47 @@ error:
static void
cl_upload_constant_buffer(cl_command_queue queue, cl_kernel ker)
{
- /* calculate constant buffer size */
+ /* calculate constant buffer size
+ * we need raw_size & aligned_size
+ */
GET_QUEUE_THREAD_GPGPU(queue);
int32_t arg;
- size_t offset;
+ size_t offset = 0;
+ uint32_t raw_size = 0, aligned_size =0;
gbe_program prog = ker->program->opaque;
const int32_t arg_n = gbe_kernel_get_arg_num(ker->opaque);
size_t global_const_size = gbe_program_get_global_constant_size(prog);
- uint32_t constant_buf_size = 0;
+ aligned_size = raw_size = global_const_size;
+ /* Reserve 8 bytes to get rid of 0 address */
+ if(global_const_size == 0) aligned_size = 8;
+
for (arg = 0; arg < arg_n; ++arg) {
const enum gbe_arg_type type = gbe_kernel_get_arg_type(ker->opaque, arg);
if (type == GBE_ARG_CONSTANT_PTR && ker->args[arg].mem) {
+ uint32_t alignment = gbe_kernel_get_arg_align(ker->opaque, arg);
+ assert(alignment != 0);
cl_mem mem = ker->args[arg].mem;
- constant_buf_size += ALIGN(mem->size, 4);
+ raw_size += mem->size;
+ aligned_size = ALIGN(aligned_size, alignment);
+ aligned_size += mem->size;
}
}
- if(global_const_size == 0 && constant_buf_size == 0)
+ if(raw_size == 0)
return;
- cl_buffer bo = cl_gpgpu_alloc_constant_buffer(gpgpu, constant_buf_size + global_const_size + 4);
+ cl_buffer bo = cl_gpgpu_alloc_constant_buffer(gpgpu, aligned_size);
cl_buffer_map(bo, 1);
char * cst_addr = cl_buffer_get_virtual(bo);
- offset = 0;
+
+ /* upload the global constant data */
if (global_const_size > 0) {
- /* Write the global constant arrays */
gbe_program_get_global_constant_data(prog, (char*)(cst_addr+offset));
+ offset += global_const_size;
}
- offset += ALIGN(global_const_size, 4);
+ /* reserve 8 bytes to get rid of 0 address */
if(global_const_size == 0) {
- /* reserve 4 bytes to get rid of 0 address */
- offset += 4;
+ offset = 8;
}
/* upload constant buffer argument */
@@ -137,7 +147,8 @@ cl_upload_constant_buffer(cl_command_queue queue, cl_kernel ker)
const enum gbe_arg_type type = gbe_kernel_get_arg_type(ker->opaque, arg);
if (type == GBE_ARG_CONSTANT_PTR && ker->args[arg].mem) {
cl_mem mem = ker->args[arg].mem;
-
+ uint32_t alignment = gbe_kernel_get_arg_align(ker->opaque, arg);
+ offset = ALIGN(offset, alignment);
curbe_offset = gbe_kernel_get_curbe_offset(ker->opaque, GBE_CURBE_KERNEL_ARGUMENT, arg);
assert(curbe_offset >= 0);
*(uint32_t *) (ker->curbe + curbe_offset) = offset;
@@ -146,7 +157,7 @@ cl_upload_constant_buffer(cl_command_queue queue, cl_kernel ker)
void * addr = cl_buffer_get_virtual(mem->bo);
memcpy(cst_addr + offset, addr, mem->size);
cl_buffer_unmap(mem->bo);
- offset += ALIGN(mem->size, 4);
+ offset += mem->size;
}
}
cl_buffer_unmap(bo);
@@ -201,12 +212,14 @@ cl_curbe_fill(cl_kernel ker,
}
/* Handle the various offsets to SLM */
const int32_t arg_n = gbe_kernel_get_arg_num(ker->opaque);
- /* align so that we kernel argument get good alignment */
- int32_t arg, slm_offset = ALIGN(gbe_kernel_get_slm_size(ker->opaque), 32);
+ int32_t arg, slm_offset = gbe_kernel_get_slm_size(ker->opaque);
for (arg = 0; arg < arg_n; ++arg) {
const enum gbe_arg_type type = gbe_kernel_get_arg_type(ker->opaque, arg);
if (type != GBE_ARG_LOCAL_PTR)
continue;
+ uint32_t align = gbe_kernel_get_arg_align(ker->opaque, arg);
+ assert(align != 0);
+ slm_offset = ALIGN(slm_offset, align);
offset = gbe_kernel_get_curbe_offset(ker->opaque, GBE_CURBE_KERNEL_ARGUMENT, arg);
assert(offset >= 0);
uint32_t *slmptr = (uint32_t *) (ker->curbe + offset);
--
1.7.9.5
More information about the Beignet
mailing list