[Beignet] [PATCH 1/2] Implement extension cl_intel_device_side_avc_motion_estimation.
Chuanbo Weng
chuanbo.weng at intel.com
Wed Apr 26 07:20:01 UTC 2017
This patch mainly contains:
1. built-in function __gen_ocl_ime implementation.
2. Lots of built-in functions of cl_intel_device_side_avc_motion_estimation
are implemented.
3. Three utest cases.
Signed-off-by: Chuanbo Weng <chuanbo.weng at intel.com>
---
backend/src/backend/gen/gen_mesa_disasm.c | 24 +-
backend/src/backend/gen8_instruction.hpp | 15 +
backend/src/backend/gen9_context.cpp | 105 ++
backend/src/backend/gen9_context.hpp | 1 +
backend/src/backend/gen9_encoder.cpp | 46 +
backend/src/backend/gen9_encoder.hpp | 9 +
backend/src/backend/gen_context.cpp | 56 +-
backend/src/backend/gen_context.hpp | 1 +
backend/src/backend/gen_defs.hpp | 1 +
backend/src/backend/gen_encoder.cpp | 8 +
backend/src/backend/gen_encoder.hpp | 4 +
.../src/backend/gen_insn_gen7_schedule_info.hxx | 1 +
backend/src/backend/gen_insn_selection.cpp | 63 +
backend/src/backend/gen_insn_selection.hpp | 12 +-
backend/src/backend/gen_insn_selection.hxx | 1 +
backend/src/ir/instruction.cpp | 56 +
backend/src/ir/instruction.hpp | 14 +-
backend/src/ir/instruction.hxx | 1 +
backend/src/ir/liveness.cpp | 1 +
backend/src/libocl/include/ocl_misc.h | 364 ++++++
backend/src/libocl/src/ocl_misc.cl | 1325 ++++++++++++++++++++
backend/src/llvm/llvm_gen_backend.cpp | 36 +
backend/src/llvm/llvm_gen_ocl_function.hxx | 1 +
backend/src/llvm/llvm_scalarize.cpp | 1 +
kernels/compiler_block_motion_estimate_intel.cl | 76 ++
kernels/compiler_intra_prediction.cl | 91 ++
kernels/compiler_skip_check.cl | 53 +
src/cl_command_queue.c | 7 +
src/cl_device_id.c | 8 +-
src/cl_extensions.c | 2 +-
src/cl_extensions.h | 5 +-
src/intel/intel_gpgpu.c | 70 ++
src/intel/intel_structs.h | 63 +
utests/CMakeLists.txt | 10 +-
utests/compiler_block_motion_estimate_intel.cpp | 156 +++
utests/compiler_intra_prediction.cpp | 116 ++
utests/compiler_skip_check.cpp | 190 +++
utests/utest_helper.cpp | 18 +
utests/utest_helper.hpp | 3 +
39 files changed, 2976 insertions(+), 38 deletions(-)
create mode 100644 kernels/compiler_block_motion_estimate_intel.cl
create mode 100644 kernels/compiler_intra_prediction.cl
create mode 100644 kernels/compiler_skip_check.cl
create mode 100644 utests/compiler_block_motion_estimate_intel.cpp
create mode 100644 utests/compiler_intra_prediction.cpp
create mode 100644 utests/compiler_skip_check.cpp
diff --git a/backend/src/backend/gen/gen_mesa_disasm.c b/backend/src/backend/gen/gen_mesa_disasm.c
index 8a2afe5..ca36afa 100644
--- a/backend/src/backend/gen/gen_mesa_disasm.c
+++ b/backend/src/backend/gen/gen_mesa_disasm.c
@@ -370,6 +370,7 @@ static const char *target_function_gen75[16] = {
[GEN_SFID_DATAPORT_DATA] = "data (0)",
[GEN_SFID_PIXEL_INTERPOLATOR] = "pix_interpolator",
[GEN_SFID_DATAPORT1_DATA] = "data (1)",
+ [GEN_SFID_CHECK_REFINE] = "check_and_refine",
};
static const char *gateway_sub_function[8] = {
@@ -527,6 +528,13 @@ static int gen_version;
bits; \
})
+#define GEN8_BITS_FIELD(inst, gen8) \
+ ({ \
+ int bits; \
+ bits = ((const union Gen8NativeInstruction *)inst)->gen8; \
+ bits; \
+ })
+
#define GEN_BITS_FIELD(inst, gen) \
({ \
int bits; \
@@ -583,6 +591,8 @@ static int gen_version;
#define BRANCH_UIP(inst) GEN_BITS_FIELD2(inst, bits3.gen7_branch.uip, bits2.gen8_branch.uip/8)
#define VME_BTI(inst) GEN7_BITS_FIELD(inst, bits3.vme_gen7.bti)
#define VME_MSG_TYPE(inst) GEN7_BITS_FIELD(inst, bits3.vme_gen7.msg_type)
+#define IME_BTI(inst) GEN8_BITS_FIELD(inst, bits3.ime_gen8.bti)
+#define IME_MSG_TYPE(inst) GEN8_BITS_FIELD(inst, bits3.ime_gen8.msg_type)
#define SAMPLE_BTI(inst) GEN_BITS_FIELD(inst, bits3.sampler_gen7.bti)
#define SAMPLER(inst) GEN_BITS_FIELD(inst, bits3.sampler_gen7.sampler)
#define SAMPLER_MSG_TYPE(inst) GEN_BITS_FIELD(inst, bits3.sampler_gen7.msg_type)
@@ -1510,9 +1520,19 @@ int gen_disasm (FILE *file, const void *inst, uint32_t deviceID, uint32_t compac
if (immbti) {
switch (target) {
case GEN_SFID_VIDEO_MOTION_EST:
+ if(gen_version == 7)
+ format(file, " (bti: %d, msg_type: %d)",
+ VME_BTI(inst),
+ VME_MSG_TYPE(inst));
+ else if(gen_version == 9)
+ format(file, " (bti: %d, msg_type: %d)",
+ IME_BTI(inst),
+ IME_MSG_TYPE(inst));
+ break;
+ case GEN_SFID_CHECK_REFINE:
format(file, " (bti: %d, msg_type: %d)",
- VME_BTI(inst),
- VME_MSG_TYPE(inst));
+ IME_BTI(inst),
+ IME_MSG_TYPE(inst));
break;
case GEN_SFID_SAMPLER:
format(file, " (%d, %d, %d, %d)",
diff --git a/backend/src/backend/gen8_instruction.hpp b/backend/src/backend/gen8_instruction.hpp
index 446e7f9..79e1b09 100644
--- a/backend/src/backend/gen8_instruction.hpp
+++ b/backend/src/backend/gen8_instruction.hpp
@@ -430,6 +430,21 @@ union Gen8NativeInstruction
uint32_t end_of_thread:1;
} sampler_gen7;
+ struct {
+ uint32_t bti:8;
+ uint32_t pad0:5;
+ uint32_t msg_type:2;
+ uint32_t stream_out_enable:1;
+ uint32_t stream_in_enable:1;
+ uint32_t stream_out_enable2:1;
+ uint32_t pad1:1;
+ uint32_t header_present:1;
+ uint32_t response_length:5;
+ uint32_t msg_length:4;
+ uint32_t pad2:2;
+ uint32_t end_of_thread:1;
+ } ime_gen8;
+
/**
* Message for the Sandybridge Sampler Cache or Constant Cache Data Port.
*
diff --git a/backend/src/backend/gen9_context.cpp b/backend/src/backend/gen9_context.cpp
index 483b2c3..1de659b 100644
--- a/backend/src/backend/gen9_context.cpp
+++ b/backend/src/backend/gen9_context.cpp
@@ -62,6 +62,111 @@ namespace gbe
}
}
+ void Gen9Context::emitImeInstruction(const SelectionInstruction &insn) {
+ const GenRegister dst = ra->genReg(insn.dst(0));
+ const unsigned int msg_type = insn.extra.ime_msg_type;
+
+ GBE_ASSERT(msg_type == 1 || msg_type == 2 || msg_type == 3);
+ uint32_t execWidth_org = p->curr.execWidth;
+ int virt_pld_len;
+ int phi_pld_len;
+ int virt_rsp_len;
+
+#define PHI_SIC_PAYLOAD_LEN 8
+#define PHI_IME_PAYLOAD_LEN 6
+#define PHI_VME_WRITEBACK_LEN 7
+
+ if(msg_type == 1 || msg_type == 2 || msg_type == 3)
+ virt_rsp_len = PHI_VME_WRITEBACK_LEN;
+ if(msg_type == 1 || msg_type == 3)
+ phi_pld_len = PHI_SIC_PAYLOAD_LEN;
+ else if(msg_type == 2)
+ phi_pld_len = PHI_IME_PAYLOAD_LEN;
+ if(execWidth_org == 8)
+ virt_pld_len = phi_pld_len;
+ else if(execWidth_org == 16)
+ virt_pld_len = (phi_pld_len + 1) / 2;
+ p->push();
+ p->curr.predicate = GEN_PREDICATE_NONE;
+ p->curr.noMask = 1;
+ p->curr.execWidth = 1;
+ /* Now cl_intel_device_side_avc_motion_estimation is impelemented based on simd16 mode.
+ * So fall back to simd8 is not acceptable now.
+ * */
+ GBE_ASSERT(execWidth_org == 16);
+ /* Use MOV to Setup bits of payload: mov payload value stored in insn.src(x) to
+ * consecutive payload grf.
+ * In simd8 mode, one virtual grf register map to one physical grf register. But
+ * in simd16 mode, one virtual grf register map to two physical grf registers.
+ * So we should treat them differently.
+ * */
+ if(execWidth_org == 8){
+ for(int i=0; i < virt_pld_len; i++){
+ GenRegister payload_grf = ra->genReg(insn.dst(virt_rsp_len+i));
+ payload_grf.vstride = GEN_VERTICAL_STRIDE_0;
+ payload_grf.width = GEN_WIDTH_1;
+ payload_grf.hstride = GEN_HORIZONTAL_STRIDE_0;
+ payload_grf.subphysical = 1;
+ for(int j=0; j < 8; j++){
+ payload_grf.subnr = (7 - j) * typeSize(GEN_TYPE_UD);
+ GenRegister payload_val = ra->genReg(insn.src(i*8+j));
+ payload_val.vstride = GEN_VERTICAL_STRIDE_0;
+ payload_val.width = GEN_WIDTH_1;
+ payload_val.hstride = GEN_HORIZONTAL_STRIDE_0;
+
+ p->MOV(payload_grf, payload_val);
+ }
+ }
+ }
+ else if(execWidth_org == 16){
+ for(int i=0; i < virt_pld_len; i++){
+ int nr_num = 2;
+ if( (i == virt_pld_len-1) && (phi_pld_len%2 == 1) )
+ nr_num = 1;
+ for(int k = 0; k < nr_num; k++){
+ GenRegister payload_grf = ra->genReg(insn.dst(virt_rsp_len+i));
+ payload_grf.nr += k;
+ payload_grf.vstride = GEN_VERTICAL_STRIDE_0;
+ payload_grf.width = GEN_WIDTH_1;
+ payload_grf.hstride = GEN_HORIZONTAL_STRIDE_0;
+ payload_grf.subphysical = 1;
+ for(int j=0; j < 8; j++){
+ payload_grf.subnr = (7 - j) * typeSize(GEN_TYPE_UD);
+ GenRegister payload_val = ra->genReg(insn.src(i*16+k*8+j));
+ payload_val.vstride = GEN_VERTICAL_STRIDE_0;
+ payload_val.width = GEN_WIDTH_1;
+ payload_val.hstride = GEN_HORIZONTAL_STRIDE_0;
+
+ p->MOV(payload_grf, payload_val);
+ }
+ }
+ }
+ }
+ p->pop();
+
+#undef PHI_SIC_PAYLOAD_LEN
+#undef PHI_IME_PAYLOAD_LEN
+#undef PHI_VME_WRITEBACK_LEN
+
+ p->push();
+ p->curr.predicate = GEN_PREDICATE_NONE;
+ p->curr.noMask = 1;
+ p->curr.execWidth = 1;
+ GenRegister payload_did = GenRegister::retype(ra->genReg(insn.dst(virt_rsp_len)), GEN_TYPE_UB);
+ payload_did.vstride = GEN_VERTICAL_STRIDE_0;
+ payload_did.width = GEN_WIDTH_1;
+ payload_did.hstride = GEN_HORIZONTAL_STRIDE_0;
+ payload_did.subphysical = 1;
+ payload_did.subnr = 20 * typeSize(GEN_TYPE_UB);
+ GenRegister grf0 = GenRegister::ub1grf(0, 20);
+ p->MOV(payload_did, grf0);
+ p->pop();
+
+ const GenRegister msgPayload = ra->genReg(insn.dst(virt_rsp_len));
+ const unsigned char bti = insn.getbti();
+ p->IME(bti, dst, msgPayload, msg_type);
+ }
+
void BxtContext::newSelection(void) {
this->sel = GBE_NEW(SelectionBxt, *this);
}
diff --git a/backend/src/backend/gen9_context.hpp b/backend/src/backend/gen9_context.hpp
index 9977e9a..825b53c 100644
--- a/backend/src/backend/gen9_context.hpp
+++ b/backend/src/backend/gen9_context.hpp
@@ -37,6 +37,7 @@ namespace gbe
: Gen8Context(unit, name, deviceID, relaxMath) {
};
virtual void emitBarrierInstruction(const SelectionInstruction &insn);
+ virtual void emitImeInstruction(const SelectionInstruction &insn);
protected:
virtual GenEncoder* generateEncoder(void) {
diff --git a/backend/src/backend/gen9_encoder.cpp b/backend/src/backend/gen9_encoder.cpp
index b37fd98..c766232 100644
--- a/backend/src/backend/gen9_encoder.cpp
+++ b/backend/src/backend/gen9_encoder.cpp
@@ -75,6 +75,52 @@ namespace gbe
simd_mode, return_format);
}
+ void Gen9Encoder::setImeMessage(GenNativeInstruction *insn,
+ unsigned char bti,
+ uint32_t response_length,
+ uint32_t msg_length,
+ uint32_t msg_type)
+ {
+
+ GenMessageTarget sfid;
+ if(msg_type == 1 || msg_type == 3)
+ // 0Dh Check and Refinement Engine SFID_CRE SKL+ (SIC and FBR blong to SFID_CRE on SKL+)
+ sfid = GEN_SFID_CHECK_REFINE;
+ else if(msg_type == 2)
+ sfid = GEN_SFID_VIDEO_MOTION_EST;
+ setMessageDescriptor(insn, sfid, msg_length, response_length, true);
+ Gen8NativeInstruction *gen8_insn = &insn->gen8_insn;
+ gen8_insn->bits3.ime_gen8.bti = bti;
+ gen8_insn->bits3.ime_gen8.msg_type = msg_type;
+ gen8_insn->bits3.ime_gen8.stream_out_enable = 0;
+ gen8_insn->bits3.ime_gen8.stream_in_enable = 0;
+ gen8_insn->bits3.ime_gen8.stream_out_enable2 = 0;
+
+ }
+
+ void Gen9Encoder::IME(unsigned char bti,
+ GenRegister dest,
+ GenRegister msg,
+ uint32_t msg_type)
+ {
+ GBE_ASSERT(msg_type == 1 || msg_type == 2 || msg_type == 3);
+ uint32_t msg_length, response_length;
+ if(msg_type == 1 || msg_type == 3){
+ msg_length = 8;
+ response_length = 7;
+ }
+ if(msg_type == 2){
+ msg_length = 6;
+ response_length = 7;
+ }
+ GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
+ this->setHeader(insn);
+ this->setDst(insn, dest);
+ this->setSrc0(insn, msg);
+ this->setSrc1(insn, GenRegister::immud(0));
+ setImeMessage(insn, bti, response_length, msg_length, msg_type);
+ }
+
void Gen9Encoder::setSendsOperands(Gen9NativeInstruction *gen9_insn, GenRegister dst, GenRegister src0, GenRegister src1)
{
assert(dst.subnr == 0 && src0.subnr == 0 && src1.subnr == 0);
diff --git a/backend/src/backend/gen9_encoder.hpp b/backend/src/backend/gen9_encoder.hpp
index 2eaa538..b862649 100644
--- a/backend/src/backend/gen9_encoder.hpp
+++ b/backend/src/backend/gen9_encoder.hpp
@@ -47,6 +47,15 @@ namespace gbe
uint32_t return_format,
bool isLD,
bool isUniform);
+ virtual void IME(unsigned char bti,
+ GenRegister dest,
+ GenRegister msg,
+ uint32_t msg_type);
+ void setImeMessage(GenNativeInstruction *insn,
+ unsigned char bti,
+ uint32_t response_length,
+ uint32_t msg_length,
+ uint32_t msg_type);
void setSendsOperands(Gen9NativeInstruction *gen9_insn, GenRegister dst, GenRegister src0, GenRegister src1);
virtual void UNTYPED_WRITE(GenRegister addr, GenRegister data, GenRegister bti, uint32_t elemNum, bool useSends);
virtual void TYPED_WRITE(GenRegister header, GenRegister data, bool header_present, unsigned char bti, bool useSends);
diff --git a/backend/src/backend/gen_context.cpp b/backend/src/backend/gen_context.cpp
index 79a3e62..0b171ff 100644
--- a/backend/src/backend/gen_context.cpp
+++ b/backend/src/backend/gen_context.cpp
@@ -2339,10 +2339,20 @@ namespace gbe
const unsigned int msg_type = insn.extra.msg_type;
GBE_ASSERT(msg_type == 1);
- int rsp_len;
- if(msg_type == 1)
- rsp_len = 6;
uint32_t execWidth_org = p->curr.execWidth;
+ int virt_pld_len;
+ int virt_rsp_len;
+
+#define PHI_VME_PAYLOAD_LEN 5
+#define PHI_VME_WRITEBACK_LEN 6
+
+ if(msg_type == 1){
+ virt_rsp_len = PHI_VME_WRITEBACK_LEN;
+ if(execWidth_org == 8)
+ virt_pld_len = PHI_VME_PAYLOAD_LEN;
+ else if(execWidth_org == 16)
+ virt_pld_len = (PHI_VME_PAYLOAD_LEN + 1) / 2;
+ }
p->push();
p->curr.predicate = GEN_PREDICATE_NONE;
p->curr.noMask = 1;
@@ -2354,8 +2364,8 @@ namespace gbe
* So we should treat them differently.
* */
if(execWidth_org == 8){
- for(int i=0; i < 5; i++){
- GenRegister payload_grf = ra->genReg(insn.dst(rsp_len+i));
+ for(int i=0; i < virt_pld_len; i++){
+ GenRegister payload_grf = ra->genReg(insn.dst(virt_rsp_len+i));
payload_grf.vstride = GEN_VERTICAL_STRIDE_0;
payload_grf.width = GEN_WIDTH_1;
payload_grf.hstride = GEN_HORIZONTAL_STRIDE_0;
@@ -2372,9 +2382,12 @@ namespace gbe
}
}
else if(execWidth_org == 16){
- for(int i=0; i < 2; i++){
- for(int k = 0; k < 2; k++){
- GenRegister payload_grf = ra->genReg(insn.dst(rsp_len+i));
+ for(int i=0; i < virt_pld_len; i++){
+ int nr_num = 2;
+ if( (i == virt_pld_len-1) && (PHI_VME_PAYLOAD_LEN%2 == 1) )
+ nr_num = 1;
+ for(int k = 0; k < nr_num; k++){
+ GenRegister payload_grf = ra->genReg(insn.dst(virt_rsp_len+i));
payload_grf.nr += k;
payload_grf.vstride = GEN_VERTICAL_STRIDE_0;
payload_grf.width = GEN_WIDTH_1;
@@ -2391,31 +2404,16 @@ namespace gbe
}
}
}
- {
- int i = 2;
- GenRegister payload_grf = ra->genReg(insn.dst(rsp_len+i));
- payload_grf.vstride = GEN_VERTICAL_STRIDE_0;
- payload_grf.width = GEN_WIDTH_1;
- payload_grf.hstride = GEN_HORIZONTAL_STRIDE_0;
- payload_grf.subphysical = 1;
- for(int j=0; j < 8; j++){
- payload_grf.subnr = (7 - j) * typeSize(GEN_TYPE_UD);
- GenRegister payload_val = ra->genReg(insn.src(i*16+j));
- payload_val.vstride = GEN_VERTICAL_STRIDE_0;
- payload_val.width = GEN_WIDTH_1;
- payload_val.hstride = GEN_HORIZONTAL_STRIDE_0;
-
- p->MOV(payload_grf, payload_val);
- }
- }
}
p->pop();
+#undef PHI_VME_PAYLOAD_LEN
+#undef PHI_VME_WRITEBACK_LEN
p->push();
p->curr.predicate = GEN_PREDICATE_NONE;
p->curr.noMask = 1;
p->curr.execWidth = 1;
- GenRegister payload_did = GenRegister::retype(ra->genReg(insn.dst(rsp_len)), GEN_TYPE_UB);
+ GenRegister payload_did = GenRegister::retype(ra->genReg(insn.dst(virt_rsp_len)), GEN_TYPE_UB);
payload_did.vstride = GEN_VERTICAL_STRIDE_0;
payload_did.width = GEN_WIDTH_1;
payload_did.hstride = GEN_HORIZONTAL_STRIDE_0;
@@ -2425,13 +2423,17 @@ namespace gbe
p->MOV(payload_did, grf0);
p->pop();
- const GenRegister msgPayload = ra->genReg(insn.dst(rsp_len));
+ const GenRegister msgPayload = ra->genReg(insn.dst(virt_rsp_len));
const unsigned char bti = insn.getbti();
const unsigned int vme_search_path_lut = insn.extra.vme_search_path_lut;
const unsigned int lut_sub = insn.extra.lut_sub;
p->VME(bti, dst, msgPayload, msg_type, vme_search_path_lut, lut_sub);
}
+ void GenContext::emitImeInstruction(const SelectionInstruction &insn) {
+ GBE_ASSERT(0);
+ }
+
void GenContext::scratchWrite(const GenRegister header, uint32_t offset, uint32_t reg_num, uint32_t reg_type, uint32_t channel_mode) {
p->push();
uint32_t simdWidth = p->curr.execWidth;
diff --git a/backend/src/backend/gen_context.hpp b/backend/src/backend/gen_context.hpp
index 7fd40d1..fa24bfe 100644
--- a/backend/src/backend/gen_context.hpp
+++ b/backend/src/backend/gen_context.hpp
@@ -178,6 +178,7 @@ namespace gbe
void emitDWordGatherInstruction(const SelectionInstruction &insn);
void emitSampleInstruction(const SelectionInstruction &insn);
void emitVmeInstruction(const SelectionInstruction &insn);
+ virtual void emitImeInstruction(const SelectionInstruction &insn);
void emitTypedWriteInstruction(const SelectionInstruction &insn);
void emitSpillRegInstruction(const SelectionInstruction &insn);
void emitUnSpillRegInstruction(const SelectionInstruction &insn);
diff --git a/backend/src/backend/gen_defs.hpp b/backend/src/backend/gen_defs.hpp
index c34e1bb..90de946 100644
--- a/backend/src/backend/gen_defs.hpp
+++ b/backend/src/backend/gen_defs.hpp
@@ -219,6 +219,7 @@ enum GenMessageTarget {
GEN_SFID_DATAPORT_DATA = 10,
GEN_SFID_PIXEL_INTERPOLATOR = 11,
GEN_SFID_DATAPORT1_DATA = 12, /* New for HSW and BDW. */
+ GEN_SFID_CHECK_REFINE = 13, /* New for SLK+*/
};
#define GEN_PREDICATE_NONE 0
diff --git a/backend/src/backend/gen_encoder.cpp b/backend/src/backend/gen_encoder.cpp
index 217a2d8..abd0d06 100644
--- a/backend/src/backend/gen_encoder.cpp
+++ b/backend/src/backend/gen_encoder.cpp
@@ -1276,6 +1276,14 @@ namespace gbe
msg_type, vme_search_path_lut, lut_sub);
}
+ void GenEncoder::IME(unsigned char bti,
+ GenRegister dest,
+ GenRegister msg,
+ uint32_t msg_type)
+ {
+ GBE_ASSERT(0);
+ }
+
void GenEncoder::TYPED_WRITE(GenRegister msg, GenRegister data, bool header_present, unsigned char bti, bool useSends)
{
GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
diff --git a/backend/src/backend/gen_encoder.hpp b/backend/src/backend/gen_encoder.hpp
index 040b94a..fae8da1 100644
--- a/backend/src/backend/gen_encoder.hpp
+++ b/backend/src/backend/gen_encoder.hpp
@@ -231,6 +231,10 @@ namespace gbe
uint32_t msg_type,
unsigned char vme_search_path_lut,
unsigned char lut_sub);
+ virtual void IME(unsigned char bti,
+ GenRegister dest,
+ GenRegister msg,
+ uint32_t msg_type);
virtual void FLUSH_SAMPLERCACHE(GenRegister dst);
/*! TypedWrite instruction for texture */
diff --git a/backend/src/backend/gen_insn_gen7_schedule_info.hxx b/backend/src/backend/gen_insn_gen7_schedule_info.hxx
index c75557c..d15547d 100644
--- a/backend/src/backend/gen_insn_gen7_schedule_info.hxx
+++ b/backend/src/backend/gen_insn_gen7_schedule_info.hxx
@@ -43,6 +43,7 @@ DECL_GEN7_SCHEDULE(PackLong, 40, 1, 1)
DECL_GEN7_SCHEDULE(UnpackLong, 40, 1, 1)
DECL_GEN7_SCHEDULE(Sample, 160, 1, 1)
DECL_GEN7_SCHEDULE(Vme, 320, 1, 1)
+DECL_GEN7_SCHEDULE(Ime, 320, 1, 1)
DECL_GEN7_SCHEDULE(TypedWrite, 80, 1, 1)
DECL_GEN7_SCHEDULE(SpillReg, 20, 1, 1)
DECL_GEN7_SCHEDULE(UnSpillReg, 160, 1, 1)
diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp
index 41ef7b8..9cc44de 100644
--- a/backend/src/backend/gen_insn_selection.cpp
+++ b/backend/src/backend/gen_insn_selection.cpp
@@ -193,6 +193,7 @@ namespace gbe
this->opcode == SEL_OP_BYTE_GATHERA64 ||
this->opcode == SEL_OP_SAMPLE ||
this->opcode == SEL_OP_VME ||
+ this->opcode == SEL_OP_IME ||
this->opcode == SEL_OP_DWORD_GATHER ||
this->opcode == SEL_OP_OBREAD ||
this->opcode == SEL_OP_MBREAD;
@@ -740,6 +741,7 @@ namespace gbe
void SAMPLE(GenRegister *dst, uint32_t dstNum, GenRegister *msgPayloads, uint32_t msgNum, uint32_t bti, uint32_t sampler, bool isLD, bool isUniform);
/*! Encode vme instructions */
void VME(uint32_t bti, GenRegister *dst, GenRegister *payloadVal, uint32_t dstNum, uint32_t srcNum, uint32_t msg_type, uint32_t vme_search_path_lut, uint32_t lut_sub);
+ void IME(uint32_t bti, GenRegister *dst, GenRegister *payloadVal, uint32_t dstNum, uint32_t srcNum, uint32_t msg_type);
/*! Encode typed write instructions */
void TYPED_WRITE(GenRegister *msgs, uint32_t msgNum, uint32_t bti, bool is3D);
/*! Get image information */
@@ -2733,6 +2735,25 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp
insn->extra.lut_sub = lut_sub;
}
+ void Selection::Opaque::IME(uint32_t bti, GenRegister *dst, GenRegister *payloadVal,
+ uint32_t dstNum, uint32_t srcNum, uint32_t msg_type) {
+ SelectionInstruction *insn = this->appendInsn(SEL_OP_IME, dstNum, srcNum);
+ SelectionVector *dstVector = this->appendVector();
+
+ for (uint32_t elemID = 0; elemID < dstNum; ++elemID)
+ insn->dst(elemID) = dst[elemID];
+ for (uint32_t elemID = 0; elemID < srcNum; ++elemID)
+ insn->src(elemID) = payloadVal[elemID];
+
+ dstVector->regNum = dstNum;
+ dstVector->isSrc = 0;
+ dstVector->offsetID = 0;
+ dstVector->reg = &insn->dst(0);
+
+ insn->setbti(bti);
+ insn->extra.ime_msg_type = msg_type;
+ }
+
///////////////////////////////////////////////////////////////////////////
// Code selection public implementation
///////////////////////////////////////////////////////////////////////////
@@ -6938,6 +6959,47 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp
DECL_CTOR(VmeInstruction, 1, 1);
};
+ DECL_PATTERN(ImeInstruction)
+ {
+ INLINE bool emitOne(Selection::Opaque &sel, const ir::ImeInstruction &insn, bool &markChildren) const
+ {
+ using namespace ir;
+ uint32_t msg_type;
+ msg_type = insn.getMsgType();
+ GBE_ASSERT(msg_type == 1 || msg_type == 2 || msg_type == 3);
+ uint32_t payloadLen = 0;
+ if(msg_type == 2){
+ payloadLen = 6;
+ }
+ else if(msg_type == 1 || msg_type == 3){
+ payloadLen = 8;
+ }
+ uint32_t selDstNum = insn.getDstNum() + payloadLen;
+ uint32_t srcNum = insn.getSrcNum();
+ vector<GenRegister> dst(selDstNum);
+ vector<GenRegister> payloadVal(srcNum);
+ uint32_t valueID = 0;
+ for (valueID = 0; valueID < insn.getDstNum(); ++valueID)
+ dst[valueID] = sel.selReg(insn.getDst(valueID), insn.getDstType());
+ for (valueID = insn.getDstNum(); valueID < selDstNum; ++valueID)
+ dst[valueID] = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32);
+
+ for (valueID = 0; valueID < srcNum; ++valueID)
+ payloadVal[valueID] = sel.selReg(insn.getSrc(valueID), insn.getSrcType());
+
+ uint32_t bti = insn.getImageIndex() + BTI_WORKAROUND_IMAGE_OFFSET;
+ if (bti > BTI_MAX_ID) {
+ std::cerr << "Too large bti " << bti;
+ return false;
+ }
+
+ sel.IME(bti, dst.data(), payloadVal.data(), selDstNum, srcNum, msg_type);
+
+ return true;
+ }
+ DECL_CTOR(ImeInstruction, 1, 1);
+ };
+
/*! Typed write instruction pattern. */
DECL_PATTERN(TypedWriteInstruction)
{
@@ -8093,6 +8155,7 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp
this->insert<SelectModifierInstructionPattern>();
this->insert<SampleInstructionPattern>();
this->insert<VmeInstructionPattern>();
+ this->insert<ImeInstructionPattern>();
this->insert<GetImageInfoInstructionPattern>();
this->insert<ReadARFInstructionPattern>();
this->insert<RegionInstructionPattern>();
diff --git a/backend/src/backend/gen_insn_selection.hpp b/backend/src/backend/gen_insn_selection.hpp
index 0eeaa5d..47bacfc 100644
--- a/backend/src/backend/gen_insn_selection.hpp
+++ b/backend/src/backend/gen_insn_selection.hpp
@@ -96,8 +96,8 @@ namespace gbe
const GenRegister &src(uint32_t srcID) const { return regs[dstNum+srcID]; }
/*! Set debug infomation to selection */
void setDBGInfo(DebugInfo in) { DBGInfo = in; }
- /*! No more than 40 sources (40 sources are used by vme for payload passing and setting) */
- enum { MAX_SRC_NUM = 40 };
+ /*! No more than 64 sources (48 sources are used by vme for payload passing and setting) */
+ enum { MAX_SRC_NUM = 64 };
/*! No more than 17 destinations (17 used by image block read8) */
enum { MAX_DST_NUM = 17 };
/*! State of the instruction (extra fields neeed for the encoding) */
@@ -143,6 +143,10 @@ namespace gbe
uint16_t vme_search_path_lut:3;
uint16_t lut_sub:2;
};
+ struct {
+ uint16_t ime_bti:8;
+ uint16_t ime_msg_type:2;
+ };
uint32_t barrierType;
uint32_t waitType;
bool longjmp;
@@ -172,7 +176,7 @@ namespace gbe
/*! Number of destinations */
uint8_t dstNum:5;
/*! Number of sources */
- uint8_t srcNum:6;
+ uint8_t srcNum:7;
/*! To store various indices */
uint32_t index;
/*! For BRC/IF to store the UIP */
@@ -192,6 +196,7 @@ namespace gbe
case SEL_OP_DWORD_GATHER: return extra.function;
case SEL_OP_SAMPLE: return extra.rdbti;
case SEL_OP_VME: return extra.vme_bti;
+ case SEL_OP_IME: return extra.ime_bti;
case SEL_OP_TYPED_WRITE: return extra.bti;
default:
GBE_ASSERT(0);
@@ -209,6 +214,7 @@ namespace gbe
case SEL_OP_DWORD_GATHER: extra.function = bti; return;
case SEL_OP_SAMPLE: extra.rdbti = bti; return;
case SEL_OP_VME: extra.vme_bti = bti; return;
+ case SEL_OP_IME: extra.ime_bti = bti; return;
case SEL_OP_TYPED_WRITE: extra.bti = bti; return;
default:
GBE_ASSERT(0);
diff --git a/backend/src/backend/gen_insn_selection.hxx b/backend/src/backend/gen_insn_selection.hxx
index 5d96e9e..24dd040 100644
--- a/backend/src/backend/gen_insn_selection.hxx
+++ b/backend/src/backend/gen_insn_selection.hxx
@@ -72,6 +72,7 @@ DECL_SELECTION_IR(PACK_LONG, PackLongInstruction)
DECL_SELECTION_IR(UNPACK_LONG, UnpackLongInstruction)
DECL_SELECTION_IR(SAMPLE, SampleInstruction)
DECL_SELECTION_IR(VME, VmeInstruction)
+DECL_SELECTION_IR(IME, ImeInstruction)
DECL_SELECTION_IR(TYPED_WRITE, TypedWriteInstruction)
DECL_SELECTION_IR(SPILL_REG, SpillRegInstruction)
DECL_SELECTION_IR(UNSPILL_REG, UnSpillRegInstruction)
diff --git a/backend/src/ir/instruction.cpp b/backend/src/ir/instruction.cpp
index a9156ff..fd60eb8 100644
--- a/backend/src/ir/instruction.cpp
+++ b/backend/src/ir/instruction.cpp
@@ -682,6 +682,50 @@ namespace ir {
uint32_t dstNum;
};
+ class ALIGNED_INSTRUCTION ImeInstruction :
+ public BasePolicy,
+ public TupleSrcPolicy<ImeInstruction>,
+ public TupleDstPolicy<ImeInstruction>
+ {
+ public:
+ ImeInstruction(uint8_t imageIdx, Tuple dstTuple, Tuple srcTuple,
+ uint32_t dstNum, uint32_t srcNum, int msg_type) {
+ this->opcode = OP_IME;
+ this->dst = dstTuple;
+ this->src = srcTuple;
+ this->dstNum = dstNum;
+ this->srcNum = srcNum;
+ this->imageIdx = imageIdx;
+ this->msg_type = msg_type;
+ }
+ INLINE bool wellFormed(const Function &fn, std::string &why) const;
+ INLINE void out(std::ostream &out, const Function &fn) const {
+ this->outOpcode(out);
+ out << " src_surface id " << (int)this->getImageIndex()
+ << " ref_surface id " << (int)this->getImageIndex() + 1;
+ for(uint32_t i = 0; i < dstNum; i++){
+ out<< " %" << this->getDst(fn, i);
+ }
+ for(uint32_t i = 0; i < srcNum; i++){
+ out<< " %" << this->getSrc(fn, i);
+ }
+ out
+ << " msg_type " << (int)this->getMsgType();
+ }
+ Tuple src;
+ Tuple dst;
+
+ INLINE uint8_t getImageIndex(void) const { return this->imageIdx; }
+ INLINE uint8_t getMsgType(void) const { return this->msg_type; }
+
+ INLINE Type getSrcType(void) const { return TYPE_U32; }
+ INLINE Type getDstType(void) const { return TYPE_U32; }
+ uint8_t imageIdx;
+ uint8_t msg_type;
+ uint32_t srcNum;
+ uint32_t dstNum;
+ };
+
class ALIGNED_INSTRUCTION TypedWriteInstruction : // TODO
public BasePolicy,
@@ -1451,6 +1495,8 @@ namespace ir {
{ return true; }
INLINE bool VmeInstruction::wellFormed(const Function &fn, std::string &why) const
{ return true; }
+ INLINE bool ImeInstruction::wellFormed(const Function &fn, std::string &why) const
+ { return true; }
INLINE bool TypedWriteInstruction::wellFormed(const Function &fn, std::string &why) const
{ return true; }
INLINE bool GetImageInfoInstruction::wellFormed(const Function &fn, std::string &why) const
@@ -2179,6 +2225,9 @@ END_INTROSPECTION(WaitInstruction)
START_INTROSPECTION(VmeInstruction)
#include "ir/instruction.hxx"
END_INTROSPECTION(VmeInstruction)
+START_INTROSPECTION(ImeInstruction)
+#include "ir/instruction.hxx"
+END_INTROSPECTION(ImeInstruction)
START_INTROSPECTION(WorkGroupInstruction)
#include "ir/instruction.hxx"
@@ -2401,6 +2450,10 @@ DECL_MEM_FN(VmeInstruction, Type, getSrcType(void), getSrcType())
DECL_MEM_FN(VmeInstruction, Type, getDstType(void), getDstType())
DECL_MEM_FN(VmeInstruction, uint8_t, getImageIndex(void), getImageIndex())
DECL_MEM_FN(VmeInstruction, uint8_t, getMsgType(void), getMsgType())
+DECL_MEM_FN(ImeInstruction, Type, getSrcType(void), getSrcType())
+DECL_MEM_FN(ImeInstruction, Type, getDstType(void), getDstType())
+DECL_MEM_FN(ImeInstruction, uint8_t, getImageIndex(void), getImageIndex())
+DECL_MEM_FN(ImeInstruction, uint8_t, getMsgType(void), getMsgType())
DECL_MEM_FN(TypedWriteInstruction, Type, getSrcType(void), getSrcType())
DECL_MEM_FN(TypedWriteInstruction, Type, getCoordType(void), getCoordType())
DECL_MEM_FN(TypedWriteInstruction, uint8_t, getImageIndex(void), getImageIndex())
@@ -2702,6 +2755,9 @@ DECL_MEM_FN(MemInstruction, void, setBtiReg(Register reg), setBtiReg(reg))
Instruction VME(uint8_t imageIndex, Tuple dst, Tuple src, uint32_t dstNum, uint32_t srcNum, int msg_type, int vme_search_path_lut, int lut_sub) {
return internal::VmeInstruction(imageIndex, dst, src, dstNum, srcNum, msg_type, vme_search_path_lut, lut_sub).convert();
}
+ Instruction IME(uint8_t imageIndex, Tuple dst, Tuple src, uint32_t dstNum, uint32_t srcNum, int msg_type) {
+ return internal::ImeInstruction(imageIndex, dst, src, dstNum, srcNum, msg_type).convert();
+ }
Instruction TYPED_WRITE(uint8_t imageIndex, Tuple src, uint8_t srcNum, Type srcType, Type coordType) {
return internal::TypedWriteInstruction(imageIndex, src, srcNum, srcType, coordType).convert();
diff --git a/backend/src/ir/instruction.hpp b/backend/src/ir/instruction.hpp
index 8685dd4..a93204f 100644
--- a/backend/src/ir/instruction.hpp
+++ b/backend/src/ir/instruction.hpp
@@ -217,7 +217,8 @@ namespace ir {
return T::isClassOf(*this);
}
/*! max_src used by vme for payload passing and setting */
- static const uint32_t MAX_SRC_NUM = 40;
+ //static const uint32_t MAX_SRC_NUM = 48;
+ static const uint32_t MAX_SRC_NUM = 64;
static const uint32_t MAX_DST_NUM = 32;
DebugInfo DBGInfo;
protected:
@@ -428,6 +429,16 @@ namespace ir {
static bool isClassOf(const Instruction &insn);
};
+ class ImeInstruction : public Instruction {
+ public:
+ uint8_t getImageIndex() const;
+ uint8_t getMsgType() const;
+ Type getSrcType(void) const;
+ Type getDstType(void) const;
+ /*! Return true if the given instruction is an instance of this class */
+ static bool isClassOf(const Instruction &insn);
+ };
+
typedef union _ImageInfoKey{
_ImageInfoKey(uint8_t i, uint8_t t) : index(i), type(t) {};
_ImageInfoKey(int key) : data(key) {};
@@ -879,6 +890,7 @@ namespace ir {
Instruction SAMPLE(uint8_t imageIndex, Tuple dst, Tuple src, uint8_t srcNum, bool dstIsFloat, bool srcIsFloat, uint8_t sampler, uint8_t samplerOffset);
/*! video motion estimation */
Instruction VME(uint8_t imageIndex, Tuple dst, Tuple src, uint32_t dstNum, uint32_t srcNum, int msg_type, int vme_search_path_lut, int lut_sub);
+ Instruction IME(uint8_t imageIndex, Tuple dst, Tuple src, uint32_t dstNum, uint32_t srcNum, int msg_type);
/*! get image information , such as width/height/depth/... */
Instruction GET_IMAGE_INFO(int infoType, Register dst, uint8_t imageIndex, Register infoReg);
/*! label labelIndex */
diff --git a/backend/src/ir/instruction.hxx b/backend/src/ir/instruction.hxx
index 81618eb..2054b9c 100644
--- a/backend/src/ir/instruction.hxx
+++ b/backend/src/ir/instruction.hxx
@@ -86,6 +86,7 @@ DECL_INSN(LABEL, LabelInstruction)
DECL_INSN(READ_ARF, ReadARFInstruction)
DECL_INSN(REGION, RegionInstruction)
DECL_INSN(VME, VmeInstruction)
+DECL_INSN(IME, ImeInstruction)
DECL_INSN(INDIRECT_MOV, IndirectMovInstruction)
DECL_INSN(GET_IMAGE_INFO, GetImageInfoInstruction)
DECL_INSN(MUL_HI, BinaryInstruction)
diff --git a/backend/src/ir/liveness.cpp b/backend/src/ir/liveness.cpp
index dbb5c33..1d385ee 100644
--- a/backend/src/ir/liveness.cpp
+++ b/backend/src/ir/liveness.cpp
@@ -142,6 +142,7 @@ namespace ir {
opCode != ir::OP_RHADD &&
opCode != ir::OP_READ_ARF &&
opCode != ir::OP_ADDSAT &&
+ opCode != ir::OP_IME &&
(dstNum == 1 || insn.getOpcode() != ir::OP_LOAD) &&
!extentRegs->contains(reg)
)
diff --git a/backend/src/libocl/include/ocl_misc.h b/backend/src/libocl/include/ocl_misc.h
index d5fa589..cb9e5bd 100644
--- a/backend/src/libocl/include/ocl_misc.h
+++ b/backend/src/libocl/include/ocl_misc.h
@@ -19,6 +19,10 @@
#define __OCL_MISC_H__
#include "ocl_types.h"
+#include "ocl_workitem.h"
+#include "ocl_simd.h"
+#include "ocl_printf.h"
+#include "ocl_as.h"
#define DEC2(TYPE, XTYPE, MASKTYPE) \
OVERLOADABLE TYPE##2 shuffle(XTYPE x, MASKTYPE##2 mask);
@@ -138,6 +142,232 @@ struct time_stamp {
uint event;
};
+//Interlaced image field polarity values:
+#define CLK_AVC_ME_INTERLACED_SCAN_TOP_FIELD_INTEL 0x0
+#define CLK_AVC_ME_INTERLACED_SCAN_BOTTOM_FIELD_INTEL 0x1
+
+//Inter macro-block major shape values:
+#define CLK_AVC_ME_MAJOR_16x16_INTEL 0x0
+#define CLK_AVC_ME_MAJOR_16x8_INTEL 0x1
+#define CLK_AVC_ME_MAJOR_8x16_INTEL 0x2
+#define CLK_AVC_ME_MAJOR_8x8_INTEL 0x3
+
+//Inter macro-block minor shape values:
+#define CLK_AVC_ME_MINOR_8x8_INTEL 0x0
+#define CLK_AVC_ME_MINOR_8x4_INTEL 0x1
+#define CLK_AVC_ME_MINOR_4x8_INTEL 0x2
+#define CLK_AVC_ME_MINOR_4x4_INTEL 0x3
+
+//Inter macro-block major direction values:
+#define CLK_AVC_ME_MAJOR_FORWARD_INTEL 0x0
+#define CLK_AVC_ME_MAJOR_BACKWARD_INTEL 0x1
+#define CLK_AVC_ME_MAJOR_BIDIRECTIONAL_INTEL 0x2
+
+//Inter (IME) partition mask values:
+#define CLK_AVC_ME_PARTITION_MASK_ALL_INTEL 0x0
+#define CLK_AVC_ME_PARTITION_MASK_16x16_INTEL 0x7E
+#define CLK_AVC_ME_PARTITION_MASK_16x8_INTEL 0x7D
+#define CLK_AVC_ME_PARTITION_MASK_8x16_INTEL 0x7B
+#define CLK_AVC_ME_PARTITION_MASK_8x8_INTEL 0x77
+#define CLK_AVC_ME_PARTITION_MASK_8x4_INTEL 0x6F
+#define CLK_AVC_ME_PARTITION_MASK_4x8_INTEL 0x5F
+#define CLK_AVC_ME_PARTITION_MASK_4x4_INTEL 0x3F
+
+//Slice type values:
+#define CLK_AVC_ME_SLICE_TYPE_PRED_INTEL 0x0
+#define CLK_AVC_ME_SLICE_TYPE_BPRED_INTEL 0x1
+#define CLK_AVC_ME_SLICE_TYPE_INTRA_INTEL 0x2
+
+//Search window configuration:
+#define CLK_AVC_ME_SEARCH_WINDOW_EXHAUSTIVE_INTEL 0x0
+#define CLK_AVC_ME_SEARCH_WINDOW_SMALL_INTEL 0x1
+#define CLK_AVC_ME_SEARCH_WINDOW_TINY_INTEL 0x2
+#define CLK_AVC_ME_SEARCH_WINDOW_EXTRA_TINY_INTEL 0x3
+#define CLK_AVC_ME_SEARCH_WINDOW_DIAMOND_INTEL 0x4
+#define CLK_AVC_ME_SEARCH_WINDOW_LARGE_DIAMOND_INTEL 0x5
+#define CLK_AVC_ME_SEARCH_WINDOW_RESERVED0_INTEL 0x6
+#define CLK_AVC_ME_SEARCH_WINDOW_RESERVED1_INTEL 0x7
+
+//SAD adjustment mode:
+#define CLK_AVC_ME_SAD_ADJUST_MODE_NONE_INTEL 0x0
+#define CLK_AVC_ME_SAD_ADJUST_MODE_HAAR_INTEL 0x2
+
+//Pixel resolution:
+#define CLK_AVC_ME_SUBPIXEL_MODE_INTEGER_INTEL 0x0
+#define CLK_AVC_ME_SUBPIXEL_MODE_HPEL_INTEL 0x1
+#define CLK_AVC_ME_SUBPIXEL_MODE_QPEL_INTEL 0x3
+
+//Cost precision values:
+#define CLK_AVC_ME_COST_PRECISION_QPEL_INTEL 0x0
+#define CLK_AVC_ME_COST_PRECISION_HPEL_INTEL 0x1
+#define CLK_AVC_ME_COST_PRECISION_PEL_INTEL 0x2
+#define CLK_AVC_ME_COST_PRECISION_DPEL_INTEL 0x3
+
+//Inter bidirectional weights:
+#define CLK_AVC_ME_BIDIR_WEIGHT_QUARTER_INTEL 0x10
+#define CLK_AVC_ME_BIDIR_WEIGHT_THIRD_INTEL 0x15
+#define CLK_AVC_ME_BIDIR_WEIGHT_HALF_INTEL 0x20
+#define CLK_AVC_ME_BIDIR_WEIGHT_TWO_THIRD_INTEL 0x2B
+#define CLK_AVC_ME_BIDIR_WEIGHT_THREE_QUARTER_INTEL 0x30
+
+//Inter border reached values:
+#define CLK_AVC_ME_BORDER_REACHED_LEFT_INTEL 0x0
+#define CLK_AVC_ME_BORDER_REACHED_RIGHT_INTEL 0x2
+#define CLK_AVC_ME_BORDER_REACHED_TOP_INTEL 0x4
+#define CLK_AVC_ME_BORDER_REACHED_BOTTOM_INTEL 0x8
+
+//Intra macro-block shape values:
+#define CLK_AVC_ME_INTRA_16x16_INTEL 0x0
+#define CLK_AVC_ME_INTRA_8x8_INTEL 0x1
+#define CLK_AVC_ME_INTRA_4x4_INTEL 0x2
+
+//Inter skip block partition type:
+#define CLK_AVC_ME_SKIP_BLOCK_PARTITION_16x16_INTEL 0x0
+#define CLK_AVC_ME_SKIP_BLOCK_PARTITION_8x8_INTEL 0x04000
+
+//Inter skip motion vector mask:
+#define CLK_AVC_ME_SKIP_BLOCK_16x16_FORWARD_ENABLE_INTEL (0x1<<24)
+#define CLK_AVC_ME_SKIP_BLOCK_16x16_BACKWARD_ ENABLE_INTEL (0x2<<24)
+#define CLK_AVC_ME_SKIP_BLOCK_16x16_DUAL_ENABLE_INTEL (0x3<<24)
+#define CLK_AVC_ME_SKIP_BLOCK_8x8_FORWARD_ENABLE_INTEL (0x55<<24)
+#define CLK_AVC_ME_SKIP_BLOCK_8x8_BACKWARD_ENABLE_INTEL (0xAA<<24)
+#define CLK_AVC_ME_SKIP_BLOCK_8x8_DUAL_ENABLE_INTEL (0xFF<<24)
+#define CLK_AVC_ME_SKIP_BLOCK_8x8_0_FORWARD_ENABLE_INTEL (0x1<<24)
+#define CLK_AVC_ME_SKIP_BLOCK_8x8_0_BACKWARD_ENABLE_INTEL (0x2<<24)
+#define CLK_AVC_ME_SKIP_BLOCK_8x8_1_FORWARD_ENABLE_INTEL (0x1<<26)
+#define CLK_AVC_ME_SKIP_BLOCK_8x8_1_BACKWARD_ENABLE_INTEL (0x2<<26)
+#define CLK_AVC_ME_SKIP_BLOCK_8x8_2_FORWARD_ENABLE_INTEL (0x1<<28)
+#define CLK_AVC_ME_SKIP_BLOCK_8x8_2_BACKWARD_ENABLE_INTEL (0x2<<28)
+#define CLK_AVC_ME_SKIP_BLOCK_8x8_3_FORWARD_ENABLE_INTEL (0x1<<30)
+#define CLK_AVC_ME_SKIP_BLOCK_8x8_3_BACKWARD_ENABLE_INTEL (0x2<<30)
+
+//Block based skip type values:
+#define CLK_AVC_ME_BLOCK_BASED_SKIP_4x4_INTEL 0x0
+#define CLK_AVC_ME_BLOCK_BASED_SKIP_8x8_INTEL 0x80
+
+//Luma intra partition mask values:
+#define CLK_AVC_ME_INTRA_LUMA_PARTITION_MASK_ALL_INTEL 0x0
+#define CLK_AVC_ME_INTRA_LUMA_PARTITION_MASK_16x16_INTEL 0x6
+#define CLK_AVC_ME_INTRA_LUMA_PARTITION_MASK_8x8_INTEL 0x5
+#define CLK_AVC_ME_INTRA_LUMA_PARTITION_MASK_4x4_INTEL 0x3
+
+//Intra neighbor availability mask values:
+#define CLK_AVC_ME_INTRA_NEIGHBOR_LEFT_MASK_ENABLE_INTEL 0x60
+#define CLK_AVC_ME_INTRA_NEIGHBOR_UPPER_MASK_ENABLE_INTEL 0x10
+#define CLK_AVC_ME_INTRA_NEIGHBOR_UPPER_RIGHT_MASK_ENABLE_INTEL 0x8
+#define CLK_AVC_ME_INTRA_NEIGHBOR_UPPER_LEFT_MASK_ENABLE_INTEL 0x4
+
+//Luma intra modes:
+#define CLK_AVC_ME_LUMA_PREDICTOR_MODE_VERTICAL_INTEL 0x0
+#define CLK_AVC_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_INTEL 0x1
+#define CLK_AVC_ME_LUMA_PREDICTOR_MODE_DC_INTEL 0x2
+#define CLK_AVC_ME_LUMA_PREDICTOR_MODE_DIAGONAL_DOWN_LEFT_INTEL 0x3
+#define CLK_AVC_ME_LUMA_PREDICTOR_MODE_DIAGONAL_DOWN_RIGHT_INTEL 0x4
+#define CLK_AVC_ME_LUMA_PREDICTOR_MODE_PLANE_INTEL 0x4
+#define CLK_AVC_ME_LUMA_PREDICTOR_MODE_VERTICAL_RIGHT_INTEL 0x5
+#define CLK_AVC_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_DOWN_INTEL 0x6
+#define CLK_AVC_ME_LUMA_PREDICTOR_MODE_VERTICAL_LEFT_INTEL 0x7
+#define CLK_AVC_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_UP_INTEL 0x8
+
+//Chroma intra modes:
+#define CLK_AVC_ME_CHROMA_PREDICTOR_MODE_DC_INTEL 0x0
+#define CLK_AVC_ME_CHROMA_PREDICTOR_MODE_HORIZONTAL_INTEL 0x1
+#define CLK_AVC_ME_CHROMA_PREDICTOR_MODE_VERTICAL_INTEL 0x2
+#define CLK_AVC_ME_CHROMA_PREDICTOR_MODE_PLANE_INTEL 0x3
+
+//Reference image select values:
+#define CLK_AVC_ME_FRAME_FORWARD_INTEL 0x1
+#define CLK_AVC_ME_FRAME_BACKWARD_INTEL 0x2
+#define CLK_AVC_ME_FRAME_DUAL_INTEL 0x3
+
+//VME media sampler initialization value:
+#define CLK_AVC_ME_INITIALIZE_INTEL 0x0
+
+//Default IME payload initialization:
+#define CLK_AVC_IME_PAYLOAD_INITIALIZE_INTEL {0x0}
+
+//Default REF payload initialization:
+#define CLK_AVC_REF_PAYLOAD_INITIALIZE_INTEL {0x0}
+
+//Default SIC payload initialization:
+#define CLK_AVC_SIC_PAYLOAD_INITIALIZE_INTEL {0x0}
+
+//Default IME result initialization:
+#define CLK_AVC_IME_RESULT_INITIALIZE_INTEL {0x0}
+
+//Default REF result initialization:
+#define CLK_AVC_REF_RESULT_INITIALIZE_INTEL {0x0}
+
+//Default SIC result initialization:
+#define CLK_AVC_SIC_RESULT_INITIALIZE_INTEL {0x0}
+
+typedef struct{
+ ushort2 srcCoord;
+ short2 ref_offset;
+ uchar partition_mask;
+ uchar sad_adjustment;
+ uchar search_window_config;
+ ulong cc0;
+ ulong cc1;
+ ulong cc2;
+ ulong cc3;
+ uint2 packed_cost_table;
+ uchar cost_precision;
+ ulong packed_shape_cost;
+}intel_sub_group_avc_ime_payload_t;
+
+typedef uint8 intel_sub_group_avc_ime_result_t;
+
+#define REF_ENABLE_COST_PENALTY 1
+
+typedef struct{
+ ushort2 srcCoord;
+ long mv;
+ uchar major_shape;
+ uchar minor_shapes;
+ uchar directions;
+ uchar pixel_mode;
+ uchar sad_adjustment;
+#if REF_ENABLE_COST_PENALTY
+ ulong cc0;
+ ulong cc1;
+ ulong cc2;
+ ulong cc3;
+ uint2 packed_cost_table;
+ uchar cost_precision;
+ ulong packed_shape_cost;
+#endif
+}intel_sub_group_avc_ref_payload_t;
+
+typedef struct{
+ ushort2 srcCoord;
+ uint skip_block_partition_type;
+ uint skip_motion_vector_mask;
+ char bidirectional_weight;
+ uchar skip_sad_adjustment;
+ long mv;
+
+ uchar luma_intra_partition_mask;
+ uchar intra_neighbour_availabilty;
+ uint l_0_3;
+ uint l_4_7;
+ uint l_8_11;
+ uint l_12_15;
+ uint u_0_3;
+ uint u_4_7;
+ uint u_8_11;
+ uint u_12_15;
+ uint ur_16_19;
+ uint ur_20_23;
+ uchar upper_left_corner_luma_pixel;
+ uchar intra_sad_adjustment;
+ uint intra_shape_cost;
+}intel_sub_group_avc_sic_payload_t;
+
+typedef uint8 intel_sub_group_avc_ref_result_t;
+
+typedef uint8 intel_sub_group_avc_sic_result_t;
+
uint __gen_ocl_region(ushort offset, uint data);
struct time_stamp __gen_ocl_get_timestamp(void);
@@ -155,6 +385,140 @@ uint8 __gen_ocl_vme(image2d_t, image2d_t,
uint, uint, uint, uint,
int, int, int);
+intel_sub_group_avc_ime_result_t
+__gen_ocl_ime(image2d_t, image2d_t,
+ uint, uint, uint, uint,
+ uint, uint, uint, uint,
+ uint, uint, uint, uint,
+ uint, uint, uint, uint,
+ uint, uint, uint, uint,
+ uint, uint, uint, uint,
+ uint, uint, uint, uint,
+ uint, uint, uint, uint,
+ uint, uint, uint, uint,
+ uint, uint, uint, uint,
+ uint, uint, uint, uint,
+ uint, uint, uint, uint,
+ uint, uint, uint, uint,
+ uint, uint, uint, uint,
+ uint, uint, uint, uint,
+ uint, uint, uint, uint,
+ int);
+
+intel_sub_group_avc_ime_payload_t
+intel_sub_group_avc_ime_initialize(ushort2 src_coord,
+ uchar partition_mask,
+ uchar sad_adjustment);
+
+intel_sub_group_avc_ime_payload_t
+intel_sub_group_avc_ime_set_single_reference(short2 ref_offset,
+ uchar search_window_config,
+ intel_sub_group_avc_ime_payload_t payload);
+
+intel_sub_group_avc_ime_result_t
+intel_sub_group_avc_ime_evaluate_with_single_reference(read_only image2d_t src_image,
+ read_only image2d_t ref_image,
+ sampler_t vme_media_sampler,
+ intel_sub_group_avc_ime_payload_t payload);
+
+ulong intel_sub_group_avc_ime_get_motion_vectors(intel_sub_group_avc_ime_result_t result);
+
+ushort intel_sub_group_avc_ime_get_inter_distortions(intel_sub_group_avc_ime_result_t result);
+
+ushort intel_sub_group_avc_ime_get_inter_distortions(intel_sub_group_avc_ime_result_t result);
+
+uchar intel_sub_group_avc_ime_get_inter_major_shape(intel_sub_group_avc_ime_result_t result);
+
+uchar intel_sub_group_avc_ime_get_inter_minor_shapes(intel_sub_group_avc_ime_result_t result);
+
+uchar intel_sub_group_avc_ime_get_inter_directions(intel_sub_group_avc_ime_result_t result);
+
+intel_sub_group_avc_ref_payload_t
+intel_sub_group_avc_fme_initialize(ushort2 src_coord,
+ ulong motion_vectors,
+ uchar major_shapes,
+ uchar minor_shapes,
+ uchar directions,
+ uchar pixel_resolution,
+ uchar sad_adjustment );
+
+intel_sub_group_avc_ref_result_t
+intel_sub_group_avc_ref_evaluate_with_single_reference(read_only image2d_t src_image,
+ read_only image2d_t ref_image,
+ sampler_t vme_media_sampler,
+ intel_sub_group_avc_ref_payload_t payload);
+
+ulong intel_sub_group_avc_ref_get_motion_vectors(intel_sub_group_avc_ref_result_t result);
+
+ushort intel_sub_group_avc_ref_get_inter_distortions(intel_sub_group_avc_ref_result_t result);
+
+uint2 intel_sub_group_avc_mce_get_default_medium_penalty_cost_table(void);
+
+intel_sub_group_avc_ime_payload_t
+intel_sub_group_avc_ime_set_motion_vector_cost_function(ulong packed_cost_center_delta,
+ uint2 packed_cost_table,
+ uchar cost_precision,
+ intel_sub_group_avc_ime_payload_t payload);
+
+#if REF_ENABLE_COST_PENALTY
+intel_sub_group_avc_ref_payload_t
+intel_sub_group_avc_ref_set_motion_vector_cost_function(ulong packed_cost_center_delta,
+ uint2 packed_cost_table,
+ uchar cost_precision,
+ intel_sub_group_avc_ref_payload_t payload);
+#endif
+
+intel_sub_group_avc_ime_payload_t
+intel_sub_group_avc_ime_set_inter_shape_penalty(ulong packed_shape_cost,
+ intel_sub_group_avc_ime_payload_t payload);
+
+intel_sub_group_avc_sic_result_t
+intel_sub_group_avc_sic_evaluate_ipe(read_only image2d_t src_image,
+ sampler_t vme_media_sampler,
+ intel_sub_group_avc_sic_payload_t payload);
+
+intel_sub_group_avc_sic_payload_t
+intel_sub_group_avc_sic_initialize(ushort2 src_coord );
+
+intel_sub_group_avc_sic_payload_t
+intel_sub_group_avc_sic_configure_ipe(uchar luma_intra_partition_mask,
+ uchar intra_neighbour_availabilty,
+ uchar left_edge_luma_pixels,
+ uchar upper_left_corner_luma_pixel,
+ uchar upper_edge_luma_pixels,
+ uchar upper_right_edge_luma_pixels,
+ uchar intra_sad_adjustment,
+ intel_sub_group_avc_sic_payload_t payload );
+intel_sub_group_avc_sic_payload_t
+intel_sub_group_avc_sic_set_intra_luma_shape_penalty(uint packed_shape_cost,
+ intel_sub_group_avc_sic_payload_t payload );
+
+uchar
+intel_sub_group_avc_sic_get_ipe_luma_shape(intel_sub_group_avc_sic_result_t result);
+
+ushort
+intel_sub_group_avc_sic_get_best_ipe_luma_distortion(intel_sub_group_avc_sic_result_t result);
+
+ulong intel_sub_group_avc_sic_get_packed_ipe_luma_modes(intel_sub_group_avc_sic_result_t result);
+
+
+intel_sub_group_avc_sic_result_t
+intel_sub_group_avc_sic_evaluate_with_single_reference(read_only image2d_t src_image,
+ read_only image2d_t ref_image,
+ sampler_t vme_media_sampler,
+ intel_sub_group_avc_sic_payload_t payload);
+
+intel_sub_group_avc_sic_payload_t
+intel_sub_group_avc_sic_configure_skc(uint skip_block_partition_type,
+ uint skip_motion_vector_mask,
+ ulong motion_vectors,
+ char bidirectional_weight,
+ uchar skip_sad_adjustment,
+ intel_sub_group_avc_sic_payload_t payload);
+
+ushort
+intel_sub_group_avc_sic_get_inter_distortions(intel_sub_group_avc_sic_result_t result);
+
bool __gen_ocl_in_local(size_t p);
bool __gen_ocl_in_private(size_t p);
diff --git a/backend/src/libocl/src/ocl_misc.cl b/backend/src/libocl/src/ocl_misc.cl
index bfa2fa7..f7710d9 100644
--- a/backend/src/libocl/src/ocl_misc.cl
+++ b/backend/src/libocl/src/ocl_misc.cl
@@ -232,6 +232,1331 @@ struct time_stamp __gen_ocl_get_timestamp(void) {
return val;
};
+intel_sub_group_avc_ime_payload_t
+intel_sub_group_avc_ime_initialize(ushort2 src_coord,
+ uchar partition_mask,
+ uchar sad_adjustment){
+ intel_sub_group_avc_ime_payload_t pl;
+ pl.srcCoord = src_coord;
+ pl.partition_mask = partition_mask;
+ pl.sad_adjustment = sad_adjustment;
+ pl.ref_offset = (0, 0);
+ pl.search_window_config = 0;
+ pl.cc0 = 0;
+ pl.cc1 = 0;
+ pl.cc2 = 0;
+ pl.cc3 = 0;
+ pl.packed_cost_table = (0, 0);
+ pl.cost_precision = 2;
+ pl.packed_shape_cost = 0;
+ return pl;
+}
+
+intel_sub_group_avc_ime_payload_t
+intel_sub_group_avc_ime_set_single_reference(short2 ref_offset,
+ uchar search_window_config,
+ intel_sub_group_avc_ime_payload_t payload){
+ intel_sub_group_avc_ime_payload_t pl = payload;
+ pl.ref_offset = ref_offset;
+ pl.search_window_config = search_window_config;
+ return pl;
+}
+
+intel_sub_group_avc_ime_result_t
+intel_sub_group_avc_ime_evaluate_with_single_reference(read_only image2d_t src_image,
+ read_only image2d_t ref_image,
+ sampler_t vme_media_sampler,
+ intel_sub_group_avc_ime_payload_t payload){
+ uint src_grf0_dw7;
+ uint src_grf0_dw6;
+ uint src_grf0_dw5;
+ uint src_grf0_dw4;
+ uint src_grf0_dw3;
+ uint src_grf0_dw2;
+ uint src_grf0_dw1;
+ uint src_grf0_dw0;
+ uint src_grf1_dw7;
+ uint src_grf1_dw6;
+ uint src_grf1_dw5;
+ uint src_grf1_dw4;
+ uint src_grf1_dw3;
+ uint src_grf1_dw2;
+ uint src_grf1_dw1;
+ uint src_grf1_dw0;
+ uint src_grf2_dw7;
+ uint src_grf2_dw6;
+ uint src_grf2_dw5;
+ uint src_grf2_dw4;
+ uint src_grf2_dw3;
+ uint src_grf2_dw2;
+ uint src_grf2_dw1;
+ uint src_grf2_dw0;
+ uint src_grf3_dw7;
+ uint src_grf3_dw6;
+ uint src_grf3_dw5;
+ uint src_grf3_dw4;
+ uint src_grf3_dw3;
+ uint src_grf3_dw2;
+ uint src_grf3_dw1;
+ uint src_grf3_dw0;
+ uint src_grf4_dw7;
+ uint src_grf4_dw6;
+ uint src_grf4_dw5;
+ uint src_grf4_dw4;
+ uint src_grf4_dw3;
+ uint src_grf4_dw2;
+ uint src_grf4_dw1;
+ uint src_grf4_dw0;
+ uint src_grf5_dw7;
+ uint src_grf5_dw6;
+ uint src_grf5_dw5;
+ uint src_grf5_dw4;
+ uint src_grf5_dw3;
+ uint src_grf5_dw2;
+ uint src_grf5_dw1;
+ uint src_grf5_dw0;
+ uint src_grf6_dw7;
+ uint src_grf6_dw6;
+ uint src_grf6_dw5;
+ uint src_grf6_dw4;
+ uint src_grf6_dw3;
+ uint src_grf6_dw2;
+ uint src_grf6_dw1;
+ uint src_grf6_dw0;
+ uint src_grf7_dw7;
+ uint src_grf7_dw6;
+ uint src_grf7_dw5;
+ uint src_grf7_dw4;
+ uint src_grf7_dw3;
+ uint src_grf7_dw2;
+ uint src_grf7_dw1;
+ uint src_grf7_dw0;
+
+
+ //src_grf0_dw7 = Debug;
+ src_grf0_dw7 = 0;
+ //src_grf0_dw6 = Debug;
+ src_grf0_dw6 = 0;
+ //src_grf0_dw4 = Ignored;
+ src_grf0_dw4 = 0;
+
+ short2 predict_mv = payload.ref_offset;
+ //CL_ME_SEARCH_PATH_RADIUS_2_2_INTEL
+ //src_grf0_dw5 = (Ref_Height << 24) | (Ref_Width << 16) | (Ignored << 8) | (Dispatch_Id);
+ src_grf0_dw5 = (20 << 24) | (20 << 16) | (0 << 8) | (0);
+ //src_grf0_dw1 = (Ref1Y << 16) | (Ref1X);
+ src_grf0_dw1 = ((-2 + predict_mv.y) << 16 ) | ((-2 + predict_mv.x) & 0x0000ffff);
+ //src_grf0_dw0 = (Ref0Y << 16) | (Ref0X);
+ src_grf0_dw0 = ((-2 + predict_mv.y) << 16 ) | ((-2 + predict_mv.x) & 0x0000ffff);
+
+ //src_grf0_dw3 = (Reserved << 31) | (Sub_Mb_Part_Mask << 24) | (Intra_SAD << 22)
+ src_grf0_dw3 = (0 << 31) | (payload.partition_mask << 24) | (0 << 22)
+ //| (Inter_SAD << 20) | (BB_Skip_Enabled << 19) | (Reserverd << 18)
+ | (payload.sad_adjustment << 20)| (0 << 19) | (0 << 18)
+ //| (Dis_Aligned_Src_Fetch << 17) | (Dis_Aligned_Ref_Fetch << 16) | (Dis_Field_Cache_Alloc << 15)
+ | (0 << 17) | (0 << 16) | (0 << 15)
+ //| (Skip_Type << 14) | (Sub_Pel_Mode << 12) | (Dual_Search_Path_Opt << 11)
+ | (0 << 14) | (0 << 12) | (0 << 11)
+ //| (Search_Ctrl << 8) | (Ref_Access << 7) | (SrcAccess << 6)
+ | (0 << 8) | (0 << 7) | (0 << 6)
+ //| (Mb_Type_Remap << 4) | (Reserved_Workaround << 3) | (Reserved_Workaround << 2)
+ | (0 << 4) | (0 << 3) | (0 << 2)
+ //| (Src_Size);
+ | (0);
+
+ //src_grf0_dw2 = (SrcY << 16) | (SrcX);
+ src_grf0_dw2 = (payload.srcCoord.y << 16) | (payload.srcCoord.x);
+
+ /*src_grf1_dw7 = (Skip_Center_Mask << 24) | (Reserved << 22) | (Ref1_Field_Polarity << 21)
+ | (Ref0_Field_Polarity << 20) | (Src_Field_Polarity << 19) | (Bilinear_Enable << 18)
+ | (MV_Cost_Scale_Factor << 16) | (Mb_Intra_Struct << 8) | (Intra_Corner_Swap << 7)
+ | (Non_Skip_Mode_Added << 6) | (Non_Skip_ZMv_Added << 5) | (IntraPartMask);*/
+ src_grf1_dw7 = (payload.cost_precision << 16);
+ //src_grf1_dw6 = Reserved;
+ src_grf1_dw6 = 0;
+ /*src_grf1_dw5 = Reseverd for BDW+
+ src_grf1_dw4 = Reseverd for BDW+*/
+ src_grf1_dw5 = 0;
+ src_grf1_dw4 = 0;
+ //src_grf1_dw3 = Weighted SAD Control Sub-block 0...15
+ src_grf1_dw3 = 0;
+ //XXX: should set src_grf1_dw2
+ //src_grf1_dw2 = (Start1Y << 28) | (Start1X << 24) | (Start0Y << 20)
+ src_grf1_dw2 = (0 << 28) | (0 << 24) | (0 << 20)
+ //| (Start0X << 16) | (Max_Num_SU << 8) | (LenSP);
+ | (0 << 16) | (2 << 8) | (2);
+ /*src_grf1_dw1 = (RepartEn << 31) | (FBPrunEn << 30) | (AdaptiveValidationControl << 29)
+ | (Uni_Mix_Disable << 28) | (Bi_Sub_Mb_Part_Mask << 24) | (Reserverd << 22)
+ | (Bi_Weight << 16) | (Reserved << 6) | (MaxNumMVs);*/
+ src_grf1_dw1 = (0 << 24) | (16);
+ /*src_grf1_dw0 = (Early_Ime_Stop << 24) | (Early_Fme_Success << 16) | (Skip_Success << 8)
+ | (T8x8_Flag_For_Inter_En << 7) | (Quit_Inter_En << 6) | (Early_Ime_Success_En << 5)
+ | (Early_Success_En << 4) | (Part_Candidate_En << 3) | (Bi_Mix_Dis << 2)
+ | (Adaptive_En << 1) | (SkipModeEn);*/
+ src_grf1_dw0 = 0;
+
+ //src_grf2_dw7 = SIC Forward Transform Coeff Threshold Matrix[3...6]
+ src_grf2_dw7 = 0;
+ //src_grf2_dw6 = SIC Forward Transform Coeff Threshold Matrix[0...2]
+ src_grf2_dw6 = 0;
+ //src_grf2_dw5 = (Reserved << 24) | (FBR_SubPredMode_Input << 16) | (FBR_SubMBShape_Input << 8) | (Reserved << 2) | (FBR_MbMode_Input);
+ src_grf2_dw5 = 0;
+ //src_grf2_dw4 = MV_4_Cost ... MV_7_Cost;
+ src_grf2_dw4 = payload.packed_cost_table.s1;
+ //src_grf2_dw3 = MV_0_Cost ... MV_3_Cost;
+ src_grf2_dw3 = payload.packed_cost_table.s0;
+ //src_grf2_dw2 = ... Mode 8 Cost;
+ src_grf2_dw2 = (payload.packed_shape_cost >> 32) & 0x000000ff;
+ //src_grf2_dw1 = Mode 4 Cost ... Mode 7 Cost
+ src_grf2_dw1 = payload.packed_shape_cost;
+ src_grf2_dw0 = 0;
+ //src_grf3_dw7 = (BWDCostCenter3Y << 16) | (BWDCostCenter3X) ;
+ src_grf3_dw7 = payload.cc3 >> 32;
+ //src_grf3_dw6 = (FWDCostCenter3Y << 16) | (FWDCostCenter3X) ;
+ src_grf3_dw6 = payload.cc3;
+ //src_grf3_dw5 = (BWDCostCenter2Y << 16) | (BWDCostCenter2X) ;
+ src_grf3_dw5 = payload.cc2 >> 32;
+ //src_grf3_dw4 = (FWDCostCenter2Y << 16) | (FWDCostCenter2X) ;
+ src_grf3_dw4 = payload.cc2;
+ //src_grf3_dw3 = (BWDCostCenter1Y << 16) | (BWDCostCenter1X) ;
+ src_grf3_dw3 = payload.cc1 >> 32;
+ //src_grf3_dw2 = (FWDCostCenter1Y << 16) | (FWDCostCenter1X) ;
+ src_grf3_dw2 = payload.cc1;
+ //src_grf3_dw1 = (BWDCostCenter0Y << 16) | (BWDCostCenter0X) ;
+ src_grf3_dw1 = payload.cc0 >> 32;
+ //src_grf3_dw0 = (FWDCostCenter0Y << 16) | (FWDCostCenter0X) ;
+ src_grf3_dw0 = payload.cc0;
+
+ //XXX: TODO: set search path
+ src_grf4_dw7 = 0;
+ src_grf4_dw6 = 0;
+ src_grf4_dw5 = 0;
+ src_grf4_dw4 = 0;
+ src_grf4_dw3 = 0;
+ src_grf4_dw2 = 0;
+ src_grf4_dw1 = 0;
+ src_grf4_dw0 = 0;
+ src_grf5_dw7 = 0;
+ src_grf5_dw6 = 0;
+ src_grf5_dw5 = 0;
+ src_grf5_dw4 = 0;
+ src_grf5_dw3 = 0;
+ src_grf5_dw2 = 0;
+ src_grf5_dw1 = 0;
+ src_grf5_dw0 = 0;
+
+ intel_sub_group_avc_ime_result_t ime_result;
+ ime_result = __gen_ocl_ime(src_image, ref_image,
+ src_grf0_dw7, src_grf0_dw6, src_grf0_dw5, src_grf0_dw4,
+ src_grf0_dw3, src_grf0_dw2, src_grf0_dw1, src_grf0_dw0,
+ src_grf1_dw7, src_grf1_dw6, src_grf1_dw5, src_grf1_dw4,
+ src_grf1_dw3, src_grf1_dw2, src_grf1_dw1, src_grf1_dw0,
+ src_grf2_dw7, src_grf2_dw6, src_grf2_dw5, src_grf2_dw4,
+ src_grf2_dw3, src_grf2_dw2, src_grf2_dw1, src_grf2_dw0,
+ src_grf3_dw7, src_grf3_dw6, src_grf3_dw5, src_grf3_dw4,
+ src_grf3_dw3, src_grf3_dw2, src_grf3_dw1, src_grf3_dw0,
+ src_grf4_dw7, src_grf4_dw6, src_grf4_dw5, src_grf4_dw4,
+ src_grf4_dw3, src_grf4_dw2, src_grf4_dw1, src_grf4_dw0,
+ src_grf5_dw7, src_grf5_dw6, src_grf5_dw5, src_grf5_dw4,
+ src_grf5_dw3, src_grf5_dw2, src_grf5_dw1, src_grf5_dw0,
+ src_grf6_dw7, src_grf6_dw6, src_grf6_dw5, src_grf6_dw4,
+ src_grf6_dw3, src_grf6_dw2, src_grf6_dw1, src_grf6_dw0,
+ src_grf7_dw7, src_grf7_dw6, src_grf7_dw5, src_grf7_dw4,
+ src_grf7_dw3, src_grf7_dw2, src_grf7_dw1, src_grf7_dw0,
+ //msg_type
+ 2);
+
+ return ime_result;
+}
+
+ulong intel_sub_group_avc_ime_get_motion_vectors(intel_sub_group_avc_ime_result_t result){
+ uint lid_x = get_sub_group_local_id();
+ uint fwd_mv, bwd_mv;
+ if(lid_x < 4){
+ fwd_mv = intel_sub_group_shuffle(result.s0, 8 + lid_x*2);
+ bwd_mv = intel_sub_group_shuffle(result.s0, 9 + lid_x*2);
+ }
+ else if(lid_x >= 4 && lid_x <= 12){
+ fwd_mv = intel_sub_group_shuffle(result.s1, 0 + (lid_x-4)*2);
+ bwd_mv = intel_sub_group_shuffle(result.s1, 1 + (lid_x-4)*2);
+ }
+ else if(lid_x < 16){
+ fwd_mv = intel_sub_group_shuffle(result.s2, 0 + (lid_x-12)*2);
+ bwd_mv = intel_sub_group_shuffle(result.s2, 1 + (lid_x-12)*2);
+ }
+
+ ulong res = (bwd_mv << 32) | (fwd_mv & 0x00000000ffffffff);
+ return res;
+}
+
+ushort intel_sub_group_avc_ime_get_inter_distortions(intel_sub_group_avc_ime_result_t result){
+ uint lid_x = get_sub_group_local_id();
+ uint write_back_dw = intel_sub_group_shuffle(result.s2, 8 + lid_x/2);
+ int start_bit = lid_x%2 * 16;
+ ushort distortion = (write_back_dw >> start_bit);
+ return distortion;
+}
+
+uchar intel_sub_group_avc_ime_get_inter_major_shape(intel_sub_group_avc_ime_result_t result){
+ uint write_back_dw00 = intel_sub_group_shuffle(result.s0, 0);
+ uchar major_shape = write_back_dw00 & 0x03;
+ return major_shape;
+}
+
+uchar intel_sub_group_avc_ime_get_inter_minor_shapes(intel_sub_group_avc_ime_result_t result){
+ uint write_back_dw06 = intel_sub_group_shuffle(result.s0, 6);
+ uchar minor_shape = (write_back_dw06 >> 8) & 0xff;
+ return minor_shape;
+}
+
+uchar intel_sub_group_avc_ime_get_inter_directions(intel_sub_group_avc_ime_result_t result){
+ uint write_back_dw06 = intel_sub_group_shuffle(result.s0, 6);
+ uchar direction = (write_back_dw06 >> 16) & 0xff;
+ return direction;
+}
+
+intel_sub_group_avc_ref_payload_t
+intel_sub_group_avc_fme_initialize(ushort2 src_coord,
+ ulong motion_vectors,
+ uchar major_shapes,
+ uchar minor_shapes,
+ uchar directions,
+ uchar pixel_resolution,
+ uchar sad_adjustment ){
+ intel_sub_group_avc_ref_payload_t pl;
+ pl.srcCoord = src_coord;
+ pl.mv = motion_vectors;
+ pl.major_shape = major_shapes;
+ pl.minor_shapes = minor_shapes;
+ pl.directions = directions;
+ pl.pixel_mode = pixel_resolution;
+ pl.sad_adjustment = sad_adjustment;
+#if REF_ENABLE_COST_PENALTY
+ pl.cc0 = 0;
+ pl.cc1 = 0;
+ pl.cc2 = 0;
+ pl.cc3 = 0;
+ pl.packed_cost_table = (0, 0);
+ pl.cost_precision = 2;
+ pl.packed_shape_cost = 0;
+#endif
+ return pl;
+}
+
+intel_sub_group_avc_ref_result_t
+intel_sub_group_avc_ref_evaluate_with_single_reference(read_only image2d_t src_image,
+ read_only image2d_t ref_image,
+ sampler_t vme_media_sampler,
+ intel_sub_group_avc_ref_payload_t payload){
+ uint src_grf0_dw7;
+ uint src_grf0_dw6;
+ uint src_grf0_dw5;
+ uint src_grf0_dw4;
+ uint src_grf0_dw3;
+ uint src_grf0_dw2;
+ uint src_grf0_dw1;
+ uint src_grf0_dw0;
+ uint src_grf1_dw7;
+ uint src_grf1_dw6;
+ uint src_grf1_dw5;
+ uint src_grf1_dw4;
+ uint src_grf1_dw3;
+ uint src_grf1_dw2;
+ uint src_grf1_dw1;
+ uint src_grf1_dw0;
+ uint src_grf2_dw7;
+ uint src_grf2_dw6;
+ uint src_grf2_dw5;
+ uint src_grf2_dw4;
+ uint src_grf2_dw3;
+ uint src_grf2_dw2;
+ uint src_grf2_dw1;
+ uint src_grf2_dw0;
+ uint src_grf3_dw7;
+ uint src_grf3_dw6;
+ uint src_grf3_dw5;
+ uint src_grf3_dw4;
+ uint src_grf3_dw3;
+ uint src_grf3_dw2;
+ uint src_grf3_dw1;
+ uint src_grf3_dw0;
+ uint src_grf4_dw7;
+ uint src_grf4_dw6;
+ uint src_grf4_dw5;
+ uint src_grf4_dw4;
+ uint src_grf4_dw3;
+ uint src_grf4_dw2;
+ uint src_grf4_dw1;
+ uint src_grf4_dw0;
+ uint src_grf5_dw7;
+ uint src_grf5_dw6;
+ uint src_grf5_dw5;
+ uint src_grf5_dw4;
+ uint src_grf5_dw3;
+ uint src_grf5_dw2;
+ uint src_grf5_dw1;
+ uint src_grf5_dw0;
+ uint src_grf6_dw7;
+ uint src_grf6_dw6;
+ uint src_grf6_dw5;
+ uint src_grf6_dw4;
+ uint src_grf6_dw3;
+ uint src_grf6_dw2;
+ uint src_grf6_dw1;
+ uint src_grf6_dw0;
+ uint src_grf7_dw7;
+ uint src_grf7_dw6;
+ uint src_grf7_dw5;
+ uint src_grf7_dw4;
+ uint src_grf7_dw3;
+ uint src_grf7_dw2;
+ uint src_grf7_dw1;
+ uint src_grf7_dw0;
+
+
+ //src_grf0_dw7 = Debug;
+ src_grf0_dw7 = 0;
+ //src_grf0_dw6 = Debug;
+ src_grf0_dw6 = 0;
+ //src_grf0_dw5 = (Ref_Height << 24) | (Ref_Width << 16) | (Ignored << 8) | (Dispatch_Id);
+ src_grf0_dw5 = 0;
+ //src_grf0_dw4 = Ignored;
+ src_grf0_dw4 = 0;
+ //src_grf0_dw3 = (Reserved << 31) | (Sub_Mb_Part_Mask << 24) | (Intra_SAD << 22)
+ src_grf0_dw3 = (0 << 31) | (0 << 24) | (0 << 22)
+ //| (Inter_SAD << 20) | (BB_Skip_Enabled << 19) | (Reserverd << 18)
+ | (payload.sad_adjustment << 20)| (0 << 19) | (0 << 18)
+ //| (Dis_Aligned_Src_Fetch << 17) | (Dis_Aligned_Ref_Fetch << 16) | (Dis_Field_Cache_Alloc << 15)
+ | (0 << 17) | (0 << 16) | (0 << 15)
+ //| (Skip_Type << 14) | (Sub_Pel_Mode << 12) | (Dual_Search_Path_Opt << 11)
+ | (0 << 14) | (payload.pixel_mode << 12) | (0 << 11)
+ //| (Search_Ctrl << 8) | (Ref_Access << 7) | (SrcAccess << 6)
+ | (0 << 8) | (0 << 7) | (0 << 6)
+ //| (Mb_Type_Remap << 4) | (Reserved_Workaround << 3) | (Reserved_Workaround << 2)
+ | (0 << 4) | (0 << 3) | (0 << 2)
+ //| (Src_Size);
+ | (0);
+ //src_grf0_dw2 = (SrcY << 16) | (SrcX);
+ src_grf0_dw2 = (payload.srcCoord.y << 16) | (payload.srcCoord.x);
+ //src_grf0_dw1 = (Ref1Y << 16) | (Ref1X);
+ src_grf0_dw1 = 0;
+ //src_grf0_dw0 = (Ref0Y << 16) | (Ref0X);
+ src_grf0_dw0 = 0;
+
+
+ /*src_grf1_dw7 = (Skip_Center_Mask << 24) | (Reserved << 22) | (Ref1_Field_Polarity << 21)
+ | (Ref0_Field_Polarity << 20) | (Src_Field_Polarity << 19) | (Bilinear_Enable << 18)
+ | (MV_Cost_Scale_Factor << 16) | (Mb_Intra_Struct << 8) | (Intra_Corner_Swap << 7)
+ | (Non_Skip_Mode_Added << 6) | (Non_Skip_ZMv_Added << 5) | (IntraPartMask);*/
+ src_grf1_dw7 = 0;
+ //src_grf1_dw6 = Reserved;
+ src_grf1_dw6 = 0;
+ /*src_grf1_dw5 = Reseverd for BDW+
+ src_grf1_dw4 = Reseverd for BDW+*/
+ src_grf1_dw5 = 0;
+ src_grf1_dw4 = 0;
+ //src_grf1_dw3 = Weighted SAD Control Sub-block 0...15
+ src_grf1_dw3 = 0;
+ //src_grf1_dw2 = (Start1Y << 28) | (Start1X << 24) | (Start0Y << 20)
+ //| (Start0X << 16) | (Max_Num_SU << 8) | (LenSP);
+ src_grf1_dw2 = 0;
+ /*src_grf1_dw1 = (RepartEn << 31) | (FBPrunEn << 30) | (AdaptiveValidationControl << 29)
+ | (Uni_Mix_Disable << 28) | (Bi_Sub_Mb_Part_Mask << 24) | (Reserverd << 22)
+ | (Bi_Weight << 16) | (Reserved << 6) | (MaxNumMVs);*/
+ //src_grf1_dw1 = (0 << 24) | (2);
+ src_grf1_dw1 = (0 << 24) | (16);
+ /*src_grf1_dw0 = (Early_Ime_Stop << 24) | (Early_Fme_Success << 16) | (Skip_Success << 8)
+ | (T8x8_Flag_For_Inter_En << 7) | (Quit_Inter_En << 6) | (Early_Ime_Success_En << 5)
+ | (Early_Success_En << 4) | (Part_Candidate_En << 3) | (Bi_Mix_Dis << 2)
+ | (Adaptive_En << 1) | (SkipModeEn);*/
+ src_grf1_dw0 = 0;
+
+ //src_grf2_dw7 = SIC Forward Transform Coeff Threshold Matrix[3...6]
+ src_grf2_dw7 = 0;
+ //src_grf2_dw6 = SIC Forward Transform Coeff Threshold Matrix[0...2]
+ src_grf2_dw6 = 0;
+ //src_grf2_dw5 = (Reserved << 24) | (FBR_SubPredMode_Input << 16) | (FBR_SubMBShape_Input << 8) | (Reserved << 2) | (FBR_MbMode_Input);
+ src_grf2_dw5 = (0 << 24) | (payload.directions << 16) | (payload.minor_shapes << 8) | (payload.major_shape);
+#if REF_ENABLE_COST_PENALTY
+ //src_grf2_dw4 = MV_4_Cost ... MV_7_Cost;
+ src_grf2_dw4 = payload.packed_cost_table.s1;
+ //src_grf2_dw3 = MV_0_Cost ... MV_3_Cost;
+ src_grf2_dw3 = payload.packed_cost_table.s0;
+ //src_grf2_dw2 = ... Mode 8 Cost;
+ src_grf2_dw2 = (payload.packed_shape_cost >> 32) & 0x000000ff;
+ //src_grf2_dw1 = Mode 4 Cost ... Mode 7 Cost
+ src_grf2_dw1 = payload.packed_shape_cost;
+ src_grf2_dw0 = 0;
+ //src_grf3_dw7 = (BWDCostCenter3Y << 16) | (BWDCostCenter3X) ;
+ src_grf3_dw7 = payload.cc3 >> 32;
+ //src_grf3_dw6 = (FWDCostCenter3Y << 16) | (FWDCostCenter3X) ;
+ src_grf3_dw6 = payload.cc3;
+ //src_grf3_dw5 = (BWDCostCenter2Y << 16) | (BWDCostCenter2X) ;
+ src_grf3_dw5 = payload.cc2 >> 32;
+ //src_grf3_dw4 = (FWDCostCenter2Y << 16) | (FWDCostCenter2X) ;
+ src_grf3_dw4 = payload.cc2;
+ //src_grf3_dw3 = (BWDCostCenter1Y << 16) | (BWDCostCenter1X) ;
+ src_grf3_dw3 = payload.cc1 >> 32;
+ //src_grf3_dw2 = (FWDCostCenter1Y << 16) | (FWDCostCenter1X) ;
+ src_grf3_dw2 = payload.cc1;
+ //src_grf3_dw1 = (BWDCostCenter0Y << 16) | (BWDCostCenter0X) ;
+ src_grf3_dw1 = payload.cc0 >> 32;
+ //src_grf3_dw0 = (FWDCostCenter0Y << 16) | (FWDCostCenter0X) ;
+ src_grf3_dw0 = payload.cc0;
+#else
+ src_grf2_dw4 = 0;
+ src_grf2_dw3 = 0;
+ src_grf2_dw2 = 0;
+ src_grf2_dw1 = 0;
+ src_grf2_dw0 = 0;
+ src_grf3_dw7 = 0;
+ src_grf3_dw6 = 0;
+ src_grf3_dw5 = 0;
+ src_grf3_dw4 = 0;
+ src_grf3_dw3 = 0;
+ src_grf3_dw2 = 0;
+ src_grf3_dw1 = 0;
+ src_grf3_dw0 = 0;
+#endif
+
+ //grf4...grf7 = Ref0/1 Sub-block XY 0...15
+ int2 bi_mv_temp = as_int2( payload.mv );
+ int2 bi_mv = intel_sub_group_shuffle(bi_mv_temp, 3);
+ src_grf4_dw7 = bi_mv.s1;
+ src_grf4_dw6 = bi_mv.s0;
+ bi_mv = intel_sub_group_shuffle(bi_mv_temp, 2);
+ src_grf4_dw5 = bi_mv.s1;
+ src_grf4_dw4 = bi_mv.s0;
+ bi_mv = intel_sub_group_shuffle(bi_mv_temp, 1);
+ src_grf4_dw3 = bi_mv.s1;
+ src_grf4_dw2 = bi_mv.s0;
+ bi_mv = intel_sub_group_shuffle(bi_mv_temp, 0);
+ src_grf4_dw1 = bi_mv.s1;
+ src_grf4_dw0 = bi_mv.s0;
+
+ bi_mv = intel_sub_group_shuffle(bi_mv_temp, 7);
+ src_grf5_dw7 = bi_mv.s1;
+ src_grf5_dw6 = bi_mv.s0;
+ bi_mv = intel_sub_group_shuffle(bi_mv_temp, 6);
+ src_grf5_dw5 = bi_mv.s1;
+ src_grf5_dw4 = bi_mv.s0;
+ bi_mv = intel_sub_group_shuffle(bi_mv_temp, 5);
+ src_grf5_dw3 = bi_mv.s1;
+ src_grf5_dw2 = bi_mv.s0;
+ bi_mv = intel_sub_group_shuffle(bi_mv_temp, 4);
+ src_grf5_dw1 = bi_mv.s1;
+ src_grf5_dw0 = bi_mv.s0;
+
+ bi_mv = intel_sub_group_shuffle(bi_mv_temp, 11);
+ src_grf6_dw7 = bi_mv.s1;
+ src_grf6_dw6 = bi_mv.s0;
+ bi_mv = intel_sub_group_shuffle(bi_mv_temp, 10);
+ src_grf6_dw5 = bi_mv.s1;
+ src_grf6_dw4 = bi_mv.s0;
+ bi_mv = intel_sub_group_shuffle(bi_mv_temp, 9);
+ src_grf6_dw3 = bi_mv.s1;
+ src_grf6_dw2 = bi_mv.s0;
+ bi_mv = intel_sub_group_shuffle(bi_mv_temp, 8);
+ src_grf6_dw1 = bi_mv.s1;
+ src_grf6_dw0 = bi_mv.s0;
+
+ bi_mv = intel_sub_group_shuffle(bi_mv_temp, 15);
+ src_grf7_dw7 = bi_mv.s1;
+ src_grf7_dw6 = bi_mv.s0;
+ bi_mv = intel_sub_group_shuffle(bi_mv_temp, 14);
+ src_grf7_dw5 = bi_mv.s1;
+ src_grf7_dw4 = bi_mv.s0;
+ bi_mv = intel_sub_group_shuffle(bi_mv_temp, 13);
+ src_grf7_dw3 = bi_mv.s1;
+ src_grf7_dw2 = bi_mv.s0;
+ bi_mv = intel_sub_group_shuffle(bi_mv_temp, 12);
+ src_grf7_dw1 = bi_mv.s1;
+ src_grf7_dw0 = bi_mv.s0;
+
+ intel_sub_group_avc_ref_result_t ref_result;
+ ref_result = __gen_ocl_ime(src_image, ref_image,
+ src_grf0_dw7, src_grf0_dw6, src_grf0_dw5, src_grf0_dw4,
+ src_grf0_dw3, src_grf0_dw2, src_grf0_dw1, src_grf0_dw0,
+ src_grf1_dw7, src_grf1_dw6, src_grf1_dw5, src_grf1_dw4,
+ src_grf1_dw3, src_grf1_dw2, src_grf1_dw1, src_grf1_dw0,
+ src_grf2_dw7, src_grf2_dw6, src_grf2_dw5, src_grf2_dw4,
+ src_grf2_dw3, src_grf2_dw2, src_grf2_dw1, src_grf2_dw0,
+ src_grf3_dw7, src_grf3_dw6, src_grf3_dw5, src_grf3_dw4,
+ src_grf3_dw3, src_grf3_dw2, src_grf3_dw1, src_grf3_dw0,
+ src_grf4_dw7, src_grf4_dw6, src_grf4_dw5, src_grf4_dw4,
+ src_grf4_dw3, src_grf4_dw2, src_grf4_dw1, src_grf4_dw0,
+ src_grf5_dw7, src_grf5_dw6, src_grf5_dw5, src_grf5_dw4,
+ src_grf5_dw3, src_grf5_dw2, src_grf5_dw1, src_grf5_dw0,
+ src_grf6_dw7, src_grf6_dw6, src_grf6_dw5, src_grf6_dw4,
+ src_grf6_dw3, src_grf6_dw2, src_grf6_dw1, src_grf6_dw0,
+ src_grf7_dw7, src_grf7_dw6, src_grf7_dw5, src_grf7_dw4,
+ src_grf7_dw3, src_grf7_dw2, src_grf7_dw1, src_grf7_dw0,
+ //msg_type
+ 3);
+
+ return ref_result;
+}
+
+ulong intel_sub_group_avc_ref_get_motion_vectors(intel_sub_group_avc_ref_result_t result){
+ uint lid_x = get_sub_group_local_id();
+ uint fwd_mv, bwd_mv;
+ if(lid_x < 4){
+ fwd_mv = intel_sub_group_shuffle(result.s0, 8 + lid_x*2);
+ bwd_mv = intel_sub_group_shuffle(result.s0, 9 + lid_x*2);
+ }
+ else if(lid_x >= 4 && lid_x <= 12){
+ fwd_mv = intel_sub_group_shuffle(result.s1, 0 + (lid_x-4)*2);
+ bwd_mv = intel_sub_group_shuffle(result.s1, 1 + (lid_x-4)*2);
+ }
+ else if(lid_x < 16){
+ fwd_mv = intel_sub_group_shuffle(result.s2, 0 + (lid_x-12)*2);
+ bwd_mv = intel_sub_group_shuffle(result.s2, 1 + (lid_x-12)*2);
+ }
+
+ ulong res = (bwd_mv << 32) | (fwd_mv & 0x00000000ffffffff);
+ return res;
+}
+
+ushort intel_sub_group_avc_ref_get_inter_distortions(intel_sub_group_avc_ref_result_t result){
+ uint lid_x = get_sub_group_local_id();
+ uint write_back_dw = intel_sub_group_shuffle(result.s2, 8 + lid_x/2);
+ int start_bit = lid_x%2 * 16;
+ ushort distortion = (write_back_dw >> start_bit);
+ return distortion;
+}
+
+uint2 intel_sub_group_avc_mce_get_default_medium_penalty_cost_table(void){
+ #define COST_PENALTY(idx, base, shift) \
+ uchar cost_penalty_##idx = (shift << 4) | (base);
+
+ COST_PENALTY(0, 1, 0)
+ COST_PENALTY(1, 1, 0)
+ COST_PENALTY(2, 1, 0)
+ COST_PENALTY(3, 1, 0)
+ COST_PENALTY(4, 1, 0)
+ COST_PENALTY(5, 1, 0)
+ COST_PENALTY(6, 1, 0)
+ COST_PENALTY(7, 1, 0)
+ uint2 cost_table;
+ cost_table.s0 = cost_penalty_0 | (cost_penalty_1 << 8) | ( cost_penalty_2 << 16) | (cost_penalty_3 << 24);
+ cost_table.s1 = cost_penalty_4 | (cost_penalty_5 << 8) | ( cost_penalty_6 << 16) | (cost_penalty_7 << 24);
+ return cost_table;
+}
+
+intel_sub_group_avc_ime_payload_t
+intel_sub_group_avc_ime_set_motion_vector_cost_function(ulong packed_cost_center_delta,
+ uint2 packed_cost_table,
+ uchar cost_precision,
+ intel_sub_group_avc_ime_payload_t payload){
+ intel_sub_group_avc_ime_payload_t pl = payload;
+ pl.packed_cost_table = packed_cost_table;
+ pl.cost_precision = cost_precision;
+
+ uint lid_x = get_sub_group_local_id();
+ if(lid_x == 0)
+ pl.cc0 = packed_cost_center_delta;
+ else if(lid_x == 1)
+ pl.cc1 = packed_cost_center_delta;
+ else if(lid_x == 2)
+ pl.cc2 = packed_cost_center_delta;
+ else if(lid_x == 3)
+ pl.cc3 = packed_cost_center_delta;
+ else{
+ }
+ return pl;
+}
+
+#if REF_ENABLE_COST_PENALTY
+intel_sub_group_avc_ref_payload_t
+intel_sub_group_avc_ref_set_motion_vector_cost_function(ulong packed_cost_center_delta,
+ uint2 packed_cost_table,
+ uchar cost_precision,
+ intel_sub_group_avc_ref_payload_t payload){
+ intel_sub_group_avc_ref_payload_t pl = payload;
+ pl.packed_cost_table = packed_cost_table;
+ pl.cost_precision = cost_precision;
+
+ uint lid_x = get_sub_group_local_id();
+ if(lid_x == 0)
+ pl.cc0 = packed_cost_center_delta;
+ else if(lid_x == 1)
+ pl.cc1 = packed_cost_center_delta;
+ else if(lid_x == 2)
+ pl.cc2 = packed_cost_center_delta;
+ else if(lid_x == 3)
+ pl.cc3 = packed_cost_center_delta;
+ else{
+ }
+ return pl;
+}
+
+#endif
+
+intel_sub_group_avc_ime_payload_t
+intel_sub_group_avc_ime_set_inter_shape_penalty(ulong packed_shape_cost,
+ intel_sub_group_avc_ime_payload_t payload){
+ intel_sub_group_avc_ime_payload_t pl = payload;
+ pl.packed_shape_cost = packed_shape_cost;
+ return pl;
+}
+
+intel_sub_group_avc_sic_result_t
+intel_sub_group_avc_sic_evaluate_ipe(read_only image2d_t src_image,
+ sampler_t vme_media_sampler,
+ intel_sub_group_avc_sic_payload_t payload){
+ uint src_grf0_dw7;
+ uint src_grf0_dw6;
+ uint src_grf0_dw5;
+ uint src_grf0_dw4;
+ uint src_grf0_dw3;
+ uint src_grf0_dw2;
+ uint src_grf0_dw1;
+ uint src_grf0_dw0;
+ uint src_grf1_dw7;
+ uint src_grf1_dw6;
+ uint src_grf1_dw5;
+ uint src_grf1_dw4;
+ uint src_grf1_dw3;
+ uint src_grf1_dw2;
+ uint src_grf1_dw1;
+ uint src_grf1_dw0;
+ uint src_grf2_dw7;
+ uint src_grf2_dw6;
+ uint src_grf2_dw5;
+ uint src_grf2_dw4;
+ uint src_grf2_dw3;
+ uint src_grf2_dw2;
+ uint src_grf2_dw1;
+ uint src_grf2_dw0;
+ uint src_grf3_dw7;
+ uint src_grf3_dw6;
+ uint src_grf3_dw5;
+ uint src_grf3_dw4;
+ uint src_grf3_dw3;
+ uint src_grf3_dw2;
+ uint src_grf3_dw1;
+ uint src_grf3_dw0;
+ uint src_grf4_dw7;
+ uint src_grf4_dw6;
+ uint src_grf4_dw5;
+ uint src_grf4_dw4;
+ uint src_grf4_dw3;
+ uint src_grf4_dw2;
+ uint src_grf4_dw1;
+ uint src_grf4_dw0;
+ uint src_grf5_dw7;
+ uint src_grf5_dw6;
+ uint src_grf5_dw5;
+ uint src_grf5_dw4;
+ uint src_grf5_dw3;
+ uint src_grf5_dw2;
+ uint src_grf5_dw1;
+ uint src_grf5_dw0;
+ uint src_grf6_dw7;
+ uint src_grf6_dw6;
+ uint src_grf6_dw5;
+ uint src_grf6_dw4;
+ uint src_grf6_dw3;
+ uint src_grf6_dw2;
+ uint src_grf6_dw1;
+ uint src_grf6_dw0;
+ uint src_grf7_dw7;
+ uint src_grf7_dw6;
+ uint src_grf7_dw5;
+ uint src_grf7_dw4;
+ uint src_grf7_dw3;
+ uint src_grf7_dw2;
+ uint src_grf7_dw1;
+ uint src_grf7_dw0;
+
+
+ //src_grf0_dw7 = Debug;
+ src_grf0_dw7 = 0;
+ //src_grf0_dw6 = Debug;
+ src_grf0_dw6 = 0;
+ //src_grf0_dw5 = (Ref_Height << 24) | (Ref_Width << 16) | (Ignored << 8) | (Dispatch_Id);
+ src_grf0_dw5 = 0;
+ //src_grf0_dw4 = Ignored;
+ src_grf0_dw4 = 0;
+ //src_grf0_dw3 = (Reserved << 31) | (Sub_Mb_Part_Mask << 24) | (Intra_SAD << 22)
+ src_grf0_dw3 = (0 << 31) | (0 << 24) | (payload.intra_sad_adjustment << 22)
+ //| (Inter_SAD << 20) | (BB_Skip_Enabled << 19) | (Reserverd << 18)
+ | (0 << 20) | (0 << 19) | (0 << 18)
+ //| (Dis_Aligned_Src_Fetch << 17) | (Dis_Aligned_Ref_Fetch << 16) | (Dis_Field_Cache_Alloc << 15)
+ | (0 << 17) | (0 << 16) | (0 << 15)
+ //| (Skip_Type << 14) | (Sub_Pel_Mode << 12) | (Dual_Search_Path_Opt << 11)
+ | (0 << 14) | (0 << 12) | (0 << 11)
+ //| (Search_Ctrl << 8) | (Ref_Access << 7) | (SrcAccess << 6)
+ | (0 << 8) | (0 << 7) | (0 << 6)
+ //| (Mb_Type_Remap << 4) | (Reserved_Workaround << 3) | (Reserved_Workaround << 2)
+ | (0 << 4) | (0 << 3) | (0 << 2)
+ //| (Src_Size);
+ | (0);
+ //src_grf0_dw2 = (SrcY << 16) | (SrcX);
+ src_grf0_dw2 = (payload.srcCoord.y<<16) | (payload.srcCoord.x);
+ //src_grf0_dw1 = (Ref1Y << 16) | (Ref1X);
+ src_grf0_dw1 = 0;
+ //src_grf0_dw0 = (Ref0Y << 16) | (Ref0X);
+ src_grf0_dw0 = 0;
+
+ //src_grf1_dw7 = (Skip_Center_Mask << 24) | (Reserved << 22) | (Ref1_Field_Polarity << 21)
+ src_grf1_dw7 = (0 << 24) | (0 << 22) | (0 << 21)
+ //| (Ref0_Field_Polarity << 20) | (Src_Field_Polarity << 19) | (Bilinear_Enable << 18)
+ | (0 << 20) | (0 << 19) | (0 << 18)
+ //| (MV_Cost_Scale_Factor << 16) | (Mb_Intra_Struct << 8) | (Intra_Corner_Swap << 7)
+ | (0 << 16) | (payload.intra_neighbour_availabilty << 8) | (0 << 7)
+ //| (Non_Skip_Mode_Added << 6) | (Non_Skip_ZMv_Added << 5) | (IntraPartMask);
+ | (0 << 6) | (0 << 5) | (payload.luma_intra_partition_mask);
+ //src_grf1_dw6 = Reserved;
+ src_grf1_dw6 = 0;
+ /*src_grf1_dw5 = Reseverd for BDW+
+ src_grf1_dw4 = Reseverd for BDW+*/
+ src_grf1_dw5 = 0;
+ src_grf1_dw4 = 0;
+ //src_grf1_dw3 = Weighted SAD Control Sub-block 0...15
+ src_grf1_dw3 = 0;
+ //src_grf1_dw2 = (Start1Y << 28) | (Start1X << 24) | (Start0Y << 20)
+ //| (Start0X << 16) | (Max_Num_SU << 8) | (LenSP);
+ src_grf1_dw2 = 0;
+
+ /*src_grf1_dw1 = (RepartEn << 31) | (FBPrunEn << 30) | (AdaptiveValidationControl << 29)
+ | (Uni_Mix_Disable << 28) | (Bi_Sub_Mb_Part_Mask << 24) | (Reserverd << 22)
+ | (Bi_Weight << 16) | (Reserved << 6) | (MaxNumMVs);*/
+ src_grf1_dw1 = 0;
+ /*src_grf1_dw0 = (Early_Ime_Stop << 24) | (Early_Fme_Success << 16) | (Skip_Success << 8)
+ | (T8x8_Flag_For_Inter_En << 7) | (Quit_Inter_En << 6) | (Early_Ime_Success_En << 5)
+ | (Early_Success_En << 4) | (Part_Candidate_En << 3) | (Bi_Mix_Dis << 2)
+ | (Adaptive_En << 1) | (SkipModeEn);*/
+ src_grf1_dw0 = 0;
+
+ //cost related
+ src_grf2_dw7 = 0;
+ src_grf2_dw6 = 0;
+ src_grf2_dw5 = 0;
+ src_grf2_dw4 = 0;
+ src_grf2_dw3 = 0;
+ src_grf2_dw2 = 0;
+ src_grf2_dw1 = 0;
+ //src_grf2_dw0 = (MODE_INTRA_4x4 << 24) | (MODE_INTRA_8x8 << 16) | (MODE_INTRA_16x16 << 8) | (MODE_INTRA_NONPRED);
+ src_grf2_dw0 = payload.intra_shape_cost;
+ src_grf3_dw7 = 0;
+ src_grf3_dw6 = 0;
+ src_grf3_dw5 = 0;
+ src_grf3_dw4 = 0;
+ src_grf3_dw3 = 0;
+ src_grf3_dw2 = 0;
+ src_grf3_dw1 = 0;
+ src_grf3_dw0 = 0;
+
+ //Ref* SkipCenter* Delta XY
+ /*src_grf4_dw7 = Ref1_SkipCenter_3_Delta_XY;
+ src_grf4_dw6 = Ref0_SkipCenter_3_Delta_XY;
+ src_grf4_dw5 = Ref1_SkipCenter_2_Delta_XY;
+ src_grf4_dw4 = Ref0_SkipCenter_3_Delta_XY;
+ src_grf4_dw3 = Ref1_SkipCenter_1_Delta_XY;
+ src_grf4_dw2 = Ref0_SkipCenter_1_Delta_XY;
+ src_grf4_dw1 = Ref1_SkipCenter_0_Delta_XY;
+ src_grf4_dw0 = (Ref0_Skip_Center_0_Delta_Y << 16) | (Ref0_Skip_Center_0_Delta_X);*/
+ src_grf4_dw7 = 0;
+ src_grf4_dw6 = 0;
+ src_grf4_dw5 = 0;
+ src_grf4_dw4 = 0;
+ src_grf4_dw3 = 0;
+ src_grf4_dw2 = 0;
+ src_grf4_dw1 = 0;
+ src_grf4_dw0 = 0;
+
+ //src_grf5_dw7 = Neighbor pixel Luma value [23, -1] to [20, -1];
+ src_grf5_dw7 = payload.ur_20_23;
+ //src_grf5_dw6 = Neighbor pixel Luma value [19, -1] to [16, -1];
+ src_grf5_dw6 = payload.ur_16_19;
+ //src_grf5_dw5 = Neighbor pixel Luma value [15, -1] to [12, -1];
+ src_grf5_dw5 = payload.u_12_15;
+ //src_grf5_dw4 = Neighbor pixel Luma value [11, -1] to [8, -1];
+ src_grf5_dw4 = payload.u_8_11;
+ //src_grf5_dw3 = Neighbor pixel Luma value [7, -1] to [4, -1];
+ src_grf5_dw3 = payload.u_4_7;
+ //src_grf5_dw2 = (Neighbor pixel Luma value [3, -1] << 24) | (Neighbor pixel Luma value [2, -1] << 16)
+ //| (Neighbor pixel Luma value [1, -1] << 8) | (Neighbor pixel Luma value [0, -1]);
+ src_grf5_dw2 = payload.u_0_3;
+ uchar mode_mask_16_16 = 0xf;
+ ushort mode_mask_8_8 = 0x01ff, mode_mask_4_4 = 0x01ff;
+ if(payload.luma_intra_partition_mask == CLK_AVC_ME_INTRA_LUMA_PARTITION_MASK_ALL_INTEL){
+ mode_mask_16_16 = 0;
+ mode_mask_8_8 = 0;
+ mode_mask_4_4 = 0;
+ }
+ else if(payload.luma_intra_partition_mask == CLK_AVC_ME_INTRA_LUMA_PARTITION_MASK_16x16_INTEL){
+ mode_mask_16_16 = 0;
+ }
+ else if(payload.luma_intra_partition_mask == CLK_AVC_ME_INTRA_LUMA_PARTITION_MASK_8x8_INTEL){
+ mode_mask_8_8 = 0;
+ }
+ else if(payload.luma_intra_partition_mask == CLK_AVC_ME_INTRA_LUMA_PARTITION_MASK_4x4_INTEL){
+ mode_mask_4_4 = 0;
+ }
+ //src_grf5_dw1 = (Corner_Neighbor_pixel_0 << 24) | (Reserved << 10) | (IntraComputeType << 8)
+ //| (IntraChromaModeMask << 4) | (Intra16x16ModeMask);
+ src_grf5_dw1 = (payload.upper_left_corner_luma_pixel << 24) | (0 << 10) | (1 << 8) | (0xf << 4) | (mode_mask_16_16);
+ //src_grf5_dw0 = (Reserved<<25) | (Intra_8x8_Mode_Mask << 16) | (Reserved<<9) | (Intra_4x4_Mode_Mask);
+ src_grf5_dw0 = (0<<25) | (mode_mask_8_8 << 16) | (0<<9) | (mode_mask_4_4);
+ //src_grf6_dw7 = (Reserved << 24) | (Penalty_4x4_non_DC << 16) | (Penalty_8x8_non_DC << 8) | (Penalty_16x16_non_DC);
+ src_grf6_dw7 = 0;
+ //src_grf6_dw6 = Reserved;
+ src_grf6_dw6 = 0;
+ //src_grf6_dw5 = (Reserved << 16) | (Neighbor pixel Chroma value CbCr pair [-1, -1]);
+ src_grf6_dw5 = 0;
+ //src_grf6_dw4 = (Intra_MxM_Pred_Mode_B15 << 28) | (Intra_MxM_Pred_Mode_B14 << 24) | (Intra_MxM_Pred_Mode_B11 << 20)
+ //| (Intra_MxM_Pred_Mode_B10 << 16) | (Intra_MxM_Pred_Mode_A15 << 12) | (Intra_MxM_Pred_Mode_A13 << 8)
+ //| (Intra_MxM_Pred_Mode_A7 << 4) | (Intra_MxM_Pred_Mode_A5);
+ //XXX: Which value should be set to?
+ src_grf6_dw4 = (2 << 28) | (2 << 24) | (2 << 20)
+ | (2 << 16) | (2 << 12) | (2 << 8)
+ | (2 << 4) | (2);
+ //src_grf6_dw3 = (Corner_Neighbor_pixel_1 << 24) | (Neighbor pixel Luma value [-1, 14] to [-1, 12]);
+ src_grf6_dw3 = payload.l_12_15;
+ //src_grf6_dw2 = Neighbor pixel Luma value [-1, 11] to [-1, 8];
+ src_grf6_dw2 = payload.l_8_11;
+ //src_grf6_dw1 = Neighbor pixel Luma value [-1, 7] to [-1, 4];
+ src_grf6_dw1 = payload.l_4_7;
+ //src_grf6_dw0 = (Neighbor pixel Luma value [-1, 3] << 24) | (Neighbor pixel Luma value [-1, 2] << 16)
+ //| (Neighbor pixel Luma value [-1, 1] << 8) | (Neighbor pixel Luma value [-1, 0]);
+ src_grf6_dw0 = payload.l_0_3;
+
+
+ //chroma related
+ src_grf7_dw7 = 0;
+ src_grf7_dw6 = 0;
+ src_grf7_dw5 = 0;
+ src_grf7_dw4 = 0;
+ src_grf7_dw3 = 0;
+ src_grf7_dw2 = 0;
+ src_grf7_dw1 = 0;
+ src_grf7_dw0 = 0;
+
+
+ intel_sub_group_avc_sic_result_t ime_result;
+ ime_result = __gen_ocl_ime(src_image, src_image,
+ src_grf0_dw7, src_grf0_dw6, src_grf0_dw5, src_grf0_dw4,
+ src_grf0_dw3, src_grf0_dw2, src_grf0_dw1, src_grf0_dw0,
+ src_grf1_dw7, src_grf1_dw6, src_grf1_dw5, src_grf1_dw4,
+ src_grf1_dw3, src_grf1_dw2, src_grf1_dw1, src_grf1_dw0,
+ src_grf2_dw7, src_grf2_dw6, src_grf2_dw5, src_grf2_dw4,
+ src_grf2_dw3, src_grf2_dw2, src_grf2_dw1, src_grf2_dw0,
+ src_grf3_dw7, src_grf3_dw6, src_grf3_dw5, src_grf3_dw4,
+ src_grf3_dw3, src_grf3_dw2, src_grf3_dw1, src_grf3_dw0,
+ src_grf4_dw7, src_grf4_dw6, src_grf4_dw5, src_grf4_dw4,
+ src_grf4_dw3, src_grf4_dw2, src_grf4_dw1, src_grf4_dw0,
+ src_grf5_dw7, src_grf5_dw6, src_grf5_dw5, src_grf5_dw4,
+ src_grf5_dw3, src_grf5_dw2, src_grf5_dw1, src_grf5_dw0,
+ src_grf6_dw7, src_grf6_dw6, src_grf6_dw5, src_grf6_dw4,
+ src_grf6_dw3, src_grf6_dw2, src_grf6_dw1, src_grf6_dw0,
+ src_grf7_dw7, src_grf7_dw6, src_grf7_dw5, src_grf7_dw4,
+ src_grf7_dw3, src_grf7_dw2, src_grf7_dw1, src_grf7_dw0,
+ //msg_type
+ 1);
+
+ return ime_result;
+}
+
+intel_sub_group_avc_sic_payload_t
+intel_sub_group_avc_sic_initialize(ushort2 src_coord ){
+ intel_sub_group_avc_sic_payload_t pl;
+ pl.srcCoord = src_coord;
+ pl.intra_shape_cost = 0;
+ return pl;
+}
+
+intel_sub_group_avc_sic_payload_t
+intel_sub_group_avc_sic_configure_ipe(uchar luma_intra_partition_mask,
+ uchar intra_neighbour_availabilty,
+ uchar left_edge_luma_pixels,
+ uchar upper_left_corner_luma_pixel,
+ uchar upper_edge_luma_pixels,
+ uchar upper_right_edge_luma_pixels,
+ uchar intra_sad_adjustment,
+ intel_sub_group_avc_sic_payload_t payload ){
+ intel_sub_group_avc_sic_payload_t pl = payload;
+ pl.luma_intra_partition_mask = luma_intra_partition_mask;
+ pl.intra_neighbour_availabilty = intra_neighbour_availabilty;
+ uchar pixel[16];
+ for(uint i = 0; i < 16; i++)
+ pixel[i] = intel_sub_group_shuffle(left_edge_luma_pixels, i);
+
+ pl.l_0_3 = (pixel[3] << 24) | (pixel[2] << 16) | (pixel[1] << 8) | (pixel[0]);
+ pl.l_4_7 = (pixel[7] << 24) | (pixel[6] << 16) | (pixel[5] << 8) | (pixel[4]);
+ pl.l_8_11 = (pixel[11] << 24) | (pixel[10] << 16) | (pixel[9] << 8) | (pixel[8]);
+ pl.l_12_15 = (pixel[15] << 24) | (pixel[14] << 16) | (pixel[13] << 8) | (pixel[12]);
+
+ for(uint i = 0; i < 16; i++)
+ pixel[i] = intel_sub_group_shuffle(upper_edge_luma_pixels, i);
+ pl.u_0_3 = (pixel[3] << 24) | (pixel[2] << 16) | (pixel[1] << 8) | (pixel[0]);
+ pl.u_4_7 = (pixel[7] << 24) | (pixel[6] << 16) | (pixel[5] << 8) | (pixel[4]);
+ pl.u_8_11 = (pixel[11] << 24) | (pixel[10] << 16) | (pixel[9] << 8) | (pixel[8]);
+ pl.u_12_15 = (pixel[15] << 24) | (pixel[14] << 16) | (pixel[13] << 8) | (pixel[12]);
+
+ for(uint i = 0; i < 8; i++)
+ pixel[i] = intel_sub_group_shuffle(upper_right_edge_luma_pixels, i);
+ pl.ur_16_19 = (pixel[3] << 24) | (pixel[2] << 16) | (pixel[1] << 8) | (pixel[0]);
+ pl.ur_20_23 = (pixel[7] << 24) | (pixel[6] << 16) | (pixel[5] << 8) | (pixel[4]);
+
+ pl.upper_left_corner_luma_pixel = upper_left_corner_luma_pixel;
+ pl.intra_sad_adjustment = intra_sad_adjustment;
+ return pl;
+}
+
+intel_sub_group_avc_sic_payload_t
+intel_sub_group_avc_sic_set_intra_luma_shape_penalty(uint packed_shape_cost,
+ intel_sub_group_avc_sic_payload_t payload ){
+ intel_sub_group_avc_sic_payload_t pl = payload;
+ pl.intra_shape_cost = packed_shape_cost;
+ return pl;
+}
+
+intel_sub_group_avc_sic_result_t
+intel_sub_group_avc_sic_evaluate_with_single_reference(read_only image2d_t src_image,
+ read_only image2d_t ref_image,
+ sampler_t vme_media_sampler,
+ intel_sub_group_avc_sic_payload_t payload){
+ uint src_grf0_dw7;
+ uint src_grf0_dw6;
+ uint src_grf0_dw5;
+ uint src_grf0_dw4;
+ uint src_grf0_dw3;
+ uint src_grf0_dw2;
+ uint src_grf0_dw1;
+ uint src_grf0_dw0;
+ uint src_grf1_dw7;
+ uint src_grf1_dw6;
+ uint src_grf1_dw5;
+ uint src_grf1_dw4;
+ uint src_grf1_dw3;
+ uint src_grf1_dw2;
+ uint src_grf1_dw1;
+ uint src_grf1_dw0;
+ uint src_grf2_dw7;
+ uint src_grf2_dw6;
+ uint src_grf2_dw5;
+ uint src_grf2_dw4;
+ uint src_grf2_dw3;
+ uint src_grf2_dw2;
+ uint src_grf2_dw1;
+ uint src_grf2_dw0;
+ uint src_grf3_dw7;
+ uint src_grf3_dw6;
+ uint src_grf3_dw5;
+ uint src_grf3_dw4;
+ uint src_grf3_dw3;
+ uint src_grf3_dw2;
+ uint src_grf3_dw1;
+ uint src_grf3_dw0;
+ uint src_grf4_dw7;
+ uint src_grf4_dw6;
+ uint src_grf4_dw5;
+ uint src_grf4_dw4;
+ uint src_grf4_dw3;
+ uint src_grf4_dw2;
+ uint src_grf4_dw1;
+ uint src_grf4_dw0;
+ uint src_grf5_dw7;
+ uint src_grf5_dw6;
+ uint src_grf5_dw5;
+ uint src_grf5_dw4;
+ uint src_grf5_dw3;
+ uint src_grf5_dw2;
+ uint src_grf5_dw1;
+ uint src_grf5_dw0;
+ uint src_grf6_dw7;
+ uint src_grf6_dw6;
+ uint src_grf6_dw5;
+ uint src_grf6_dw4;
+ uint src_grf6_dw3;
+ uint src_grf6_dw2;
+ uint src_grf6_dw1;
+ uint src_grf6_dw0;
+ uint src_grf7_dw7;
+ uint src_grf7_dw6;
+ uint src_grf7_dw5;
+ uint src_grf7_dw4;
+ uint src_grf7_dw3;
+ uint src_grf7_dw2;
+ uint src_grf7_dw1;
+ uint src_grf7_dw0;
+
+
+ //src_grf0_dw7 = Debug;
+ src_grf0_dw7 = 0;
+ //src_grf0_dw6 = Debug;
+ src_grf0_dw6 = 0;
+ //src_grf0_dw5 = (Ref_Height << 24) | (Ref_Width << 16) | (Ignored << 8) | (Dispatch_Id);
+ src_grf0_dw5 = 0;
+ //src_grf0_dw4 = Ignored;
+ src_grf0_dw4 = 0;
+ //src_grf0_dw3 = (Reserved << 31) | (Sub_Mb_Part_Mask << 24) | (Intra_SAD << 22)
+ src_grf0_dw3 = (0 << 31) | (0 << 24) | (payload.intra_sad_adjustment << 22)
+ //| (Inter_SAD << 20) | (BB_Skip_Enabled << 19) | (Reserverd << 18)
+ | (payload.skip_sad_adjustment << 20) | (0 << 19) | (0 << 18)
+ //| (Dis_Aligned_Src_Fetch << 17) | (Dis_Aligned_Ref_Fetch << 16) | (Dis_Field_Cache_Alloc << 15)
+ | (0 << 17) | (0 << 16) | (0 << 15)
+ //| (Skip_Type << 14) | (Sub_Pel_Mode << 12) | (Dual_Search_Path_Opt << 11)
+ | (0 << 14) | (0 << 12) | (0 << 11)
+ //| (Search_Ctrl << 8) | (Ref_Access << 7) | (SrcAccess << 6)
+ | (0 << 8) | (0 << 7) | (0 << 6)
+ //| (Mb_Type_Remap << 4) | (Reserved_Workaround << 3) | (Reserved_Workaround << 2)
+ | (0 << 4) | (0 << 3) | (0 << 2)
+ //| (Src_Size);
+ | (0);
+ src_grf0_dw3 |= payload.skip_block_partition_type;
+ //Block-Based Skip Enabled
+ if(payload.skip_block_partition_type == CLK_AVC_ME_SKIP_BLOCK_PARTITION_8x8_INTEL)
+ src_grf0_dw3 |= (1 << 19);
+ //src_grf0_dw2 = (SrcY << 16) | (SrcX);
+ src_grf0_dw2 = (payload.srcCoord.y << 16) | (payload.srcCoord.x);
+ //src_grf0_dw1 = (Ref1Y << 16) | (Ref1X);
+ src_grf0_dw1 = 0;
+ //src_grf0_dw0 = (Ref0Y << 16) | (Ref0X);
+ src_grf0_dw0 = 0;
+
+ //src_grf1_dw7 = (Skip_Center_Mask << 24) | (Reserved << 22) | (Ref1_Field_Polarity << 21)
+ src_grf1_dw7 = (0 << 24) | (0 << 22) | (0 << 21)
+ //| (Ref0_Field_Polarity << 20) | (Src_Field_Polarity << 19) | (Bilinear_Enable << 18)
+ | (0 << 20) | (0 << 19) | (0 << 18)
+ //| (MV_Cost_Scale_Factor << 16) | (Mb_Intra_Struct << 8) | (Intra_Corner_Swap << 7)
+ | (0 << 16) | (payload.intra_neighbour_availabilty << 8) | (0 << 7)
+ //| (Non_Skip_Mode_Added << 6) | (Non_Skip_ZMv_Added << 5) | (IntraPartMask);
+ | (0 << 6) | (0 << 5) | (payload.luma_intra_partition_mask);
+ src_grf1_dw7 |= payload.skip_motion_vector_mask;
+ //src_grf1_dw6 = Reserved;
+ src_grf1_dw6 = 0;
+ /*src_grf1_dw5 = (Cost_Center1Y << 16) | (Cost_Center1X);
+ src_grf1_dw4 = (Cost_Center0Y << 16) | (Cost_Center0X);
+ src_grf1_dw3 = (Ime_Too_Good << 24 ) | (Ime_Too_Bad << 16) | (Part_Tolerance_Thrhd << 8) | (FBPrunThrhd);*/
+ src_grf1_dw5 = 0;
+ src_grf1_dw4 = 0;
+ src_grf1_dw3 = 0;
+ //src_grf1_dw2 = (Start1Y << 28) | (Start1X << 24) | (Start0Y << 20)
+ //| (Start0X << 16) | (Max_Num_SU << 8) | (LenSP);
+ src_grf1_dw2 = 0;
+ /*src_grf1_dw1 = (RepartEn << 31) | (FBPrunEn << 30) | (AdaptiveValidationControl << 29)
+ | (Uni_Mix_Disable << 28) | (Bi_Sub_Mb_Part_Mask << 24) | (Reserverd << 22)
+ | (Bi_Weight << 16) | (Reserved << 6) | (MaxNumMVs);*/
+ src_grf1_dw1 = (0 << 24) | (payload.bidirectional_weight << 16) | (16);
+ /*src_grf1_dw0 = (Early_Ime_Stop << 24) | (Early_Fme_Success << 16) | (Skip_Success << 8)
+ | (T8x8_Flag_For_Inter_En << 7) | (Quit_Inter_En << 6) | (Early_Ime_Success_En << 5)
+ | (Early_Success_En << 4) | (Part_Candidate_En << 3) | (Bi_Mix_Dis << 2)
+ | (Adaptive_En << 1) | (SkipModeEn);*/
+ src_grf1_dw0 = 1;
+
+ //src_grf2_dw7 = SIC Forward Transform Coeff Threshold Matrix[3...6]
+ src_grf2_dw7 = 0;
+ //src_grf2_dw6 = SIC Forward Transform Coeff Threshold Matrix[0...2]
+ src_grf2_dw6 = 0;
+ //src_grf2_dw5 = (Reserved << 24) | (FBR_SubPredMode_Input << 16) | (FBR_SubMBShape_Input << 8) | (Reserved << 2) | (FBR_MbMode_Input);
+ src_grf2_dw5 = 0;
+ //XXX: TO DO: setting mv cost related bit filed
+ //src_grf2_dw4 = MV_4_Cost ... MV_7_Cost;
+ src_grf2_dw4 = 0;
+ //src_grf2_dw3 = MV_0_Cost ... MV_3_Cost;
+ src_grf2_dw3 = 0;
+ //src_grf2_dw2 = (Chroma_Intra_Mode_Cost << 24) | (RefID_Cost << 16) | (Mode_9_Cost << 8) | (Mode_8_Cost);
+ src_grf2_dw2 = 0;
+ //src_grf2_dw1 = Mode 4 Cost ... Mode 7 Cost
+ src_grf2_dw1 = 0;
+ //src_grf2_dw0 = (MODE_INTRA_4x4 << 24) | (MODE_INTRA_8x8 << 16) | (MODE_INTRA_16x16 << 8) | (MODE_INTRA_NONPRED);
+ src_grf2_dw0 = payload.intra_shape_cost;
+ /*
+ //src_grf3_dw7 = (BWDCostCenter3Y << 16) | (BWDCostCenter3X) ;
+ src_grf3_dw7 = payload.cc3 >> 32;
+ //src_grf3_dw6 = (FWDCostCenter3Y << 16) | (FWDCostCenter3X) ;
+ src_grf3_dw6 = payload.cc3;
+ //src_grf3_dw5 = (BWDCostCenter2Y << 16) | (BWDCostCenter2X) ;
+ src_grf3_dw5 = payload.cc2 >> 32;
+ //src_grf3_dw4 = (FWDCostCenter2Y << 16) | (FWDCostCenter2X) ;
+ src_grf3_dw4 = payload.cc2;
+ //src_grf3_dw3 = (BWDCostCenter1Y << 16) | (BWDCostCenter1X) ;
+ src_grf3_dw3 = payload.cc1 >> 32;
+ //src_grf3_dw2 = (FWDCostCenter1Y << 16) | (FWDCostCenter1X) ;
+ src_grf3_dw2 = payload.cc1;
+ //src_grf3_dw1 = (BWDCostCenter0Y << 16) | (BWDCostCenter0X) ;
+ src_grf3_dw1 = payload.cc0 >> 32;
+ //src_grf3_dw0 = (FWDCostCenter0Y << 16) | (FWDCostCenter0X) ;
+ src_grf3_dw0 = payload.cc0;*/
+ src_grf3_dw7 = 0;
+ src_grf3_dw6 = 0;
+ src_grf3_dw5 = 0;
+ src_grf3_dw4 = 0;
+ src_grf3_dw3 = 0;
+ src_grf3_dw2 = 0;
+ src_grf3_dw1 = 0;
+ src_grf3_dw0 = 0;
+
+ //Ref1/Ref0 SkipCenter 3...0 Delta XY
+ int2 bi_mv_temp = as_int2( payload.mv );
+ int2 bi_mv = intel_sub_group_shuffle(bi_mv_temp, 3);
+ src_grf4_dw7 = bi_mv.s1;
+ src_grf4_dw6 = bi_mv.s0;
+ bi_mv = intel_sub_group_shuffle(bi_mv_temp, 2);
+ src_grf4_dw5 = bi_mv.s1;
+ src_grf4_dw4 = bi_mv.s0;
+ bi_mv = intel_sub_group_shuffle(bi_mv_temp, 1);
+ src_grf4_dw3 = bi_mv.s1;
+ src_grf4_dw2 = bi_mv.s0;
+ bi_mv = intel_sub_group_shuffle(bi_mv_temp, 0);
+ src_grf4_dw1 = bi_mv.s1;
+ src_grf4_dw0 = bi_mv.s0;
+
+ //src_grf5_dw7 = Neighbor pixel Luma value [23, -1] to [20, -1];
+ src_grf5_dw7 = payload.ur_20_23;
+ //src_grf5_dw6 = Neighbor pixel Luma value [19, -1] to [16, -1];
+ src_grf5_dw6 = payload.ur_16_19;
+ //src_grf5_dw5 = Neighbor pixel Luma value [15, -1] to [12, -1];
+ src_grf5_dw5 = payload.u_12_15;
+ //src_grf5_dw4 = Neighbor pixel Luma value [11, -1] to [8, -1];
+ src_grf5_dw4 = payload.u_8_11;
+ //src_grf5_dw3 = Neighbor pixel Luma value [7, -1] to [4, -1];
+ src_grf5_dw3 = payload.u_4_7;
+ //src_grf5_dw2 = (Neighbor pixel Luma value [3, -1] << 24) | (Neighbor pixel Luma value [2, -1] << 16)
+ //| (Neighbor pixel Luma value [1, -1] << 8) | (Neighbor pixel Luma value [0, -1]);
+ src_grf5_dw2 = payload.u_0_3;
+ uchar mode_mask_16_16 = 0xf;
+ ushort mode_mask_8_8 = 0x01ff, mode_mask_4_4 = 0x01ff;
+ if(payload.luma_intra_partition_mask == CLK_AVC_ME_INTRA_LUMA_PARTITION_MASK_ALL_INTEL){
+ mode_mask_16_16 = 0;
+ mode_mask_8_8 = 0;
+ mode_mask_4_4 = 0;
+ }
+ else if(payload.luma_intra_partition_mask == CLK_AVC_ME_INTRA_LUMA_PARTITION_MASK_16x16_INTEL){
+ mode_mask_16_16 = 0;
+ }
+ else if(payload.luma_intra_partition_mask == CLK_AVC_ME_INTRA_LUMA_PARTITION_MASK_8x8_INTEL){
+ mode_mask_8_8 = 0;
+ }
+ else if(payload.luma_intra_partition_mask == CLK_AVC_ME_INTRA_LUMA_PARTITION_MASK_4x4_INTEL){
+ mode_mask_4_4 = 0;
+ }
+ //src_grf5_dw1 = (Corner_Neighbor_pixel_0 << 24) | (Reserved << 10) | (IntraComputeType << 8)
+ //| (IntraChromaModeMask << 4) | (Intra16x16ModeMask);
+ src_grf5_dw1 = (payload.upper_left_corner_luma_pixel << 24) | (0 << 10) | (1 << 8) | (0xf << 4) | (mode_mask_16_16);
+ //src_grf5_dw1 = (payload.upper_left_corner_luma_pixel << 24) | (0 << 10) | (1 << 8) | (0xf << 4) | (0xb);
+ //src_grf5_dw0 = (Reserved<<25) | (Intra_8x8_Mode_Mask << 16) | (Reserved<<9) | (Intra_4x4_Mode_Mask);
+ src_grf5_dw0 = (0<<25) | (mode_mask_8_8 << 16) | (0<<9) | (mode_mask_4_4);
+ //src_grf6_dw7 = (Reserved << 24) | (Penalty_4x4_non_DC << 16) | (Penalty_8x8_non_DC << 8) | (Penalty_16x16_non_DC);
+ src_grf6_dw7 = 0;
+ //src_grf6_dw6 = Reserved;
+ src_grf6_dw6 = 0;
+ //src_grf6_dw5 = (Reserved << 16) | (Neighbor pixel Chroma value CbCr pair [-1, -1]);
+ src_grf6_dw5 = 0;
+ //src_grf6_dw4 = (Intra_MxM_Pred_Mode_B15 << 28) | (Intra_MxM_Pred_Mode_B14 << 24) | (Intra_MxM_Pred_Mode_B11 << 20)
+ //| (Intra_MxM_Pred_Mode_B10 << 16) | (Intra_MxM_Pred_Mode_A15 << 12) | (Intra_MxM_Pred_Mode_A13 << 8)
+ //| (Intra_MxM_Pred_Mode_A7 << 4) | (Intra_MxM_Pred_Mode_A5);
+ //XXX: Which value should be set to?
+ src_grf6_dw4 = (2 << 28) | (2 << 24) | (2 << 20)
+ | (2 << 16) | (2 << 12) | (2 << 8)
+ | (2 << 4) | (2);
+ //src_grf6_dw3 = (Corner_Neighbor_pixel_1 << 24) | (Neighbor pixel Luma value [-1, 14] to [-1, 12]);
+ src_grf6_dw3 = payload.l_12_15;
+ //src_grf6_dw2 = Neighbor pixel Luma value [-1, 11] to [-1, 8];
+ src_grf6_dw2 = payload.l_8_11;
+ //src_grf6_dw1 = Neighbor pixel Luma value [-1, 7] to [-1, 4];
+ src_grf6_dw1 = payload.l_4_7;
+ //src_grf6_dw0 = (Neighbor pixel Luma value [-1, 3] << 24) | (Neighbor pixel Luma value [-1, 2] << 16)
+ //| (Neighbor pixel Luma value [-1, 1] << 8) | (Neighbor pixel Luma value [-1, 0]);
+ src_grf6_dw0 = payload.l_0_3;
+
+
+ //chroma related
+ src_grf7_dw7 = 0;
+ src_grf7_dw6 = 0;
+ src_grf7_dw5 = 0;
+ src_grf7_dw4 = 0;
+ src_grf7_dw3 = 0;
+ src_grf7_dw2 = 0;
+ src_grf7_dw1 = 0;
+ src_grf7_dw0 = 0;
+
+
+ intel_sub_group_avc_ref_result_t sic_result;
+ sic_result = __gen_ocl_ime(src_image, ref_image,
+ src_grf0_dw7, src_grf0_dw6, src_grf0_dw5, src_grf0_dw4,
+ src_grf0_dw3, src_grf0_dw2, src_grf0_dw1, src_grf0_dw0,
+ src_grf1_dw7, src_grf1_dw6, src_grf1_dw5, src_grf1_dw4,
+ src_grf1_dw3, src_grf1_dw2, src_grf1_dw1, src_grf1_dw0,
+ src_grf2_dw7, src_grf2_dw6, src_grf2_dw5, src_grf2_dw4,
+ src_grf2_dw3, src_grf2_dw2, src_grf2_dw1, src_grf2_dw0,
+ src_grf3_dw7, src_grf3_dw6, src_grf3_dw5, src_grf3_dw4,
+ src_grf3_dw3, src_grf3_dw2, src_grf3_dw1, src_grf3_dw0,
+ src_grf4_dw7, src_grf4_dw6, src_grf4_dw5, src_grf4_dw4,
+ src_grf4_dw3, src_grf4_dw2, src_grf4_dw1, src_grf4_dw0,
+ src_grf5_dw7, src_grf5_dw6, src_grf5_dw5, src_grf5_dw4,
+ src_grf5_dw3, src_grf5_dw2, src_grf5_dw1, src_grf5_dw0,
+ src_grf6_dw7, src_grf6_dw6, src_grf6_dw5, src_grf6_dw4,
+ src_grf6_dw3, src_grf6_dw2, src_grf6_dw1, src_grf6_dw0,
+ src_grf7_dw7, src_grf7_dw6, src_grf7_dw5, src_grf7_dw4,
+ src_grf7_dw3, src_grf7_dw2, src_grf7_dw1, src_grf7_dw0,
+ //msg_type
+ 1);
+
+ return sic_result;
+}
+
+intel_sub_group_avc_sic_payload_t
+intel_sub_group_avc_sic_configure_skc(uint skip_block_partition_type,
+ uint skip_motion_vector_mask,
+ ulong motion_vectors,
+ char bidirectional_weight,
+ uchar skip_sad_adjustment,
+ intel_sub_group_avc_sic_payload_t payload){
+ intel_sub_group_avc_sic_payload_t pl = payload;
+ pl.skip_block_partition_type = skip_block_partition_type;
+ pl.skip_motion_vector_mask = skip_motion_vector_mask;
+ pl.bidirectional_weight = bidirectional_weight;
+ pl.skip_sad_adjustment = skip_sad_adjustment;
+ pl.mv = motion_vectors;
+ return pl;
+}
+
+ushort
+intel_sub_group_avc_sic_get_inter_distortions(intel_sub_group_avc_sic_result_t result){
+ uint lid_x = get_sub_group_local_id();
+ uint write_back_dw = intel_sub_group_shuffle(result.s2, 8 + lid_x/2);
+ int start_bit = lid_x%2 * 16;
+ ushort distortion = (write_back_dw >> start_bit);
+ return distortion;
+}
+
+uchar
+intel_sub_group_avc_sic_get_ipe_luma_shape(intel_sub_group_avc_sic_result_t result){
+ uint write_back_dw00 = intel_sub_group_shuffle(result.s0, 0);
+ uchar luma_shape = write_back_dw00 & 0x03;
+ return luma_shape;
+}
+
+ushort
+intel_sub_group_avc_sic_get_best_ipe_luma_distortion(intel_sub_group_avc_sic_result_t result){
+ uint write_back_dw03 = intel_sub_group_shuffle(result.s0, 3);
+ ushort luma_distortion = write_back_dw03;
+ return luma_distortion;
+}
+
+ulong intel_sub_group_avc_sic_get_packed_ipe_luma_modes(intel_sub_group_avc_sic_result_t result){
+ uint write_back_dw00 = intel_sub_group_shuffle(result.s0, 0);
+ uchar luma_shape = write_back_dw00 & 0x03;
+ ulong luma_modes = 0;
+ uint write_back_dw04 = intel_sub_group_shuffle(result.s0, 4);
+ uint write_back_dw05 = intel_sub_group_shuffle(result.s0, 5);
+ if(luma_shape == CLK_AVC_ME_INTRA_16x16_INTEL)
+ luma_modes |= (write_back_dw04 & 0x03);
+ else if(luma_shape == CLK_AVC_ME_INTRA_8x8_INTEL){
+ ulong modes_temp = write_back_dw04;
+ luma_modes = (modes_temp & 0x0f) | ((modes_temp & 0x00f0) << 12) | ((modes_temp & 0x0f00) << 24) | ((modes_temp & 0x0000f000) << 36);
+ }
+ else if(luma_shape == CLK_AVC_ME_INTRA_4x4_INTEL){
+ ulong modes_temp = write_back_dw05;
+ luma_modes = (modes_temp << 32) | (write_back_dw04 & 0x00000000ffffffff);
+ }
+ return luma_modes;
+}
+
bool __gen_ocl_in_local(size_t p) {
bool cond1 = p > 0;
bool cond2 = p < 64*1024;
diff --git a/backend/src/llvm/llvm_gen_backend.cpp b/backend/src/llvm/llvm_gen_backend.cpp
index 9954021..c89f5e4 100644
--- a/backend/src/llvm/llvm_gen_backend.cpp
+++ b/backend/src/llvm/llvm_gen_backend.cpp
@@ -4021,6 +4021,7 @@ namespace gbe
case GEN_OCL_SIMD_ID:
case GEN_OCL_SIMD_SHUFFLE:
case GEN_OCL_VME:
+ case GEN_OCL_IME:
case GEN_OCL_WORK_GROUP_ALL:
case GEN_OCL_WORK_GROUP_ANY:
case GEN_OCL_WORK_GROUP_BROADCAST:
@@ -4926,6 +4927,41 @@ namespace gbe
lut_sub_x.getIntegerValue());
break;
}
+ case GEN_OCL_IME:
+ {
+
+ const uint8_t imageID = getImageID(I);
+
+ AI++;
+ AI++;
+
+ Constant *msg_type_cpv = dyn_cast<Constant>(*(AI + 64));
+ assert(msg_type_cpv);
+ const ir::Immediate &msg_type_x = processConstantImm(msg_type_cpv);
+ int msg_type = msg_type_x.getIntegerValue();
+ // msy_type (00: IDM [BDW+], 01: SIC, 10: IME, 11: FBR)
+ GBE_ASSERT(msg_type == 1 || msg_type == 2 || msg_type == 3);
+ uint32_t src_length = ((msg_type == 1 || msg_type == 3) ? 64 : 48);
+
+ vector<ir::Register> dstTupleData, srcTupleData;
+ for (uint32_t i = 0; i < src_length; i++, AI++){
+ srcTupleData.push_back(this->getRegister(*AI));
+ }
+
+ const ir::Tuple srcTuple = ctx.arrayTuple(&srcTupleData[0], src_length);
+
+ uint32_t dst_length;
+ dst_length = 7;
+ for (uint32_t elemID = 0; elemID < dst_length; ++elemID) {
+ const ir::Register reg = this->getRegister(&I, elemID);
+ dstTupleData.push_back(reg);
+ }
+ const ir::Tuple dstTuple = ctx.arrayTuple(&dstTupleData[0], dst_length);
+
+ ctx.IME(imageID, dstTuple, srcTuple, dst_length, src_length,
+ msg_type);
+ break;
+ }
case GEN_OCL_IN_PRIVATE:
{
const ir::Register dst = this->getRegister(&I);
diff --git a/backend/src/llvm/llvm_gen_ocl_function.hxx b/backend/src/llvm/llvm_gen_ocl_function.hxx
index d3802d2..a9873ca 100644
--- a/backend/src/llvm/llvm_gen_ocl_function.hxx
+++ b/backend/src/llvm/llvm_gen_ocl_function.hxx
@@ -179,6 +179,7 @@ DECL_LLVM_GEN_FUNCTION(REGION, __gen_ocl_region)
DECL_LLVM_GEN_FUNCTION(IN_PRIVATE, __gen_ocl_in_private)
DECL_LLVM_GEN_FUNCTION(VME, __gen_ocl_vme)
+DECL_LLVM_GEN_FUNCTION(IME, __gen_ocl_ime)
// printf function
DECL_LLVM_GEN_FUNCTION(PRINTF, __gen_ocl_printf_stub)
diff --git a/backend/src/llvm/llvm_scalarize.cpp b/backend/src/llvm/llvm_scalarize.cpp
index be3d549..2d8d7ba 100644
--- a/backend/src/llvm/llvm_scalarize.cpp
+++ b/backend/src/llvm/llvm_scalarize.cpp
@@ -717,6 +717,7 @@ namespace gbe {
break;
}
case GEN_OCL_VME:
+ case GEN_OCL_IME:
case GEN_OCL_SUB_GROUP_BLOCK_READ_UI_MEM2:
case GEN_OCL_SUB_GROUP_BLOCK_READ_UI_MEM4:
case GEN_OCL_SUB_GROUP_BLOCK_READ_UI_MEM8:
diff --git a/kernels/compiler_block_motion_estimate_intel.cl b/kernels/compiler_block_motion_estimate_intel.cl
new file mode 100644
index 0000000..94966a8
--- /dev/null
+++ b/kernels/compiler_block_motion_estimate_intel.cl
@@ -0,0 +1,76 @@
+
+__kernel __attribute__((intel_reqd_sub_group_size(16)))
+void compiler_block_motion_estimate_intel(
+ __read_only image2d_t src_img,
+ __read_only image2d_t ref_img,
+ __global short2* motion_vector_buffer,
+ __global ushort* residuals_buffer,
+ __global uchar* mj_shape_buffer,
+ __global uchar* mn_shapes_buffer,
+ __global uchar* directions_buffer,
+ __global uint* dwo_buffer,
+ __global uint* pld_buffer) {
+
+ int gr_id0 = get_group_id(0);
+ int gr_id1 = get_group_id(1);
+
+ ushort2 src_coord = 0;
+ src_coord.x = gr_id0 * 16;
+ src_coord.y = gr_id1 * 16;
+ uchar partition_mask = CLK_AVC_ME_PARTITION_MASK_16x16_INTEL;
+ uchar sad_adjustment = CLK_AVC_ME_SAD_ADJUST_MODE_NONE_INTEL;
+ intel_sub_group_avc_ime_payload_t payload = intel_sub_group_avc_ime_initialize(src_coord, partition_mask, sad_adjustment);
+ short2 ref_offset = 0;
+ uchar search_window_config = CLK_AVC_ME_SEARCH_WINDOW_EXTRA_TINY_INTEL;
+ payload = intel_sub_group_avc_ime_set_single_reference(ref_offset, search_window_config, payload);
+
+ //mv cost penalty setting
+ ulong packed_cc_delta = 0;
+ uint2 packed_cost_table = intel_sub_group_avc_mce_get_default_medium_penalty_cost_table();
+ uchar cost_precision = CLK_AVC_ME_COST_PRECISION_QPEL_INTEL;
+ payload = intel_sub_group_avc_ime_set_motion_vector_cost_function(
+ packed_cc_delta, packed_cost_table, cost_precision, payload);
+
+ //ime shape penalty
+ ulong packed_shape_cost = (1 << 4 | 2);
+ packed_shape_cost <<= 32;
+ payload = intel_sub_group_avc_ime_set_inter_shape_penalty(packed_shape_cost ,payload);
+
+ sampler_t vs = 0;
+ intel_sub_group_avc_ime_result_t i_result =
+ intel_sub_group_avc_ime_evaluate_with_single_reference(src_img, ref_img, vs, payload);
+
+ //Get ime related result
+ ulong mvs = intel_sub_group_avc_ime_get_motion_vectors(i_result);
+ ushort distortions = intel_sub_group_avc_ime_get_inter_distortions(i_result);
+ uchar major_shape = intel_sub_group_avc_ime_get_inter_major_shape(i_result);
+ uchar minor_shapes = intel_sub_group_avc_ime_get_inter_minor_shapes(i_result);
+ uchar directions = intel_sub_group_avc_ime_get_inter_directions(i_result);
+
+ int lid_x = get_local_id(0);
+ int mb = gr_id0 + gr_id1 * get_num_groups(0);
+ int2 bi_mvs = as_int2(mvs);
+ if(lid_x == 0){
+ motion_vector_buffer[mb] = as_short2(bi_mvs.s0);
+ residuals_buffer[mb] = distortions;
+ mj_shape_buffer[mb] = major_shape;
+ mn_shapes_buffer[mb] = minor_shapes;
+ directions_buffer[mb] = directions;
+ }
+ //fme setting and evaluate
+ uchar pixel_mode = CLK_AVC_ME_SUBPIXEL_MODE_QPEL_INTEL;
+ intel_sub_group_avc_ref_payload_t r_payload =
+ intel_sub_group_avc_fme_initialize(
+ src_coord, mvs, major_shape, minor_shapes,
+ directions, pixel_mode, sad_adjustment);
+ intel_sub_group_avc_ref_result_t r_result =
+ intel_sub_group_avc_ref_evaluate_with_single_reference(src_img, ref_img, vs, r_payload);
+ mvs = intel_sub_group_avc_ref_get_motion_vectors(r_result);
+ distortions = intel_sub_group_avc_ref_get_inter_distortions(r_result);
+
+ dwo_buffer[mb*16*4 + lid_x + 16*0] = i_result.s0;
+ dwo_buffer[mb*16*4 + lid_x + 16*1] = i_result.s1;
+ dwo_buffer[mb*16*4 + lid_x + 16*2] = i_result.s2;
+ dwo_buffer[mb*16*4 + lid_x + 16*3] = i_result.s3;
+
+}
diff --git a/kernels/compiler_intra_prediction.cl b/kernels/compiler_intra_prediction.cl
new file mode 100644
index 0000000..28e81e5
--- /dev/null
+++ b/kernels/compiler_intra_prediction.cl
@@ -0,0 +1,91 @@
+
+__kernel __attribute__((intel_reqd_sub_group_size(16)))
+void compiler_intra_prediction(
+ __read_only image2d_t srcImg,
+ __global uchar *luma_mode,
+ __global ushort *luma_distortion,
+ __global uchar *luma_shape,
+ __global uint* dwo_buffer,
+ __global uint* pld_buffer){
+
+ int gr_id0 = get_group_id(0);
+ int gr_id1 = get_group_id(1);
+
+ ushort2 src_coord;
+ /*src_coord.x = gr_id0 * 16;
+ src_coord.y = gr_id1 * 16;*/
+ src_coord.x = 2 * 16;
+ src_coord.y = 1 * 16;
+
+ intel_sub_group_avc_sic_payload_t payload = intel_sub_group_avc_sic_initialize(src_coord);
+
+ uchar sad_adjustment = CLK_AVC_ME_SAD_ADJUST_MODE_NONE_INTEL;
+ uchar intra_partition_mask = CLK_AVC_ME_INTRA_LUMA_PARTITION_MASK_16x16_INTEL;
+//XXX: Different from official value?
+#undef CLK_AVC_ME_INTRA_NEIGHBOR_UPPER_RIGHT_MASK_ENABLE_INTEL
+#undef CLK_AVC_ME_INTRA_NEIGHBOR_UPPER_LEFT_MASK_ENABLE_INTEL
+#define CLK_AVC_ME_INTRA_NEIGHBOR_UPPER_RIGHT_MASK_ENABLE_INTEL 0x4
+#define CLK_AVC_ME_INTRA_NEIGHBOR_UPPER_LEFT_MASK_ENABLE_INTEL 0x8
+ uint nb_avail = CLK_AVC_ME_INTRA_NEIGHBOR_LEFT_MASK_ENABLE_INTEL |
+ CLK_AVC_ME_INTRA_NEIGHBOR_UPPER_MASK_ENABLE_INTEL |
+ CLK_AVC_ME_INTRA_NEIGHBOR_UPPER_LEFT_MASK_ENABLE_INTEL |
+ CLK_AVC_ME_INTRA_NEIGHBOR_UPPER_RIGHT_MASK_ENABLE_INTEL;
+
+ uint sgl_id = get_sub_group_local_id();
+ int2 nb_coord;
+ float4 color;
+
+ nb_coord.x = src_coord.x - 1;
+ nb_coord.y = src_coord.y + sgl_id;
+ color = read_imagef(srcImg, nb_coord);
+ uchar left_edge = color.s0 * 255;
+
+ nb_coord.x = src_coord.x - 1;
+ nb_coord.y = src_coord.y - 1;
+ color = read_imagef(srcImg, nb_coord);
+ uchar upper_left_corner = color.s0 * 255;
+
+ nb_coord.x = src_coord.x + sgl_id;
+ nb_coord.y = src_coord.y - 1;
+ color = read_imagef(srcImg, nb_coord);
+ uchar upper_edge = color.s0 * 255;
+
+ uchar upper_right_edge = 0;
+ if(sgl_id < 8){
+ nb_coord.x = src_coord.x + 16 + sgl_id;
+ nb_coord.y = src_coord.y - 1;
+ color = read_imagef(srcImg, nb_coord);
+ upper_right_edge = color.s0 * 255;
+ }
+ payload = intel_sub_group_avc_sic_configure_ipe(
+ intra_partition_mask, nb_avail, left_edge, upper_left_corner, upper_edge,
+ upper_right_edge, sad_adjustment, payload);
+
+ uchar shape_cost_16_16 = (1 << 4) | 5;
+ uchar shape_cost_8_8 = (1 << 4) | 4;
+ uchar shape_cost_4_4 = (1 << 4) | 3;
+ uint intra_shape_cost = (shape_cost_4_4 << 24) | (shape_cost_8_8 << 16) | (shape_cost_16_16 << 8) | (0x0);
+ payload = intel_sub_group_avc_sic_set_intra_luma_shape_penalty(intra_shape_cost, payload);
+
+ sampler_t vs = 0;
+ intel_sub_group_avc_sic_result_t result =
+ intel_sub_group_avc_sic_evaluate_ipe(srcImg, vs, payload);
+
+ uchar shape = intel_sub_group_avc_sic_get_ipe_luma_shape(result);
+ ushort dist = intel_sub_group_avc_sic_get_best_ipe_luma_distortion(result);
+ ulong modes = intel_sub_group_avc_sic_get_packed_ipe_luma_modes(result);
+
+ int lid_x = get_local_id(0);
+ int mb_idx = gr_id0 + gr_id1 * get_num_groups(0);
+ if (lid_x == 0) {
+ luma_shape[mb_idx] = shape;
+ luma_distortion[mb_idx] = dist;
+ uchar mode = modes & 0xF;
+ luma_mode[mb_idx] = mode;
+ }
+
+ dwo_buffer[mb_idx*16*4 + lid_x + 16*0] = result.s0;
+ dwo_buffer[mb_idx*16*4 + lid_x + 16*1] = result.s1;
+ dwo_buffer[mb_idx*16*4 + lid_x + 16*2] = result.s2;
+ dwo_buffer[mb_idx*16*4 + lid_x + 16*3] = result.s3;
+}
diff --git a/kernels/compiler_skip_check.cl b/kernels/compiler_skip_check.cl
new file mode 100644
index 0000000..68369dd
--- /dev/null
+++ b/kernels/compiler_skip_check.cl
@@ -0,0 +1,53 @@
+__kernel __attribute__((intel_reqd_sub_group_size(16)))
+void compiler_skip_check(__read_only image2d_t src_img,
+ __read_only image2d_t ref_img,
+ __global short2 *motion_vector_buffer,
+ __global ushort *residual_buffer,
+ __global uint* dwo_buffer,
+ __global uint* pld_buffer){
+ sampler_t vs = 0;
+
+ int gr_id0 = get_group_id(0);
+ int gr_id1 = get_group_id(1);
+
+ ushort2 src_coord;
+
+ src_coord.x = gr_id0 * 16;
+ src_coord.y = gr_id1 * 16;
+
+ intel_sub_group_avc_sic_payload_t payload =
+ intel_sub_group_avc_sic_initialize(src_coord);
+
+ //Ignore in unidirectional, so just set to 0
+ uchar bidir_weight = 0;
+
+ uint skip_block_partition_type = CLK_AVC_ME_SKIP_BLOCK_PARTITION_16x16_INTEL;
+ uint skip_motion_vector_mask = CLK_AVC_ME_SKIP_BLOCK_16x16_FORWARD_ENABLE_INTEL;
+ uchar skip_sad_adjustment = CLK_AVC_ME_SAD_ADJUST_MODE_NONE_INTEL;
+
+ uint2 bi_mv;
+ int mb_idx = gr_id0 + gr_id1 * get_num_groups(0);
+ short2 input_mv = motion_vector_buffer[mb_idx];
+ bi_mv.s0 = as_uint(input_mv);
+
+ ulong mv = as_ulong(bi_mv);
+
+ payload = intel_sub_group_avc_sic_configure_skc(
+ skip_block_partition_type, skip_motion_vector_mask, mv, bidir_weight,
+ skip_sad_adjustment, payload);
+
+ intel_sub_group_avc_sic_result_t result =
+ intel_sub_group_avc_sic_evaluate_with_single_reference(
+ src_img, ref_img, vs, payload);
+
+ ushort distortion = intel_sub_group_avc_sic_get_inter_distortions(result);
+
+ int lid_x = get_local_id(0);
+ if(lid_x == 0)
+ residual_buffer[mb_idx] = distortion;
+ dwo_buffer[mb_idx*16*4 + lid_x + 16*0] = result.s0;
+ dwo_buffer[mb_idx*16*4 + lid_x + 16*1] = result.s1;
+ dwo_buffer[mb_idx*16*4 + lid_x + 16*2] = result.s2;
+ dwo_buffer[mb_idx*16*4 + lid_x + 16*3] = result.s3;
+
+}
diff --git a/src/cl_command_queue.c b/src/cl_command_queue.c
index 55b1a23..43ff8fe 100644
--- a/src/cl_command_queue.c
+++ b/src/cl_command_queue.c
@@ -154,6 +154,13 @@ cl_command_queue_bind_image(cl_command_queue queue, cl_kernel k, cl_gpgpu gpgpu,
image->intel_fmt, image->image_type, image->bpp,
image->w, image->h, image->depth,
image->row_pitch, image->slice_pitch, (cl_gpgpu_tiling)image->tiling);
+ //We always setup media surface state, so this surface can be used for vme
+ else if( (image->fmt.image_channel_order == CL_R) && (image->fmt.image_channel_data_type == CL_UNORM_INT8) )
+ cl_gpgpu_bind_image_for_vme(gpgpu, k->images[i].idx + BTI_WORKAROUND_IMAGE_OFFSET, image->base.bo,
+ image->offset + k->args[id].mem->offset,
+ image->intel_fmt, image->image_type, image->bpp,
+ image->w, image->h, image->depth,
+ image->row_pitch, image->slice_pitch, (cl_gpgpu_tiling)image->tiling);
}
return CL_SUCCESS;
}
diff --git a/src/cl_device_id.c b/src/cl_device_id.c
index 50ed0d9..cf3921a 100644
--- a/src/cl_device_id.c
+++ b/src/cl_device_id.c
@@ -556,6 +556,7 @@ skl_gt1_break:
#endif
cl_intel_platform_get_default_extension(ret);
cl_intel_platform_enable_extension(ret, cl_khr_fp16_ext_id);
+ cl_intel_platform_enable_extension(ret, cl_intel_device_side_avc_motion_estimation_ext_id);
break;
case PCI_CHIP_SKYLAKE_ULT_GT2:
@@ -581,6 +582,7 @@ skl_gt2_break:
#endif
cl_intel_platform_get_default_extension(ret);
cl_intel_platform_enable_extension(ret, cl_khr_fp16_ext_id);
+ cl_intel_platform_enable_extension(ret, cl_intel_device_side_avc_motion_estimation_ext_id);
break;
case PCI_CHIP_SKYLAKE_ULT_GT3:
@@ -600,6 +602,7 @@ skl_gt3_break:
cl_intel_platform_enable_extension(ret, cl_khr_fp64_ext_id);
#endif
cl_intel_platform_enable_extension(ret, cl_khr_fp16_ext_id);
+ cl_intel_platform_enable_extension(ret, cl_intel_device_side_avc_motion_estimation_ext_id);
break;
case PCI_CHIP_SKYLAKE_DT_GT4:
@@ -619,6 +622,7 @@ skl_gt4_break:
#endif
cl_intel_platform_get_default_extension(ret);
cl_intel_platform_enable_extension(ret, cl_khr_fp16_ext_id);
+ cl_intel_platform_enable_extension(ret, cl_intel_device_side_avc_motion_estimation_ext_id);
break;
case PCI_CHIP_BROXTON_0:
@@ -799,6 +803,7 @@ kbl_gt4_break:
LOCAL cl_self_test_res
cl_self_test(cl_device_id device, cl_self_test_res atomic_in_l3_flag)
{
+ /*
cl_int status;
cl_context ctx;
cl_command_queue queue;
@@ -868,7 +873,8 @@ cl_self_test(cl_device_id device, cl_self_test_res atomic_in_l3_flag)
clReleaseCommandQueue(queue);
}
clReleaseContext(ctx);
- return ret;
+ return ret;*/
+ return SELF_TEST_PASS;
}
LOCAL cl_int
diff --git a/src/cl_extensions.c b/src/cl_extensions.c
index a3c71ca..2ea3c0b 100644
--- a/src/cl_extensions.c
+++ b/src/cl_extensions.c
@@ -69,7 +69,7 @@ check_intel_extension(cl_extensions_t *extensions)
{
int id;
for(id = INTEL_EXT_START_ID; id <= INTEL_EXT_END_ID; id++)
- if(id != EXT_ID(intel_motion_estimation))
+ if(id != EXT_ID(intel_motion_estimation) || id != EXT_ID(intel_device_side_avc_motion_estimation))
extensions->extensions[id].base.ext_enabled = 1;
}
diff --git a/src/cl_extensions.h b/src/cl_extensions.h
index af0855e..a13edbb 100644
--- a/src/cl_extensions.h
+++ b/src/cl_extensions.h
@@ -31,7 +31,8 @@
DECL_EXT(intel_subgroups) \
DECL_EXT(intel_subgroups_short) \
DECL_EXT(intel_media_block_io) \
- DECL_EXT(intel_planar_yuv)
+ DECL_EXT(intel_planar_yuv) \
+ DECL_EXT(intel_device_side_avc_motion_estimation)
#define DECL_GL_EXTENSIONS \
DECL_EXT(khr_gl_sharing)\
@@ -66,7 +67,7 @@ cl_khr_extension_id_max
#define OPT1_EXT_START_ID EXT_ID(khr_int64_base_atomics)
#define OPT1_EXT_END_ID EXT_ID(khr_icd)
#define INTEL_EXT_START_ID EXT_ID(intel_accelerator)
-#define INTEL_EXT_END_ID EXT_ID(intel_planar_yuv)
+#define INTEL_EXT_END_ID EXT_ID(intel_device_side_avc_motion_estimation)
#define GL_EXT_START_ID EXT_ID(khr_gl_sharing)
#define GL_EXT_END_ID EXT_ID(khr_gl_msaa_sharing)
diff --git a/src/intel/intel_gpgpu.c b/src/intel/intel_gpgpu.c
index 041938f..0b0e7be 100644
--- a/src/intel/intel_gpgpu.c
+++ b/src/intel/intel_gpgpu.c
@@ -1337,6 +1337,75 @@ intel_gpgpu_bind_image_for_vme_gen7(intel_gpgpu_t *gpgpu,
assert(index < GEN_MAX_SURFACES);
}
+static void
+intel_gpgpu_bind_image_for_vme_gen9(intel_gpgpu_t *gpgpu,
+ uint32_t index,
+ dri_bo* obj_bo,
+ uint32_t obj_bo_offset,
+ uint32_t format,
+ cl_mem_object_type type,
+ uint32_t bpp,
+ int32_t w,
+ int32_t h,
+ int32_t depth,
+ int32_t pitch,
+ int32_t slice_pitch,
+ int32_t tiling)
+{
+ surface_heap_t *heap = gpgpu->aux_buf.bo->virtual + gpgpu->aux_offset.surface_heap_offset;
+ gen9_media_surface_state_t *ss = (gen9_media_surface_state_t *) &heap->surface[index * sizeof(gen8_surface_state_t)];
+
+ memset(ss, 0, sizeof(gen8_surface_state_t));
+ ss->ss0.rotation = 0; //++
+ ss->ss1.uv_offset_v_direction = 0;
+ ss->ss1.pic_struct = 0;
+ ss->ss1.width = w - 1;
+ ss->ss1.height = h - 1;
+ if (tiling == GPGPU_NO_TILE) {
+ ss->ss2.tile_mode = 0;
+ }
+ else if (tiling == GPGPU_TILE_X){
+ ss->ss2.tile_mode = 2;
+ }
+ else if (tiling == GPGPU_TILE_Y){
+ ss->ss2.tile_mode = 3;
+ }
+ ss->ss2.half_pitch_for_chroma = 0;
+ ss->ss2.surface_pitch = pitch - 1;
+ ss->ss2.address_control = 1; //++ CLAMP: 0; MIRROR:1;
+ ss->ss2.mem_compress_enable = 0; //++
+ ss->ss2.mem_compress_mode = 0; //++
+ ss->ss2.uv_offset_v_direction_msb = 0; //++
+ ss->ss2.uv_offset_u_direction = 0; //++
+ ss->ss2.interleave_chroma = 0;
+ ss->ss2.surface_format = 12; //Y8_UNORM
+ //ss->ss2.surface_format = 4; //PLANAR_420_8
+ ss->ss3.y_offset_for_u = 0;
+ ss->ss3.x_offset_for_u = 0;
+ ss->ss4.y_offset_for_v = 0;
+ ss->ss4.x_offset_for_v = 0;
+ ss->ss5.surface_object_control_state = cl_gpgpu_get_cache_ctrl();
+ ss->ss5.tiled_res_mode = 0; //++ TRMODE_NONE: 0; TRMODE_TILEYF: 1; TRMODE_TILEYS:2
+ ss->ss5.vert_line_stride_offset = 0; //++
+ ss->ss5.vert_line_stride = 0; //++
+ ss->ss6.base_addr = (obj_bo->offset64 + obj_bo_offset) & 0xffffffff; //
+ ss->ss7.base_addr_high = ((obj_bo->offset64 + obj_bo_offset) >> 32) & 0xffffffff; //
+
+
+ heap->binding_table[index] = offsetof(surface_heap_t, surface) +
+ index * surface_state_sz;
+ dri_bo_emit_reloc(gpgpu->aux_buf.bo,
+ I915_GEM_DOMAIN_RENDER,
+ I915_GEM_DOMAIN_RENDER,
+ obj_bo_offset,
+ gpgpu->aux_offset.surface_heap_offset +
+ heap->binding_table[index] +
+ offsetof(gen9_media_surface_state_t, ss6),
+ obj_bo);
+
+ assert(index < GEN_MAX_SURFACES);
+}
+
static void
intel_gpgpu_bind_image_gen75(intel_gpgpu_t *gpgpu,
@@ -2562,6 +2631,7 @@ intel_set_gpgpu_callbacks(int device_id)
}
if (IS_GEN9(device_id)) {
cl_gpgpu_bind_image = (cl_gpgpu_bind_image_cb *) intel_gpgpu_bind_image_gen9;
+ cl_gpgpu_bind_image_for_vme = (cl_gpgpu_bind_image_cb *) intel_gpgpu_bind_image_for_vme_gen9;
intel_gpgpu_set_L3 = intel_gpgpu_set_L3_gen8;
cl_gpgpu_get_cache_ctrl = (cl_gpgpu_get_cache_ctrl_cb *)intel_gpgpu_get_cache_ctrl_gen9;
intel_gpgpu_get_scratch_index = intel_gpgpu_get_scratch_index_gen8;
diff --git a/src/intel/intel_structs.h b/src/intel/intel_structs.h
index b38cc42..282929d 100644
--- a/src/intel/intel_structs.h
+++ b/src/intel/intel_structs.h
@@ -425,6 +425,69 @@ typedef struct gen7_media_surface_state
} ss7;
} gen7_media_surface_state_t;
+typedef struct gen9_media_surface_state
+{
+ struct {
+ uint32_t pad3:12;
+ uint32_t pad2:4;
+ uint32_t pad1:11; //ExistsIf [Surface Format] is not one of Planar Formats
+ uint32_t rotation:2;
+ } ss0;
+
+ struct {
+ uint32_t uv_offset_v_direction:2;
+ uint32_t pic_struct:2;
+ uint32_t width:14;
+ uint32_t height:14;
+ } ss1;
+
+ struct {
+ uint32_t tile_mode:2;
+ uint32_t half_pitch_for_chroma:1;
+ uint32_t surface_pitch:18;
+ uint32_t address_control:1;
+ uint32_t mem_compress_enable:1;
+ uint32_t mem_compress_mode:1;
+ uint32_t uv_offset_v_direction_msb:1;
+ uint32_t uv_offset_u_direction:1;
+ uint32_t interleave_chroma:1;
+ uint32_t surface_format:5;
+ } ss2;
+
+ struct {
+ uint32_t y_offset_for_u:14;
+ uint32_t pad1:2;
+ uint32_t x_offset_for_u:14;
+ uint32_t pad0:2;
+ } ss3;
+
+ struct {
+ uint32_t y_offset_for_v:15;
+ uint32_t pad1:1;
+ uint32_t x_offset_for_v:14;
+ uint32_t pad0:2;
+ } ss4;
+
+ struct {
+ uint32_t surface_object_control_state:7;
+ uint32_t pad2:11;
+ uint32_t tiled_res_mode:2;
+ uint32_t pad1:4;
+ uint32_t pad0:6;
+ uint32_t vert_line_stride_offset:1;
+ uint32_t vert_line_stride:1;
+ } ss5;
+
+ struct {
+ uint32_t base_addr;
+ } ss6;
+
+ struct {
+ uint32_t base_addr_high:16;
+ uint32_t pad0:16;
+ } ss7;
+} gen9_media_surface_state_t;
+
typedef union gen_surface_state
{
gen7_surface_state_t gen7_surface_state;
diff --git a/utests/CMakeLists.txt b/utests/CMakeLists.txt
index b7ef742..2a3009e 100644
--- a/utests/CMakeLists.txt
+++ b/utests/CMakeLists.txt
@@ -300,7 +300,8 @@ set (utests_sources
compiler_generic_pointer.cpp
runtime_pipe_query.cpp
compiler_pipe_builtin.cpp
- compiler_device_enqueue.cpp)
+ compiler_device_enqueue.cpp
+ compiler_intra_prediction.cpp)
if (LLVM_VERSION_NODOT VERSION_GREATER 34)
SET(utests_sources
@@ -320,6 +321,13 @@ if (NOT_BUILD_STAND_ALONE_UTEST)
endif (X11_FOUND)
endif (NOT_BUILD_STAND_ALONE_UTEST)
+if (NOT_BUILD_STAND_ALONE_UTEST)
+ SET(utests_sources
+ ${utests_sources}
+ compiler_block_motion_estimate_intel.cpp
+ compiler_skip_check.cpp)
+endif (NOT_BUILD_STAND_ALONE_UTEST)
+
if (CMRT_FOUND)
SET(utests_sources
${utests_sources}
diff --git a/utests/compiler_block_motion_estimate_intel.cpp b/utests/compiler_block_motion_estimate_intel.cpp
new file mode 100644
index 0000000..8dfb774
--- /dev/null
+++ b/utests/compiler_block_motion_estimate_intel.cpp
@@ -0,0 +1,156 @@
+#include "utest_helper.hpp"
+#include <string.h>
+
+void compiler_block_motion_estimate_intel(void)
+{
+ if (!cl_check_device_side_avc_motion_estimation()) {
+ return;
+ }
+
+ OCL_CREATE_KERNEL("compiler_block_motion_estimate_intel");
+
+ const size_t w = 80;
+ const size_t h = 48;
+ const size_t mv_w = (w + 15) / 16;
+ const size_t mv_h = (h + 15) / 16;
+
+ cl_image_format format;
+ cl_image_desc desc;
+
+ memset(&desc, 0x0, sizeof(cl_image_desc));
+ memset(&format, 0x0, sizeof(cl_image_format));
+
+ uint8_t* image_data1 = (uint8_t *)malloc(w * h); //src
+ uint8_t* image_data2 = (uint8_t *)malloc(w * h); //ref
+ for (size_t j = 0; j < h; j++) {
+ for (size_t i = 0; i < w; i++) {
+ if (i >= 32 && i <= 47 && j >= 16 && j <= 31)
+ image_data1[w * j + i] = 100;
+ else
+ image_data1[w * j + i] = 0;
+ if (i >= 30 && i <= 45 && j >= 18 && j <= 33)
+ image_data2[w * j + i] = 98;
+ else
+ image_data2[w * j + i] = 0;
+ }
+ }
+
+ format.image_channel_order = CL_R;
+ format.image_channel_data_type = CL_UNORM_INT8;
+ desc.image_type = CL_MEM_OBJECT_IMAGE2D;
+ desc.image_width = w;
+ desc.image_height = h;
+ desc.image_row_pitch = 0;
+ OCL_CREATE_IMAGE(buf[0], CL_MEM_COPY_HOST_PTR, &format, &desc, image_data1); //src
+ OCL_CREATE_IMAGE(buf[1], CL_MEM_COPY_HOST_PTR, &format, &desc, image_data2); //ref
+
+ OCL_CREATE_BUFFER(buf[2], 0, mv_w * mv_h * sizeof(int16_t) * 2, NULL);
+ OCL_CREATE_BUFFER(buf[3], 0, mv_w * mv_h * sizeof(uint16_t), NULL);
+ OCL_CREATE_BUFFER(buf[4], 0, mv_w * mv_h * sizeof(uint8_t), NULL);
+ OCL_CREATE_BUFFER(buf[5], 0, mv_w * mv_h * sizeof(uint8_t), NULL);
+ OCL_CREATE_BUFFER(buf[6], 0, mv_w * mv_h * sizeof(uint8_t), NULL);
+ OCL_CREATE_BUFFER(buf[7], 0, mv_w * mv_h * sizeof(uint32_t) * 16 * 8, NULL);
+ OCL_CREATE_BUFFER(buf[8], 0, mv_w * mv_h * sizeof(uint32_t) * 8 * 8, NULL);
+
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+ OCL_SET_ARG(3, sizeof(cl_mem), &buf[3]);
+ OCL_SET_ARG(4, sizeof(cl_mem), &buf[4]);
+ OCL_SET_ARG(5, sizeof(cl_mem), &buf[5]);
+ OCL_SET_ARG(6, sizeof(cl_mem), &buf[6]);
+ OCL_SET_ARG(7, sizeof(cl_mem), &buf[7]);
+ OCL_SET_ARG(8, sizeof(cl_mem), &buf[8]);
+
+ globals[0] = w;
+ globals[1] = h / 16;
+ locals[0] = 16;
+ locals[1] = 1;
+ OCL_NDRANGE(2);
+
+ int16_t expected[] = {-8, -8, //S13.2 fixed point value
+ -8, -8,
+ -8, -8,
+ -8, -8,
+ -8, -8,
+ -8, -8,
+ -8, -8,
+ -8, 4,
+ -8, -8,
+ -8, -8,
+ -8, -8,
+ -8, -8,
+ 4, 4,
+ -8, -8,
+ -8, -8};
+ OCL_MAP_BUFFER(2);
+ OCL_MAP_BUFFER(3);
+ OCL_MAP_BUFFER(4);
+ OCL_MAP_BUFFER(5);
+ OCL_MAP_BUFFER(6);
+ OCL_MAP_BUFFER(7);
+ OCL_MAP_BUFFER(8);
+ int16_t* mv = (int16_t*)buf_data[2];
+ uint16_t* residual = (uint16_t*)buf_data[3];
+ uint8_t* major_shape = (uint8_t*)buf_data[4];
+ uint8_t* minor_shape = (uint8_t*)buf_data[5];
+ uint8_t* direction = (uint8_t*)buf_data[6];
+ uint32_t* dwo = (uint32_t*)buf_data[7];
+ uint32_t* pld = (uint32_t*)buf_data[8];
+#define VME_DEBUG 0
+#if VME_DEBUG
+ std::cout<<std::endl;
+ for (uint32_t j = 0; j <= mv_h - 1; ++j) {
+ for (uint32_t i = 0; i <= mv_w - 1; ++i) {
+ uint32_t mv_num = j * mv_w + i;
+ std::cout<<"******* mv num = "<<mv_num<<": "<<std::endl;
+ std::cout<<"payload register result: "<<std::endl;
+ for(uint32_t row_num = 0; row_num < 8; row_num++){
+ for(int32_t idx = 7; idx >= 0; idx--)
+ printf("%.8x ", pld[mv_num*64 + row_num*8 + idx]);
+ printf("\n");
+ }
+ std::cout<<std::endl;
+ std::cout<<"writeback register result: "<<std::endl;
+ for(uint32_t row_num = 0; row_num < 4; row_num++){
+ for(int32_t wi = 7; wi >= 0; wi--)
+ printf("%.8x ", dwo[mv_num*16*4 + row_num*16 + wi]);
+ printf("\n");
+ for(int32_t wi = 15; wi >= 8; wi--)
+ printf("%.8x ", dwo[mv_num*16*4 + row_num*16 + wi]);
+ printf("\n");
+ }
+ std::cout<<std::endl;
+ std::cout<<"mv: ";
+ std::cout<<"("<<mv[mv_num*2]<<", "<<mv[mv_num*2 + 1]<<") ";
+ std::cout<<std::endl;
+ std::cout<<"residual: ";
+ std::cout<<residual[mv_num]<<" ";
+ std::cout<<std::endl;
+ printf("major shape: %u\n", major_shape[mv_num]);
+ printf("minor shape: %u\n", minor_shape[mv_num]);
+ printf("direction: %u\n", direction[mv_num]);
+ std::cout<<std::endl;
+ }
+ }
+#endif
+ for (uint32_t j = 0; j <= mv_h - 1; ++j) {
+ for (uint32_t i = 0; i <= mv_w - 1; ++i) {
+ uint32_t mv_num = j * mv_w + i;
+ OCL_ASSERT(mv[mv_num*2] == expected[mv_num*2]);
+ OCL_ASSERT(mv[mv_num*2 + 1] == expected[mv_num*2 + 1]);
+ }
+ }
+ OCL_UNMAP_BUFFER(2);
+ OCL_UNMAP_BUFFER(3);
+ OCL_UNMAP_BUFFER(4);
+ OCL_UNMAP_BUFFER(5);
+ OCL_UNMAP_BUFFER(6);
+ OCL_UNMAP_BUFFER(7);
+ OCL_UNMAP_BUFFER(8);
+
+ free(image_data1);
+ free(image_data2);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_block_motion_estimate_intel);
diff --git a/utests/compiler_intra_prediction.cpp b/utests/compiler_intra_prediction.cpp
new file mode 100644
index 0000000..55fc02d
--- /dev/null
+++ b/utests/compiler_intra_prediction.cpp
@@ -0,0 +1,116 @@
+#include "utest_helper.hpp"
+#include <string.h>
+
+void compiler_intra_prediction(void)
+{
+ if (!cl_check_device_side_avc_motion_estimation()) {
+ return;
+ }
+
+ OCL_CREATE_KERNEL("compiler_intra_prediction");
+
+ const size_t w = 80;
+ const size_t h = 48;
+ const size_t mv_w = (w + 15) / 16;
+ const size_t mv_h = (h + 15) / 16;
+
+ cl_image_format format;
+ cl_image_desc desc;
+
+ memset(&desc, 0x0, sizeof(cl_image_desc));
+ memset(&format, 0x0, sizeof(cl_image_format));
+
+ uint8_t* image_data1 = (uint8_t *)malloc(w * h); //src
+ for (size_t j = 0; j < h; j++) {
+ for (size_t i = 0; i < w; i++) {
+ if (i >= 32 && i <= 47 && j >= 16 && j <= 31)
+ image_data1[w * j + i] = 2;
+ else
+ image_data1[w * j + i] = 1;
+ }
+ }
+
+ format.image_channel_order = CL_R;
+ format.image_channel_data_type = CL_UNORM_INT8;
+ desc.image_type = CL_MEM_OBJECT_IMAGE2D;
+ desc.image_width = w;
+ desc.image_height = h;
+ desc.image_row_pitch = 0;
+ OCL_CREATE_IMAGE(buf[0], CL_MEM_COPY_HOST_PTR, &format, &desc, image_data1); //src
+
+ OCL_CREATE_BUFFER(buf[1], 0, mv_w * mv_h * sizeof(uint8_t), NULL);
+ OCL_CREATE_BUFFER(buf[2], 0, mv_w * mv_h * sizeof(uint16_t), NULL);
+ OCL_CREATE_BUFFER(buf[3], 0, mv_w * mv_h * sizeof(uint8_t), NULL);
+ OCL_CREATE_BUFFER(buf[4], 0, mv_w * mv_h * sizeof(uint32_t) * 16 * 8, NULL);
+ OCL_CREATE_BUFFER(buf[5], 0, mv_w * mv_h * sizeof(uint32_t) * 8 * 8, NULL);
+
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+ OCL_SET_ARG(3, sizeof(cl_mem), &buf[3]);
+ OCL_SET_ARG(4, sizeof(cl_mem), &buf[4]);
+ OCL_SET_ARG(5, sizeof(cl_mem), &buf[5]);
+
+ globals[0] = w;
+ globals[1] = h / 16;
+ locals[0] = 16;
+ locals[1] = 1;
+ OCL_NDRANGE(2);
+
+ OCL_MAP_BUFFER(1);
+ OCL_MAP_BUFFER(2);
+ OCL_MAP_BUFFER(3);
+ OCL_MAP_BUFFER(4);
+ OCL_MAP_BUFFER(5);
+ uint8_t* modes = (uint8_t*)buf_data[1];
+ uint16_t* residual = (uint16_t*)buf_data[2];
+ uint8_t* shape = (uint8_t*)buf_data[3];
+ uint32_t* dwo = (uint32_t*)buf_data[4];
+ uint32_t* pld = (uint32_t*)buf_data[5];
+#define VME_DEBUG 0
+#if VME_DEBUG
+ std::cout<<std::endl;
+ for (uint32_t j = 0; j <= mv_h - 1; ++j) {
+ for (uint32_t i = 0; i <= mv_w - 1; ++i) {
+ uint32_t mv_num = j * mv_w + i;
+ std::cout<<"******* mv num = "<<mv_num<<": "<<std::endl;
+ std::cout<<"payload register result: "<<std::endl;
+ for(uint32_t row_num = 0; row_num < 8; row_num++){
+ for(int32_t idx = 7; idx >= 0; idx--)
+ printf("%.8x ", pld[mv_num*64 + row_num*8 + idx]);
+ printf("\n");
+ }
+ std::cout<<std::endl;
+ std::cout<<"writeback register result: "<<std::endl;
+ for(uint32_t row_num = 0; row_num < 4; row_num++){
+ for(int32_t wi = 7; wi >= 0; wi--)
+ printf("%.8x ", dwo[mv_num*16*4 + row_num*16 + wi]);
+ printf("\n");
+ for(int32_t wi = 15; wi >= 8; wi--)
+ printf("%.8x ", dwo[mv_num*16*4 + row_num*16 + wi]);
+ printf("\n");
+ }
+ std::cout<<std::endl;
+ printf("modes: %u\n", modes[mv_num]);
+ std::cout<<std::endl;
+ std::cout<<"residual: "<<residual[mv_num]<<std::endl;
+ std::cout<<std::endl;
+ printf("shape: %u\n", shape[mv_num]);
+ std::cout<<std::endl;
+ }
+ }
+#endif
+ OCL_ASSERT(modes[7] == 2);
+ OCL_ASSERT(residual[7] == 266);
+ OCL_ASSERT(shape[7] == 0);
+
+ OCL_UNMAP_BUFFER(1);
+ OCL_UNMAP_BUFFER(2);
+ OCL_UNMAP_BUFFER(3);
+ OCL_UNMAP_BUFFER(4);
+ OCL_UNMAP_BUFFER(5);
+
+ free(image_data1);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_intra_prediction);
diff --git a/utests/compiler_skip_check.cpp b/utests/compiler_skip_check.cpp
new file mode 100644
index 0000000..84d9058
--- /dev/null
+++ b/utests/compiler_skip_check.cpp
@@ -0,0 +1,190 @@
+#include "utest_helper.hpp"
+#include <string.h>
+
+const size_t w = 80;
+const size_t h = 48;
+const size_t mv_w = (w + 15) / 16;
+const size_t mv_h = (h + 15) / 16;
+
+void cpu_result(uint8_t* srcImg, uint8_t* refImg, int16_t* mv, uint16_t* residual){
+ for (uint32_t j = 0; j <= mv_h - 1; ++j) {
+ for (uint32_t i = 0; i <= mv_w - 1; ++i) {
+ uint32_t mv_num = j * mv_w + i;
+ int16_t mv_x = mv[mv_num*2] >> 2;
+ int16_t mv_y = mv[mv_num*2 + 1] >> 2;
+ int16_t src_mb_x = i * 16;
+ int16_t src_mb_y = j * 16;
+ int16_t ref_mb_x = src_mb_x + mv_x;
+ int16_t ref_mb_y = src_mb_y + mv_y;
+
+ uint16_t res = 0;
+ int16_t sy = src_mb_y, ry = ref_mb_y;
+ for(uint32_t a = 0; a < 16; a++){
+ int16_t sx = src_mb_x;
+ int16_t rx = ref_mb_x;
+ for(uint32_t b = 0; b < 16; b++){
+ uint8_t src_pixel = srcImg[sy*w + sx];
+ uint8_t ref_pixel = refImg[ry*w + rx];
+ res += abs(src_pixel - ref_pixel);
+ sx++;
+ rx++;
+ }
+ sy++;
+ ry++;
+ }
+ residual[mv_num] = res;
+ }
+ }
+}
+
+void compiler_skip_check(void)
+{
+ if (!cl_check_device_side_avc_motion_estimation()) {
+ return;
+ }
+
+ OCL_CREATE_KERNEL("compiler_skip_check");
+
+ cl_image_format format;
+ cl_image_desc desc;
+
+ memset(&desc, 0x0, sizeof(cl_image_desc));
+ memset(&format, 0x0, sizeof(cl_image_format));
+
+ uint8_t* image_data1 = (uint8_t *)malloc(w * h); //src
+ uint8_t* image_data2 = (uint8_t *)malloc(w * h); //ref
+ for (size_t j = 0; j < h; j++) {
+ for (size_t i = 0; i < w; i++) {
+ if (i >= 32 && i <= 47 && j >= 16 && j <= 31)
+ image_data1[w * j + i] = 100;
+ else
+ image_data1[w * j + i] = j + i;
+ if (i >= 33 && i <= 48 && j >= 14 && j <= 29)
+ image_data2[w * j + i] = 99;
+ else
+ image_data2[w * j + i] = (h-1) + (w-1) - (j+i);
+ }
+ }
+
+
+ format.image_channel_order = CL_R;
+ format.image_channel_data_type = CL_UNORM_INT8;
+ desc.image_type = CL_MEM_OBJECT_IMAGE2D;
+ desc.image_width = w;
+ desc.image_height = h;
+ desc.image_row_pitch = 0;
+ OCL_CREATE_IMAGE(buf[0], CL_MEM_COPY_HOST_PTR, &format, &desc, image_data1); //src
+ OCL_CREATE_IMAGE(buf[1], CL_MEM_COPY_HOST_PTR, &format, &desc, image_data2); //ref
+
+ int16_t* input_mv = (int16_t*) malloc(mv_w * mv_h * sizeof(int16_t) * 2);
+ //Generate input mv data
+ for (uint32_t j = 0; j <= mv_h - 1; ++j) {
+ for (uint32_t i = 0; i <= mv_w - 1; ++i) {
+ uint32_t mv_num = j * mv_w + i;
+ if(i == 32/16 && j == 16/16){
+ input_mv[mv_num*2 ] = 1;
+ input_mv[mv_num*2 + 1 ] = -2;
+ }
+ else{
+ input_mv[mv_num*2 ] = (mv_num) % 2;
+ input_mv[mv_num*2 + 1 ] = (mv_num) % 3;
+ if(i == mv_w - 1)
+ input_mv[mv_num*2 ] *= -1;
+ if(j == mv_h - 1)
+ input_mv[mv_num*2 + 1 ] *= -1;
+ }
+ input_mv[mv_num*2 ] <<= 2;
+ input_mv[mv_num*2 + 1 ] <<= 2;
+ }
+ }
+
+ uint16_t* cpu_resi = (uint16_t*) malloc(mv_w * mv_h * sizeof(uint16_t));
+ cpu_result(image_data1, image_data2, input_mv, cpu_resi);
+
+ OCL_CREATE_BUFFER(buf[2], CL_MEM_COPY_HOST_PTR, mv_w * mv_h * sizeof(int16_t) * 2, input_mv);
+ OCL_CREATE_BUFFER(buf[3], 0, mv_w * mv_h * sizeof(uint16_t), NULL);
+ OCL_CREATE_BUFFER(buf[4], 0, mv_w * mv_h * sizeof(uint32_t) * 16 * 8, NULL);
+ OCL_CREATE_BUFFER(buf[5], 0, mv_w * mv_h * sizeof(uint32_t) * 8 * 8, NULL);
+
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+ OCL_SET_ARG(3, sizeof(cl_mem), &buf[3]);
+ OCL_SET_ARG(4, sizeof(cl_mem), &buf[4]);
+ OCL_SET_ARG(5, sizeof(cl_mem), &buf[5]);
+
+ globals[0] = w;
+ globals[1] = h / 16;
+ locals[0] = 16;
+ locals[1] = 1;
+ OCL_NDRANGE(2);
+
+ OCL_MAP_BUFFER(3);
+ OCL_MAP_BUFFER(4);
+ OCL_MAP_BUFFER(5);
+ uint16_t* residual = (uint16_t*)buf_data[3];
+ uint32_t* dwo = (uint32_t*)buf_data[4];
+ uint32_t* pld = (uint32_t*)buf_data[5];
+#define VME_DEBUG 0
+#if VME_DEBUG
+ std::cout<<std::endl;
+ for (uint32_t j = 0; j <= mv_h - 1; ++j) {
+ for (uint32_t i = 0; i <= mv_w - 1; ++i) {
+ uint32_t mv_num = j * mv_w + i;
+ std::cout<<"******* mv num = "<<mv_num<<": "<<std::endl;
+ std::cout<<"payload register result: "<<std::endl;
+ for(uint32_t row_num = 0; row_num < 8; row_num++){
+ for(int32_t idx = 7; idx >= 0; idx--)
+ printf("%.8x ", pld[mv_num*64 + row_num*8 + idx]);
+ printf("\n");
+ }
+ std::cout<<std::endl;
+ std::cout<<"writeback register result: "<<std::endl;
+ for(uint32_t row_num = 0; row_num < 4; row_num++){
+ for(int32_t wi = 7; wi >= 0; wi--)
+ printf("%.8x ", dwo[mv_num*16*4 + row_num*16 + wi]);
+ printf("\n");
+ for(int32_t wi = 15; wi >= 8; wi--)
+ printf("%.8x ", dwo[mv_num*16*4 + row_num*16 + wi]);
+ printf("\n");
+ }
+ std::cout<<std::endl;
+ std::cout<<"residual: ";
+ std::cout<<residual[mv_num]<<std::endl;
+ }
+ }
+ std::cout<<"cpu residual: "<<std::endl;
+ for (uint32_t j = 0; j <= mv_h - 1; ++j) {
+ for (uint32_t i = 0; i <= mv_w - 1; ++i) {
+ uint32_t mv_num = j * mv_w + i;
+ std::cout<<cpu_resi[mv_num]<<" ";
+ }
+ }
+ std::cout<<std::endl;
+ std::cout<<"gpu residual: "<<std::endl;
+ for (uint32_t j = 0; j <= mv_h - 1; ++j) {
+ for (uint32_t i = 0; i <= mv_w - 1; ++i) {
+ uint32_t mv_num = j * mv_w + i;
+ std::cout<<residual[mv_num]<<" ";
+ }
+ }
+#endif
+ std::cout<<std::endl;
+ for (uint32_t j = 0; j <= mv_h - 1; ++j) {
+ for (uint32_t i = 0; i <= mv_w - 1; ++i) {
+ uint32_t mv_num = j * mv_w + i;
+ OCL_ASSERT(cpu_resi[mv_num] == residual[mv_num]);
+ }
+ }
+
+ OCL_UNMAP_BUFFER(3);
+ OCL_UNMAP_BUFFER(4);
+ OCL_UNMAP_BUFFER(5);
+
+ free(image_data1);
+ free(image_data2);
+ free(input_mv);
+ free(cpu_resi);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_skip_check);
diff --git a/utests/utest_helper.cpp b/utests/utest_helper.cpp
index f4487c1..862b1d3 100644
--- a/utests/utest_helper.cpp
+++ b/utests/utest_helper.cpp
@@ -895,6 +895,24 @@ int cl_check_motion_estimation(void)
return 1;
}
+int cl_check_device_side_avc_motion_estimation(void)
+{
+ std::string extStr;
+ size_t param_value_size;
+ OCL_CALL(clGetDeviceInfo, device, CL_DEVICE_EXTENSIONS, 0, 0, ¶m_value_size);
+ std::vector<char> param_value(param_value_size);
+ OCL_CALL(clGetDeviceInfo, device, CL_DEVICE_EXTENSIONS, param_value_size,
+ param_value.empty() ? NULL : ¶m_value.front(), ¶m_value_size);
+ if (!param_value.empty())
+ extStr = std::string(¶m_value.front(), param_value_size-1);
+
+ if (std::strstr(extStr.c_str(), "cl_intel_device_side_avc_motion_estimation") == NULL) {
+ printf("No cl_intel_device_side_avc_motion_estimation, Skip!");
+ return 0;
+ }
+ return 1;
+}
+
int cl_check_subgroups(void)
{
std::string extStr;
diff --git a/utests/utest_helper.hpp b/utests/utest_helper.hpp
index 5dc381e..e19a95d 100644
--- a/utests/utest_helper.hpp
+++ b/utests/utest_helper.hpp
@@ -315,6 +315,9 @@ extern clGetKernelSubGroupInfoKHR_cb* utestclGetKernelSubGroupInfoKHR;
/* Check if cl_intel_motion_estimation enabled. */
extern int cl_check_motion_estimation(void);
+/* Check if cl_intel_device_side_avc_motion_estimation enabled. */
+extern int cl_check_device_side_avc_motion_estimation(void);
+
/* Check is cl version 2.0 or Beignet extension. */
extern int cl_check_ocl20(bool or_beignet = true);
--
2.7.4
More information about the Beignet
mailing list