[Beignet] [PATCH] support 64bit-integer multiplication
Homer Hsing
homer.xing at intel.com
Mon Aug 12 20:05:28 PDT 2013
also add test case
Signed-off-by: Homer Hsing <homer.xing at intel.com>
---
backend/src/backend/gen_context.cpp | 46 ++++++++++++++++++++
backend/src/backend/gen_context.hpp | 3 ++
.../src/backend/gen_insn_gen7_schedule_info.hxx | 1 +
backend/src/backend/gen_insn_selection.cpp | 23 +++++++---
backend/src/backend/gen_insn_selection.hxx | 1 +
kernels/compiler_long_mult.cl | 7 ++++
utests/CMakeLists.txt | 1 +
utests/compiler_long_mult.cpp | 49 ++++++++++++++++++++++
8 files changed, 126 insertions(+), 5 deletions(-)
create mode 100644 kernels/compiler_long_mult.cl
create mode 100644 utests/compiler_long_mult.cpp
diff --git a/backend/src/backend/gen_context.cpp b/backend/src/backend/gen_context.cpp
index 406cb80..86f3555 100644
--- a/backend/src/backend/gen_context.cpp
+++ b/backend/src/backend/gen_context.cpp
@@ -598,6 +598,52 @@ namespace gbe
p->pop();
}
+ void GenContext::I32FullMult(GenRegister high, GenRegister low, GenRegister src0, GenRegister src1) {
+ GenRegister acc = GenRegister::retype(GenRegister::acc(), GEN_TYPE_UD);
+ int execWidth = p->curr.execWidth;
+ p->push();
+ p->curr.execWidth = 8;
+ for(int i = 0; i < execWidth; i += 8) {
+ p->MUL(acc, src0, src1);
+ p->curr.accWrEnable = 1;
+ p->MACH(high, src0, src1);
+ p->curr.accWrEnable = 0;
+ p->MOV(low, acc);
+ src0 = GenRegister::suboffset(src0, 8);
+ src1 = GenRegister::suboffset(src1, 8);
+ high = GenRegister::suboffset(high, 8);
+ low = GenRegister::suboffset(low, 8);
+ }
+ p->pop();
+ }
+
+ void GenContext::emitI64MULInstruction(const SelectionInstruction &insn) {
+ GenRegister dest = ra->genReg(insn.dst(0));
+ GenRegister x = ra->genReg(insn.src(0));
+ GenRegister y = ra->genReg(insn.src(1));
+ GenRegister a = ra->genReg(insn.dst(1));
+ GenRegister b = ra->genReg(insn.dst(2));
+ GenRegister c = ra->genReg(insn.dst(3));
+ GenRegister d = ra->genReg(insn.dst(4));
+ GenRegister e = ra->genReg(insn.dst(5));
+ GenRegister f = ra->genReg(insn.dst(6));
+ a.type = b.type = c.type = d.type = e.type = f.type = GEN_TYPE_UD;
+ loadTopHalf(a, x);
+ loadBottomHalf(b, x);
+ loadTopHalf(c, y);
+ loadBottomHalf(d, y);
+ p->push();
+ p->curr.predicate = GEN_PREDICATE_NONE;
+ I32FullMult(GenRegister::null(), e, b, c);
+ I32FullMult(GenRegister::null(), f, a, d);
+ p->ADD(e, e, f);
+ I32FullMult(f, a, b, d);
+ p->ADD(e, e, f);
+ p->pop();
+ storeTopHalf(dest, e);
+ storeBottomHalf(dest, a);
+ }
+
void GenContext::emitTernaryInstruction(const SelectionInstruction &insn) {
const GenRegister dst = ra->genReg(insn.dst(0));
const GenRegister src0 = ra->genReg(insn.src(0));
diff --git a/backend/src/backend/gen_context.hpp b/backend/src/backend/gen_context.hpp
index b294042..1531961 100644
--- a/backend/src/backend/gen_context.hpp
+++ b/backend/src/backend/gen_context.hpp
@@ -86,6 +86,7 @@ namespace gbe
void addWithCarry(GenRegister dest, GenRegister src0, GenRegister src1);
void subWithBorrow(GenRegister dest, GenRegister src0, GenRegister src1);
+ void I32FullMult(GenRegister high, GenRegister low, GenRegister src0, GenRegister src1);
/*! Final Gen ISA emission helper functions */
void emitLabelInstruction(const SelectionInstruction &insn);
@@ -116,8 +117,10 @@ namespace gbe
void emitSpillRegInstruction(const SelectionInstruction &insn);
void emitUnSpillRegInstruction(const SelectionInstruction &insn);
void emitGetImageInfoInstruction(const SelectionInstruction &insn);
+ void emitI64MULInstruction(const SelectionInstruction &insn);
void scratchWrite(const GenRegister header, uint32_t offset, uint32_t reg_num, uint32_t reg_type, uint32_t channel_mode);
void scratchRead(const GenRegister dst, const GenRegister header, uint32_t offset, uint32_t reg_num, uint32_t reg_type, uint32_t channel_mode);
+
/*! Implements base class */
virtual Kernel *allocateKernel(void);
/*! Store the position of each label instruction in the Gen ISA stream */
diff --git a/backend/src/backend/gen_insn_gen7_schedule_info.hxx b/backend/src/backend/gen_insn_gen7_schedule_info.hxx
index 4879b66..7f214ac 100644
--- a/backend/src/backend/gen_insn_gen7_schedule_info.hxx
+++ b/backend/src/backend/gen_insn_gen7_schedule_info.hxx
@@ -27,3 +27,4 @@ DECL_GEN7_SCHEDULE(SpillReg, 80, 1, 1)
DECL_GEN7_SCHEDULE(UnSpillReg, 80, 1, 1)
DECL_GEN7_SCHEDULE(GetImageInfo, 20, 4, 2)
DECL_GEN7_SCHEDULE(Atomic, 80, 1, 1)
+DECL_GEN7_SCHEDULE(I64MUL, 20, 4, 2)
diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp
index 929a3bd..9e3c535 100644
--- a/backend/src/backend/gen_insn_selection.cpp
+++ b/backend/src/backend/gen_insn_selection.cpp
@@ -523,6 +523,8 @@ namespace gbe
void TYPED_WRITE(GenRegister *src, uint32_t srcNum, GenRegister *msgs, uint32_t msgNum, uint32_t bti);
/*! Get image information */
void GET_IMAGE_INFO(uint32_t type, GenRegister *dst, uint32_t dst_num, uint32_t bti);
+ /*! Multiply 64-bit integers */
+ void I64MUL(Reg dst, Reg src0, Reg src1, GenRegister tmp[6]);
/*! Use custom allocators */
GBE_CLASS(Opaque);
friend class SelectionBlock;
@@ -1003,6 +1005,15 @@ namespace gbe
insn->extra.function = function;
}
+ void Selection::Opaque::I64MUL(Reg dst, Reg src0, Reg src1, GenRegister tmp[6]) {
+ SelectionInstruction *insn = this->appendInsn(SEL_OP_I64MUL, 7, 2);
+ insn->dst(0) = dst;
+ insn->src(0) = src0;
+ insn->src(1) = src1;
+ for(int i = 0; i < 6; i++)
+ insn->dst(i + 1) = tmp[i];
+ }
+
void Selection::Opaque::ALU1(SelectionOpcode opcode, Reg dst, Reg src) {
SelectionInstruction *insn = this->appendInsn(opcode, 1, 1);
insn->dst(0) = dst;
@@ -1610,12 +1621,14 @@ namespace gbe
if (type == TYPE_U32 || type == TYPE_S32) {
sel.pop();
return false;
- }
- else {
- GBE_ASSERTM((type != TYPE_S64 && type != TYPE_U64), "64bit integer not supported yet!" );
+ } else if (type == TYPE_S64 || type == TYPE_U64) {
+ GenRegister tmp[6];
+ for(int i = 0; i < 6; i++)
+ tmp[i] = sel.selReg(sel.reg(FAMILY_DWORD));
+ sel.I64MUL(dst, src0, src1, tmp);
+ } else
sel.MUL(dst, src0, src1);
- }
- break;
+ break;
case OP_HADD: {
GenRegister temp = GenRegister::retype(sel.selReg(sel.reg(FAMILY_DWORD)), GEN_TYPE_D);
sel.HADD(dst, src0, src1, temp);
diff --git a/backend/src/backend/gen_insn_selection.hxx b/backend/src/backend/gen_insn_selection.hxx
index 06469ca..6ef50b8 100644
--- a/backend/src/backend/gen_insn_selection.hxx
+++ b/backend/src/backend/gen_insn_selection.hxx
@@ -30,6 +30,7 @@ DECL_SELECTION_IR(ADD, BinaryInstruction)
DECL_SELECTION_IR(I64ADD, BinaryWithTempInstruction)
DECL_SELECTION_IR(I64SUB, BinaryWithTempInstruction)
DECL_SELECTION_IR(MUL, BinaryInstruction)
+DECL_SELECTION_IR(I64MUL, I64MULInstruction)
DECL_SELECTION_IR(ATOMIC, AtomicInstruction)
DECL_SELECTION_IR(MACH, BinaryInstruction)
DECL_SELECTION_IR(CMP, CompareInstruction)
diff --git a/kernels/compiler_long_mult.cl b/kernels/compiler_long_mult.cl
new file mode 100644
index 0000000..5b96d74
--- /dev/null
+++ b/kernels/compiler_long_mult.cl
@@ -0,0 +1,7 @@
+kernel void compiler_long_mult(global long *src1, global long *src2, global long *dst) {
+ int i = get_global_id(0);
+ if(i < 3)
+ dst[i] = src1[i] + src2[i];
+ else
+ dst[i] = src1[i] * src2[i];
+}
diff --git a/utests/CMakeLists.txt b/utests/CMakeLists.txt
index b3d039e..746d77b 100644
--- a/utests/CMakeLists.txt
+++ b/utests/CMakeLists.txt
@@ -129,6 +129,7 @@ set (utests_sources
compiler_long_shl.cpp
compiler_long_shr.cpp
compiler_long_asr.cpp
+ compiler_long_mult.cpp
utest_assert.cpp
utest.cpp
utest_file_map.cpp
diff --git a/utests/compiler_long_mult.cpp b/utests/compiler_long_mult.cpp
new file mode 100644
index 0000000..06070f7
--- /dev/null
+++ b/utests/compiler_long_mult.cpp
@@ -0,0 +1,49 @@
+#include <cstdint>
+#include <cstring>
+#include <iostream>
+#include "utest_helper.hpp"
+
+void compiler_long_mult(void)
+{
+ const size_t n = 16;
+ int64_t src1[n], src2[n];
+
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL("compiler_long_mult");
+ OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(int64_t), NULL);
+ OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(int64_t), NULL);
+ OCL_CREATE_BUFFER(buf[2], 0, n * sizeof(int64_t), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+ globals[0] = n;
+ locals[0] = 16;
+
+ // Run random tests
+ for (int32_t i = 0; i < (int32_t) n; ++i) {
+ src1[i] = 0x77665544FFEEDDCCLL;
+ src2[i] = ((int64_t)rand() << 32) + rand();
+ }
+ OCL_MAP_BUFFER(0);
+ OCL_MAP_BUFFER(1);
+ memcpy(buf_data[0], src1, sizeof(src1));
+ memcpy(buf_data[1], src2, sizeof(src2));
+ OCL_UNMAP_BUFFER(0);
+ OCL_UNMAP_BUFFER(1);
+
+ // Run the kernel on GPU
+ OCL_NDRANGE(1);
+
+ // Compare
+ OCL_MAP_BUFFER(2);
+ for (int32_t i = 0; i < (int32_t) n; ++i) {
+ //printf("%lx\n", ((int64_t *)buf_data[2])[i]);
+ if (i < 3)
+ OCL_ASSERT(src1[i] + src2[i] == ((int64_t *)buf_data[2])[i]);
+ else
+ OCL_ASSERT(src1[i] * src2[i] == ((int64_t *)buf_data[2])[i]);
+ }
+ OCL_UNMAP_BUFFER(2);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_long_mult);
--
1.8.1.2
More information about the Beignet
mailing list