[Beignet] [PATCH] support 64bit-integer multiplication

Homer Hsing homer.xing at intel.com
Mon Aug 12 20:05:28 PDT 2013


also add test case

Signed-off-by: Homer Hsing <homer.xing at intel.com>
---
 backend/src/backend/gen_context.cpp                | 46 ++++++++++++++++++++
 backend/src/backend/gen_context.hpp                |  3 ++
 .../src/backend/gen_insn_gen7_schedule_info.hxx    |  1 +
 backend/src/backend/gen_insn_selection.cpp         | 23 +++++++---
 backend/src/backend/gen_insn_selection.hxx         |  1 +
 kernels/compiler_long_mult.cl                      |  7 ++++
 utests/CMakeLists.txt                              |  1 +
 utests/compiler_long_mult.cpp                      | 49 ++++++++++++++++++++++
 8 files changed, 126 insertions(+), 5 deletions(-)
 create mode 100644 kernels/compiler_long_mult.cl
 create mode 100644 utests/compiler_long_mult.cpp

diff --git a/backend/src/backend/gen_context.cpp b/backend/src/backend/gen_context.cpp
index 406cb80..86f3555 100644
--- a/backend/src/backend/gen_context.cpp
+++ b/backend/src/backend/gen_context.cpp
@@ -598,6 +598,52 @@ namespace gbe
     p->pop();
   }
 
+  void GenContext::I32FullMult(GenRegister high, GenRegister low, GenRegister src0, GenRegister src1) {
+    GenRegister acc = GenRegister::retype(GenRegister::acc(), GEN_TYPE_UD);
+    int execWidth = p->curr.execWidth;
+    p->push();
+    p->curr.execWidth = 8;
+    for(int i = 0; i < execWidth; i += 8) {
+      p->MUL(acc, src0, src1);
+      p->curr.accWrEnable = 1;
+      p->MACH(high, src0, src1);
+      p->curr.accWrEnable = 0;
+      p->MOV(low, acc);
+      src0 = GenRegister::suboffset(src0, 8);
+      src1 = GenRegister::suboffset(src1, 8);
+      high = GenRegister::suboffset(high, 8);
+      low = GenRegister::suboffset(low, 8);
+    }
+    p->pop();
+  }
+
+  void GenContext::emitI64MULInstruction(const SelectionInstruction &insn) {
+    GenRegister dest = ra->genReg(insn.dst(0));
+    GenRegister x = ra->genReg(insn.src(0));
+    GenRegister y = ra->genReg(insn.src(1));
+    GenRegister a = ra->genReg(insn.dst(1));
+    GenRegister b = ra->genReg(insn.dst(2));
+    GenRegister c = ra->genReg(insn.dst(3));
+    GenRegister d = ra->genReg(insn.dst(4));
+    GenRegister e = ra->genReg(insn.dst(5));
+    GenRegister f = ra->genReg(insn.dst(6));
+    a.type = b.type = c.type = d.type = e.type = f.type = GEN_TYPE_UD;
+    loadTopHalf(a, x);
+    loadBottomHalf(b, x);
+    loadTopHalf(c, y);
+    loadBottomHalf(d, y);
+    p->push();
+    p->curr.predicate = GEN_PREDICATE_NONE;
+    I32FullMult(GenRegister::null(), e, b, c);
+    I32FullMult(GenRegister::null(), f, a, d);
+    p->ADD(e, e, f);
+    I32FullMult(f, a, b, d);
+    p->ADD(e, e, f);
+    p->pop();
+    storeTopHalf(dest, e);
+    storeBottomHalf(dest, a);
+  }
+
   void GenContext::emitTernaryInstruction(const SelectionInstruction &insn) {
     const GenRegister dst = ra->genReg(insn.dst(0));
     const GenRegister src0 = ra->genReg(insn.src(0));
diff --git a/backend/src/backend/gen_context.hpp b/backend/src/backend/gen_context.hpp
index b294042..1531961 100644
--- a/backend/src/backend/gen_context.hpp
+++ b/backend/src/backend/gen_context.hpp
@@ -86,6 +86,7 @@ namespace gbe
 
     void addWithCarry(GenRegister dest, GenRegister src0, GenRegister src1);
     void subWithBorrow(GenRegister dest, GenRegister src0, GenRegister src1);
+    void I32FullMult(GenRegister high, GenRegister low, GenRegister src0, GenRegister src1);
 
     /*! Final Gen ISA emission helper functions */
     void emitLabelInstruction(const SelectionInstruction &insn);
@@ -116,8 +117,10 @@ namespace gbe
     void emitSpillRegInstruction(const SelectionInstruction &insn);
     void emitUnSpillRegInstruction(const SelectionInstruction &insn);
     void emitGetImageInfoInstruction(const SelectionInstruction &insn);
+    void emitI64MULInstruction(const SelectionInstruction &insn);
     void scratchWrite(const GenRegister header, uint32_t offset, uint32_t reg_num, uint32_t reg_type, uint32_t channel_mode);
     void scratchRead(const GenRegister dst, const GenRegister header, uint32_t offset, uint32_t reg_num, uint32_t reg_type, uint32_t channel_mode);
+
     /*! Implements base class */
     virtual Kernel *allocateKernel(void);
     /*! Store the position of each label instruction in the Gen ISA stream */
diff --git a/backend/src/backend/gen_insn_gen7_schedule_info.hxx b/backend/src/backend/gen_insn_gen7_schedule_info.hxx
index 4879b66..7f214ac 100644
--- a/backend/src/backend/gen_insn_gen7_schedule_info.hxx
+++ b/backend/src/backend/gen_insn_gen7_schedule_info.hxx
@@ -27,3 +27,4 @@ DECL_GEN7_SCHEDULE(SpillReg,        80,        1,        1)
 DECL_GEN7_SCHEDULE(UnSpillReg,      80,        1,        1)
 DECL_GEN7_SCHEDULE(GetImageInfo,    20,        4,        2)
 DECL_GEN7_SCHEDULE(Atomic,          80,        1,        1)
+DECL_GEN7_SCHEDULE(I64MUL,          20,        4,        2)
diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp
index 929a3bd..9e3c535 100644
--- a/backend/src/backend/gen_insn_selection.cpp
+++ b/backend/src/backend/gen_insn_selection.cpp
@@ -523,6 +523,8 @@ namespace gbe
     void TYPED_WRITE(GenRegister *src, uint32_t srcNum, GenRegister *msgs, uint32_t msgNum, uint32_t bti);
     /*! Get image information */
     void GET_IMAGE_INFO(uint32_t type, GenRegister *dst, uint32_t dst_num, uint32_t bti);
+    /*! Multiply 64-bit integers */
+    void I64MUL(Reg dst, Reg src0, Reg src1, GenRegister tmp[6]);
     /*! Use custom allocators */
     GBE_CLASS(Opaque);
     friend class SelectionBlock;
@@ -1003,6 +1005,15 @@ namespace gbe
     insn->extra.function = function;
   }
 
+  void Selection::Opaque::I64MUL(Reg dst, Reg src0, Reg src1, GenRegister tmp[6]) {
+    SelectionInstruction *insn = this->appendInsn(SEL_OP_I64MUL, 7, 2);
+    insn->dst(0) = dst;
+    insn->src(0) = src0;
+    insn->src(1) = src1;
+    for(int i = 0; i < 6; i++)
+      insn->dst(i + 1) = tmp[i];
+  }
+
   void Selection::Opaque::ALU1(SelectionOpcode opcode, Reg dst, Reg src) {
     SelectionInstruction *insn = this->appendInsn(opcode, 1, 1);
     insn->dst(0) = dst;
@@ -1610,12 +1621,14 @@ namespace gbe
           if (type == TYPE_U32 || type == TYPE_S32) {
             sel.pop();
             return false;
-          }
-          else {
-            GBE_ASSERTM((type != TYPE_S64 && type != TYPE_U64), "64bit integer not supported yet!" );
+          } else if (type == TYPE_S64 || type == TYPE_U64) {
+            GenRegister tmp[6];
+            for(int i = 0; i < 6; i++)
+              tmp[i] = sel.selReg(sel.reg(FAMILY_DWORD));
+            sel.I64MUL(dst, src0, src1, tmp);
+          } else
             sel.MUL(dst, src0, src1);
-          }
-        break;
+          break;
         case OP_HADD: {
             GenRegister temp = GenRegister::retype(sel.selReg(sel.reg(FAMILY_DWORD)), GEN_TYPE_D);
             sel.HADD(dst, src0, src1, temp);
diff --git a/backend/src/backend/gen_insn_selection.hxx b/backend/src/backend/gen_insn_selection.hxx
index 06469ca..6ef50b8 100644
--- a/backend/src/backend/gen_insn_selection.hxx
+++ b/backend/src/backend/gen_insn_selection.hxx
@@ -30,6 +30,7 @@ DECL_SELECTION_IR(ADD, BinaryInstruction)
 DECL_SELECTION_IR(I64ADD, BinaryWithTempInstruction)
 DECL_SELECTION_IR(I64SUB, BinaryWithTempInstruction)
 DECL_SELECTION_IR(MUL, BinaryInstruction)
+DECL_SELECTION_IR(I64MUL, I64MULInstruction)
 DECL_SELECTION_IR(ATOMIC, AtomicInstruction)
 DECL_SELECTION_IR(MACH, BinaryInstruction)
 DECL_SELECTION_IR(CMP, CompareInstruction)
diff --git a/kernels/compiler_long_mult.cl b/kernels/compiler_long_mult.cl
new file mode 100644
index 0000000..5b96d74
--- /dev/null
+++ b/kernels/compiler_long_mult.cl
@@ -0,0 +1,7 @@
+kernel void compiler_long_mult(global long *src1, global long *src2, global long *dst) {
+  int i = get_global_id(0);
+  if(i < 3)
+    dst[i] = src1[i] + src2[i];
+  else
+    dst[i] = src1[i] * src2[i];
+}
diff --git a/utests/CMakeLists.txt b/utests/CMakeLists.txt
index b3d039e..746d77b 100644
--- a/utests/CMakeLists.txt
+++ b/utests/CMakeLists.txt
@@ -129,6 +129,7 @@ set (utests_sources
   compiler_long_shl.cpp
   compiler_long_shr.cpp
   compiler_long_asr.cpp
+  compiler_long_mult.cpp
   utest_assert.cpp
   utest.cpp
   utest_file_map.cpp
diff --git a/utests/compiler_long_mult.cpp b/utests/compiler_long_mult.cpp
new file mode 100644
index 0000000..06070f7
--- /dev/null
+++ b/utests/compiler_long_mult.cpp
@@ -0,0 +1,49 @@
+#include <cstdint>
+#include <cstring>
+#include <iostream>
+#include "utest_helper.hpp"
+
+void compiler_long_mult(void)
+{
+  const size_t n = 16;
+  int64_t src1[n], src2[n];
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("compiler_long_mult");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(int64_t), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(int64_t), NULL);
+  OCL_CREATE_BUFFER(buf[2], 0, n * sizeof(int64_t), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+  globals[0] = n;
+  locals[0] = 16;
+
+  // Run random tests
+  for (int32_t i = 0; i < (int32_t) n; ++i) {
+    src1[i] = 0x77665544FFEEDDCCLL;
+    src2[i] = ((int64_t)rand() << 32) + rand();
+  }
+  OCL_MAP_BUFFER(0);
+  OCL_MAP_BUFFER(1);
+  memcpy(buf_data[0], src1, sizeof(src1));
+  memcpy(buf_data[1], src2, sizeof(src2));
+  OCL_UNMAP_BUFFER(0);
+  OCL_UNMAP_BUFFER(1);
+
+  // Run the kernel on GPU
+  OCL_NDRANGE(1);
+
+  // Compare
+  OCL_MAP_BUFFER(2);
+  for (int32_t i = 0; i < (int32_t) n; ++i) {
+    //printf("%lx\n", ((int64_t *)buf_data[2])[i]);
+    if (i < 3)
+      OCL_ASSERT(src1[i] + src2[i] == ((int64_t *)buf_data[2])[i]);
+    else
+      OCL_ASSERT(src1[i] * src2[i] == ((int64_t *)buf_data[2])[i]);
+  }
+  OCL_UNMAP_BUFFER(2);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_long_mult);
-- 
1.8.1.2



More information about the Beignet mailing list