[Beignet] [PATCH v2] support 64bit-integer addition, subtraction

Homer Hsing homer.xing at intel.com
Tue Aug 6 00:41:40 PDT 2013


also enable GPU command "subb" (subtract with borrow)

also add test cases

v2: renamed GEN_TYPE_UQ/GEN_TYPE_Q to GEN_TYPE_UL/GEN_TYPE_L

Signed-off-by: Homer Hsing <homer.xing at intel.com>
---
 backend/src/backend/gen/gen_mesa_disasm.c  |   1 +
 backend/src/backend/gen_context.cpp        | 130 +++++++++++++++++++++++++++++
 backend/src/backend/gen_context.hpp        |  11 +++
 backend/src/backend/gen_defs.hpp           |   3 +
 backend/src/backend/gen_encoder.cpp        |   7 ++
 backend/src/backend/gen_encoder.hpp        |   1 +
 backend/src/backend/gen_insn_selection.cpp |  22 ++++-
 backend/src/backend/gen_insn_selection.hxx |   2 +
 backend/src/backend/gen_register.hpp       |  22 +++++
 kernels/compiler_long.cl                   |   7 ++
 utests/CMakeLists.txt                      |   1 +
 utests/compiler_long.cpp                   |  58 +++++++++++++
 12 files changed, 262 insertions(+), 3 deletions(-)
 create mode 100644 kernels/compiler_long.cl
 create mode 100644 utests/compiler_long.cpp

diff --git a/backend/src/backend/gen/gen_mesa_disasm.c b/backend/src/backend/gen/gen_mesa_disasm.c
index ce20e44..7d77e7a 100644
--- a/backend/src/backend/gen/gen_mesa_disasm.c
+++ b/backend/src/backend/gen/gen_mesa_disasm.c
@@ -83,6 +83,7 @@ static const struct {
   [GEN_OPCODE_AVG] = { .name = "avg", .nsrc = 2, .ndst = 1 },
   [GEN_OPCODE_ADD] = { .name = "add", .nsrc = 2, .ndst = 1 },
   [GEN_OPCODE_ADDC] = { .name = "addc", .nsrc = 2, .ndst = 1 },
+  [GEN_OPCODE_SUBB] = { .name = "subb", .nsrc = 2, .ndst = 1 },
   [GEN_OPCODE_SEL] = { .name = "sel", .nsrc = 2, .ndst = 1 },
   [GEN_OPCODE_AND] = { .name = "and", .nsrc = 2, .ndst = 1 },
   [GEN_OPCODE_OR] = { .name = "or", .nsrc = 2, .ndst = 1 },
diff --git a/backend/src/backend/gen_context.cpp b/backend/src/backend/gen_context.cpp
index 709122a..7a33b79 100644
--- a/backend/src/backend/gen_context.cpp
+++ b/backend/src/backend/gen_context.cpp
@@ -175,12 +175,142 @@ namespace gbe
     }
   }
 
+  void GenContext::loadTopHalf(GenRegister dest, GenRegister src) {
+    int execWidth = p->curr.execWidth;
+    src = src.top_half();
+    p->push();
+    p->curr.predicate = GEN_PREDICATE_NONE;
+    p->curr.execWidth = 8;
+    p->MOV(dest, src);
+    p->MOV(GenRegister::suboffset(dest, 4), GenRegister::suboffset(src, 8));
+    if (execWidth == 16) {
+      p->MOV(GenRegister::suboffset(dest, 8), GenRegister::suboffset(src, 16));
+      p->MOV(GenRegister::suboffset(dest, 12), GenRegister::suboffset(src, 24));
+    }
+    p->pop();
+  }
+
+  void GenContext::storeTopHalf(GenRegister dest, GenRegister src) {
+    int execWidth = p->curr.execWidth;
+    dest = dest.top_half();
+    p->push();
+    p->curr.execWidth = 8;
+    p->MOV(dest, src);
+    p->curr.nibControl = 1;
+    p->MOV(GenRegister::suboffset(dest, 8), GenRegister::suboffset(src, 4));
+    if (execWidth == 16) {
+      p->curr.quarterControl = 1;
+      p->curr.nibControl = 0;
+      p->MOV(GenRegister::suboffset(dest, 16), GenRegister::suboffset(src, 8));
+      p->curr.nibControl = 1;
+      p->MOV(GenRegister::suboffset(dest, 24), GenRegister::suboffset(src, 12));
+    }
+    p->pop();
+  }
+
+  void GenContext::loadBottomHalf(GenRegister dest, GenRegister src) {
+    int execWidth = p->curr.execWidth;
+    src = src.bottom_half();
+    p->push();
+    p->curr.predicate = GEN_PREDICATE_NONE;
+    p->curr.execWidth = 8;
+    p->MOV(dest, src);
+    p->MOV(GenRegister::suboffset(dest, 4), GenRegister::suboffset(src, 8));
+    if (execWidth == 16) {
+      p->MOV(GenRegister::suboffset(dest, 8), GenRegister::suboffset(src, 16));
+      p->MOV(GenRegister::suboffset(dest, 12), GenRegister::suboffset(src, 24));
+    }
+    p->pop();
+  }
+
+  void GenContext::storeBottomHalf(GenRegister dest, GenRegister src) {
+    int execWidth = p->curr.execWidth;
+    dest = dest.bottom_half();
+    p->push();
+    p->curr.execWidth = 8;
+    p->MOV(dest, src);
+    p->curr.nibControl = 1;
+    p->MOV(GenRegister::suboffset(dest, 8), GenRegister::suboffset(src, 4));
+    if (execWidth == 16) {
+      p->curr.quarterControl = 1;
+      p->curr.nibControl = 0;
+      p->MOV(GenRegister::suboffset(dest, 16), GenRegister::suboffset(src, 8));
+      p->curr.nibControl = 1;
+      p->MOV(GenRegister::suboffset(dest, 24), GenRegister::suboffset(src, 12));
+    }
+    p->pop();
+  }
+
+  void GenContext::addWithCarry(GenRegister dest, GenRegister src0, GenRegister src1) {
+    int execWidth = p->curr.execWidth;
+    GenRegister acc0 = GenRegister::retype(GenRegister::acc(), GEN_TYPE_D);
+    p->push();
+    p->curr.predicate = GEN_PREDICATE_NONE;
+    p->curr.execWidth = 8;
+    p->ADDC(dest, src0, src1);
+    p->MOV(src1, acc0);
+    if (execWidth == 16) {
+      p->ADDC(GenRegister::suboffset(dest, 8),
+              GenRegister::suboffset(src0, 8),
+              GenRegister::suboffset(src1, 8));
+      p->MOV(GenRegister::suboffset(src1, 8), acc0);
+    }
+    p->pop();    
+  }
+
+  void GenContext::subWithBorrow(GenRegister dest, GenRegister src0, GenRegister src1) {
+    int execWidth = p->curr.execWidth;
+    GenRegister acc0 = GenRegister::retype(GenRegister::acc(), GEN_TYPE_D);
+    p->push();
+    p->curr.predicate = GEN_PREDICATE_NONE;
+    p->curr.execWidth = 8;
+    p->SUBB(dest, src0, src1);
+    p->MOV(src1, acc0);
+    if (execWidth == 16) {
+      p->SUBB(GenRegister::suboffset(dest, 8),
+              GenRegister::suboffset(src0, 8),
+              GenRegister::suboffset(src1, 8));
+      p->MOV(GenRegister::suboffset(src1, 8), acc0);
+    }
+    p->pop();    
+  }
+
   void GenContext::emitTernaryInstruction(const SelectionInstruction &insn) {
     const GenRegister dst = ra->genReg(insn.dst(0));
     const GenRegister src0 = ra->genReg(insn.src(0));
     const GenRegister src1 = ra->genReg(insn.src(1));
     const GenRegister src2 = ra->genReg(insn.src(2));
     switch (insn.opcode) {
+      case SEL_OP_I64ADD:
+        {
+          GenRegister x = GenRegister::retype(src2, GEN_TYPE_UD),
+                      y = GenRegister::suboffset(x, p->curr.execWidth);
+          loadBottomHalf(x, src0);
+          loadBottomHalf(y, src1);
+          addWithCarry(x, x, y);
+          storeBottomHalf(dst, x);
+          loadTopHalf(x, src0);
+          p->ADD(x, x, y);
+          loadTopHalf(y, src1);
+          p->ADD(x, x, y);
+          storeTopHalf(dst, x);
+        }
+        break;
+      case SEL_OP_I64SUB:
+        {
+          GenRegister x = GenRegister::retype(src2, GEN_TYPE_UD),
+                      y = GenRegister::suboffset(x, p->curr.execWidth);
+          loadBottomHalf(x, src0);
+          loadBottomHalf(y, src1);
+          subWithBorrow(x, x, y);
+          storeBottomHalf(dst, x);
+          loadTopHalf(x, src0);
+          subWithBorrow(x, x, y);
+          loadTopHalf(y, src1);
+          subWithBorrow(x, x, y);
+          storeTopHalf(dst, x);
+        }
+        break;
       case SEL_OP_MUL_HI:
        {
         int w = p->curr.execWidth;
diff --git a/backend/src/backend/gen_context.hpp b/backend/src/backend/gen_context.hpp
index 82d41b6..dc5dc45 100644
--- a/backend/src/backend/gen_context.hpp
+++ b/backend/src/backend/gen_context.hpp
@@ -27,6 +27,7 @@
 
 #include "backend/context.hpp"
 #include "backend/program.h"
+#include "backend/gen_register.hpp"
 #include "ir/function.hpp"
 #include "ir/liveness.hpp"
 #include "sys/map.hpp"
@@ -73,6 +74,16 @@ namespace gbe
     INLINE const ir::Liveness::LiveOut &getLiveOut(const ir::BasicBlock *bb) const {
       return this->liveness->getLiveOut(bb);
     }
+
+    void loadTopHalf(GenRegister dest, GenRegister src);
+    void storeTopHalf(GenRegister dest, GenRegister src);
+
+    void loadBottomHalf(GenRegister dest, GenRegister src);
+    void storeBottomHalf(GenRegister dest, GenRegister src);
+
+    void addWithCarry(GenRegister dest, GenRegister src0, GenRegister src1);
+    void subWithBorrow(GenRegister dest, GenRegister src0, GenRegister src1);
+
     /*! Final Gen ISA emission helper functions */
     void emitLabelInstruction(const SelectionInstruction &insn);
     void emitUnaryInstruction(const SelectionInstruction &insn);
diff --git a/backend/src/backend/gen_defs.hpp b/backend/src/backend/gen_defs.hpp
index 5a9bb2d..5b15e30 100644
--- a/backend/src/backend/gen_defs.hpp
+++ b/backend/src/backend/gen_defs.hpp
@@ -157,6 +157,7 @@ enum opcode {
   GEN_OPCODE_FBH = 75,
   GEN_OPCODE_FBL = 76,
   GEN_OPCODE_ADDC = 78,
+  GEN_OPCODE_SUBB = 79,
   GEN_OPCODE_SAD2 = 80,
   GEN_OPCODE_SADA2 = 81,
   GEN_OPCODE_DP4 = 84,
@@ -242,6 +243,8 @@ enum GenMessageTarget {
 #define GEN_TYPE_V   6 /* packed int vector, immediates only, uword dest only */
 #define GEN_TYPE_DF  6
 #define GEN_TYPE_F   7
+#define GEN_TYPE_UL  8
+#define GEN_TYPE_L   9
 
 #define GEN_ARF_NULL                  0x00
 #define GEN_ARF_ADDRESS               0x10
diff --git a/backend/src/backend/gen_encoder.cpp b/backend/src/backend/gen_encoder.cpp
index 70e542f..64b5bd1 100644
--- a/backend/src/backend/gen_encoder.cpp
+++ b/backend/src/backend/gen_encoder.cpp
@@ -882,6 +882,13 @@ namespace gbe
   ALU2(MACH)
   ALU3(MAD)
 
+  void GenEncoder::SUBB(GenRegister dest, GenRegister src0, GenRegister src1) {
+    push();
+    curr.accWrEnable = 1;
+    alu2(this, GEN_OPCODE_SUBB, dest, src0, src1);
+    pop();
+  }
+
   void GenEncoder::ADDC(GenRegister dest, GenRegister src0, GenRegister src1) {
     push();
     curr.accWrEnable = 1;
diff --git a/backend/src/backend/gen_encoder.hpp b/backend/src/backend/gen_encoder.hpp
index 054c343..083bd8c 100644
--- a/backend/src/backend/gen_encoder.hpp
+++ b/backend/src/backend/gen_encoder.hpp
@@ -92,6 +92,7 @@ namespace gbe
     ALU1(MOV)
     ALU1(FBH)
     ALU1(FBL)
+    ALU2(SUBB)
     ALU2(UPSAMPLE_SHORT)
     ALU2(UPSAMPLE_INT)
     ALU1(RNDZ)
diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp
index f3baa6d..eca62b4 100644
--- a/backend/src/backend/gen_insn_selection.cpp
+++ b/backend/src/backend/gen_insn_selection.cpp
@@ -128,6 +128,8 @@ namespace gbe
       case TYPE_U16: return GEN_TYPE_UW;
       case TYPE_S32: return GEN_TYPE_D;
       case TYPE_U32: return GEN_TYPE_UD;
+      case TYPE_S64: return GEN_TYPE_L;
+      case TYPE_U64: return GEN_TYPE_UL;
       case TYPE_FLOAT: return GEN_TYPE_F;
       case TYPE_DOUBLE: return GEN_TYPE_DF;
       default: NOT_SUPPORTED; return GEN_TYPE_F;
@@ -426,6 +428,8 @@ namespace gbe
     ALU2(RSL)
     ALU2(ASR)
     ALU2(ADD)
+    ALU3(I64ADD)
+    ALU3(I64SUB)
     ALU2(MUL)
     ALU1(FRC)
     ALU1(RNDD)
@@ -1193,7 +1197,7 @@ namespace gbe
     using namespace ir;
     const auto &childInsn = cast<LoadImmInstruction>(insn);
     const auto &imm = childInsn.getImmediate();
-    if(imm.type != TYPE_DOUBLE)
+    if(imm.type != TYPE_DOUBLE && imm.type != TYPE_S64 && imm.type != TYPE_U64)
       return true;
     return false;
   }
@@ -1416,7 +1420,13 @@ namespace gbe
 
       // Output the binary instruction
       switch (opcode) {
-        case OP_ADD: sel.ADD(dst, src0, src1); break;
+        case OP_ADD:
+          if (type == Type::TYPE_U64 || type == Type::TYPE_S64) {
+            GenRegister t = sel.selReg(sel.reg(RegisterFamily::FAMILY_QWORD), Type::TYPE_S64);
+            sel.I64ADD(dst, src0, src1, t);
+          } else
+            sel.ADD(dst, src0, src1);
+          break;
         case OP_ADDSAT:
           sel.push();
             sel.curr.saturate = GEN_MATH_SATURATE_SATURATE;
@@ -1426,7 +1436,13 @@ namespace gbe
         case OP_XOR: sel.XOR(dst, src0, src1); break;
         case OP_OR:  sel.OR(dst, src0,  src1); break;
         case OP_AND: sel.AND(dst, src0, src1); break;
-        case OP_SUB: sel.ADD(dst, src0, GenRegister::negate(src1)); break;
+        case OP_SUB:
+          if (type == Type::TYPE_U64 || type == Type::TYPE_S64) {
+            GenRegister t = sel.selReg(sel.reg(RegisterFamily::FAMILY_QWORD), Type::TYPE_S64);
+            sel.I64SUB(dst, src0, src1, t);
+          } else
+            sel.ADD(dst, src0, GenRegister::negate(src1));
+          break;
         case OP_SUBSAT:
           sel.push();
             sel.curr.saturate = GEN_MATH_SATURATE_SATURATE;
diff --git a/backend/src/backend/gen_insn_selection.hxx b/backend/src/backend/gen_insn_selection.hxx
index c6aede5..f2b86c4 100644
--- a/backend/src/backend/gen_insn_selection.hxx
+++ b/backend/src/backend/gen_insn_selection.hxx
@@ -19,6 +19,8 @@ DECL_SELECTION_IR(RSR, BinaryInstruction)
 DECL_SELECTION_IR(RSL, BinaryInstruction)
 DECL_SELECTION_IR(ASR, BinaryInstruction)
 DECL_SELECTION_IR(ADD, BinaryInstruction)
+DECL_SELECTION_IR(I64ADD, TernaryInstruction)
+DECL_SELECTION_IR(I64SUB, TernaryInstruction)
 DECL_SELECTION_IR(MUL, BinaryInstruction)
 DECL_SELECTION_IR(ATOMIC, AtomicInstruction)
 DECL_SELECTION_IR(MACH, BinaryInstruction)
diff --git a/backend/src/backend/gen_register.hpp b/backend/src/backend/gen_register.hpp
index 7e48837..fda2e6c 100644
--- a/backend/src/backend/gen_register.hpp
+++ b/backend/src/backend/gen_register.hpp
@@ -70,6 +70,8 @@ namespace gbe
   INLINE int typeSize(uint32_t type) {
     switch(type) {
       case GEN_TYPE_DF:
+      case GEN_TYPE_UL:
+      case GEN_TYPE_L:
         return 8;
       case GEN_TYPE_UD:
       case GEN_TYPE_D:
@@ -222,12 +224,32 @@ namespace gbe
       return r;
     }
 
+    INLINE bool isint64(void) const {
+      if ((type == GEN_TYPE_UL || type == GEN_TYPE_L) && file == GEN_GENERAL_REGISTER_FILE)
+        return true;
+      return false;
+    }
+
     INLINE bool isimmdf(void) const {
       if (type == GEN_TYPE_DF && file == GEN_IMMEDIATE_VALUE)
         return true;
       return false;
     }
 
+    INLINE GenRegister top_half(void) const {
+      GenRegister r = bottom_half();
+      r.subnr += 4;
+      return r;
+    }
+
+    INLINE GenRegister bottom_half(void) const {
+      GBE_ASSERT(isint64());
+      GenRegister r = *this;
+      r.type = type == GEN_TYPE_UL ? GEN_TYPE_UD : GEN_TYPE_D;
+      r.hstride = GEN_HORIZONTAL_STRIDE_2;
+      return r;
+    }
+
     INLINE bool isdf(void) const {
       if (type == GEN_TYPE_DF && file == GEN_GENERAL_REGISTER_FILE)
         return true;
diff --git a/kernels/compiler_long.cl b/kernels/compiler_long.cl
new file mode 100644
index 0000000..3087292
--- /dev/null
+++ b/kernels/compiler_long.cl
@@ -0,0 +1,7 @@
+kernel void compiler_long(global long *src1, global long *src2, global long *dst) {
+  int i = get_global_id(0);
+  if(i < 5)
+    dst[i] = src1[i] + src2[i];
+  if(i > 5)
+    dst[i] = src1[i] - src2[i];
+}
diff --git a/utests/CMakeLists.txt b/utests/CMakeLists.txt
index be565ea..a0dafa2 100644
--- a/utests/CMakeLists.txt
+++ b/utests/CMakeLists.txt
@@ -121,6 +121,7 @@ set (utests_sources
   compiler_double_2.cpp
   compiler_double_3.cpp
   compiler_double_4.cpp
+  compiler_long.cpp
   utest_assert.cpp
   utest.cpp
   utest_file_map.cpp
diff --git a/utests/compiler_long.cpp b/utests/compiler_long.cpp
new file mode 100644
index 0000000..fad2744
--- /dev/null
+++ b/utests/compiler_long.cpp
@@ -0,0 +1,58 @@
+#include <cstdint>
+#include <cstring>
+#include <iostream>
+#include "utest_helper.hpp"
+
+void compiler_long(void)
+{
+  const size_t n = 16;
+  int64_t src1[n], src2[n];
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("compiler_long");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(int64_t), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(int64_t), NULL);
+  OCL_CREATE_BUFFER(buf[2], 0, n * sizeof(int64_t), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+  globals[0] = n;
+  locals[0] = 16;
+
+  // Run random tests
+  src1[0] = -1L,                  src2[0] = -1L;
+  src1[1] = 0x8000000000000000UL, src2[1] = 0x8000000000000000UL;
+  src1[2] = 0x7FFFFFFFFFFFFFFFL,  src2[2] = 1L;
+  src1[3] = 0xFFFFFFFEL,          src2[3] = 1L;
+  src1[4] = 0x7FFFFFFFL,          src2[4] = 0x80000000L;
+  src1[5] = 0,                    src2[5] = 0;
+  src1[6] = 0,                    src2[6] = 1;
+  src1[7] = -2L,                  src2[7] = -1L;
+  src1[8] = 0,                    src2[8] = 0x8000000000000000UL;
+  for (int32_t i = 9; i < (int32_t) n; ++i) {
+    src1[i] = ((long)rand() << 32) + rand();
+    src2[i] = ((long)rand() << 32) + rand();
+  }
+  OCL_MAP_BUFFER(0);
+  OCL_MAP_BUFFER(1);
+  memcpy(buf_data[0], src1, sizeof(src1));
+  memcpy(buf_data[1], src2, sizeof(src2));
+  OCL_UNMAP_BUFFER(0);
+  OCL_UNMAP_BUFFER(1);
+
+  // Run the kernel on GPU
+  OCL_NDRANGE(1);
+
+  // Compare
+  OCL_MAP_BUFFER(2);
+  for (int32_t i = 0; i < (int32_t) n; ++i) {
+    //printf("%lx\n", ((int64_t *)buf_data[2])[i]);
+    if (i < 5)
+      OCL_ASSERT(src1[i] + src2[i] == ((int64_t *)buf_data[2])[i]);
+    if (i > 5)
+      OCL_ASSERT(src1[i] - src2[i] == ((int64_t *)buf_data[2])[i]);
+  }
+  OCL_UNMAP_BUFFER(2);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_long);
-- 
1.8.1.2



More information about the Beignet mailing list