[Beignet] [PATCH] Fit the printf bug in loop

junyan.he at inbox.com junyan.he at inbox.com
Mon Oct 13 00:57:06 PDT 2014


From: Junyan He <junyan.he at linux.intel.com>

The static analyse for printf can not totally work
when the printf inst is within the loop and the loop
can not be unrolled. This causes the printf just to
print one info for a loop and to lose all the others.
We now increment the exec number every time the printf
inst is triggered. The number is stored for output all
the message later.
The problem is that we can not caculate the exact loops
number for each printf inst. The wrong loop number will
cause the data overwritten. We now assume all the printf
inst are in loop and store the data like this:
| PRINTF1_DATA  PRINTF2_DATA ... | PRINTF1_DATA  PRINTF2_DATA ... | ...
|       DATA_LOOP_ONE            |          DATA_LOOP_TWO         | ...
Although this may cause some space wasted.

Another problem is that we need to decide the size of printf buffer
because the loop upbound can not be caculated. We just set
it yo 1M for small info slot request and 4M for big one.

Signed-off-by: Junyan He <junyan.he at linux.intel.com>
---
 backend/src/ir/printf.cpp               |   13 +-
 backend/src/llvm/llvm_printf_parser.cpp |  338 ++++++++++++++++++++-----------
 kernels/test_printf.cl                  |   27 ++-
 src/cl_command_queue_gen7.c             |   10 +-
 4 files changed, 256 insertions(+), 132 deletions(-)

diff --git a/backend/src/ir/printf.cpp b/backend/src/ir/printf.cpp
index e99aad5..093bfc6 100644
--- a/backend/src/ir/printf.cpp
+++ b/backend/src/ir/printf.cpp
@@ -106,8 +106,9 @@ namespace gbe
       if (!vec_i)                                                       \
         pf_str = pf_str + std::string(#conv);                           \
       printf(pf_str.c_str(),                                            \
-             ((target_ty *)((char *)buf_addr + slot.state->out_buf_sizeof_offset * \
-                            global_wk_sz0 * global_wk_sz1 * global_wk_sz2)) \
+             ((target_ty *)((char *)buf_addr + sizeOfSize * global_wk_sz0 * global_wk_sz1 * global_wk_sz2 * n \
+                                              + slot.state->out_buf_sizeof_offset * \
+                                                         global_wk_sz0 * global_wk_sz1 * global_wk_sz2)) \
              [(k*global_wk_sz0*global_wk_sz1 + j*global_wk_sz0 + i) * vec_num + vec_i]);\
     } while (0)
 
@@ -124,10 +125,9 @@ namespace gbe
         for (i = 0; i < global_wk_sz0; i++) {
           for (j = 0; j < global_wk_sz1; j++) {
             for (k = 0; k < global_wk_sz2; k++) {
-
-              int flag = ((int *)index_addr)[stmt*global_wk_sz0*global_wk_sz1*global_wk_sz2
-                                             + k*global_wk_sz0*global_wk_sz1 + j*global_wk_sz0 + i];
-              if (flag) {
+              int loop_num = ((int *)index_addr)[stmt*global_wk_sz0*global_wk_sz1*global_wk_sz2
+                                                 + k*global_wk_sz0*global_wk_sz1 + j*global_wk_sz0 + i];
+              for (int n = 0; n < loop_num; n++) {
                 for (auto &slot : pf) {
                   pf_str = "";
                   int vec_num;
@@ -225,6 +225,7 @@ namespace gbe
 
                   pf_str = "";
                 }
+
               }
             }
           }
diff --git a/backend/src/llvm/llvm_printf_parser.cpp b/backend/src/llvm/llvm_printf_parser.cpp
index 29684ba..41cb722 100644
--- a/backend/src/llvm/llvm_printf_parser.cpp
+++ b/backend/src/llvm/llvm_printf_parser.cpp
@@ -330,9 +330,17 @@ error:
     Type* intTy;
     Value* pbuf_ptr;
     Value* index_buf_ptr;
+    Value* g1Xg2Xg3;
+    Value* wg_offset;
     int out_buf_sizeof_offset;
     static map<CallInst*, PrintfSet::PrintfFmt*> printfs;
     int printf_num;
+    int totalSizeofSize;
+
+    struct PrintfParserInfo {
+      llvm::CallInst* call;
+      PrintfSet::PrintfFmt* printf_fmt;
+    };
 
     PrintfParser(void) : FunctionPass(ID)
     {
@@ -343,7 +351,10 @@ error:
       printfs.clear();
       pbuf_ptr = NULL;
       index_buf_ptr = NULL;
+      g1Xg2Xg3 = NULL;
+      wg_offset = NULL;
       printf_num = 0;
+      totalSizeofSize = 0;
     }
 
     ~PrintfParser(void)
@@ -355,9 +366,9 @@ error:
       printfs.clear();
     }
 
-
-    bool parseOnePrintfInstruction(CallInst *& call);
+    bool parseOnePrintfInstruction(CallInst * call, PrintfParserInfo& info, int& sizeof_size);
     bool generateOneParameterInst(PrintfSlot& slot, Value*& arg, Type*& dst_type, int& sizeof_size);
+    bool generateOnePrintfInstruction(PrintfParserInfo& pInfo);
 
     virtual const char *getPassName() const
     {
@@ -367,119 +378,38 @@ error:
     virtual bool runOnFunction(llvm::Function &F);
   };
 
-  bool PrintfParser::parseOnePrintfInstruction(CallInst *& call)
+  bool PrintfParser::generateOnePrintfInstruction(PrintfParserInfo& pInfo)
   {
-    CallSite CS(call);
-    CallSite::arg_iterator CI_FMT = CS.arg_begin();
-    int param_num = 0;
-
-    llvm::Constant* arg0 = dyn_cast<llvm::ConstantExpr>(*CI_FMT);
-    llvm::Constant* arg0_ptr = dyn_cast<llvm::Constant>(arg0->getOperand(0));
-    if (!arg0_ptr) {
-      return false;
-    }
-
-    ConstantDataSequential* fmt_arg = dyn_cast<ConstantDataSequential>(arg0_ptr->getOperand(0));
-    if (!fmt_arg || !fmt_arg->isCString()) {
-      return false;
-    }
-
-    std::string fmt = fmt_arg->getAsCString();
-
-    PrintfSet::PrintfFmt* printf_fmt = NULL;
-
-    if (!(printf_fmt = parser_printf_fmt((char *)fmt.c_str(), param_num))) {//at lease print something
-      return false;
-    }
-
-    /* iff parameter more than %, error. */
-    /* str_fmt arg0 arg1 ... NULL */
-    if (param_num + 2 < static_cast<int>(call->getNumOperands())) {
-      delete printf_fmt;
-      return false;
-    }
-
-    /* FIXME: Because the OpenCL language do not support va macro, and we do not want
-       to introduce the va_list, va_start and va_end into our code, we just simulate
-       the function calls to caculate the offset caculation here. */
-#define BUILD_CALL_INST(name) \
-    CallInst* name = builder->CreateCall(cast<llvm::Function>(module->getOrInsertFunction( \
-                             "__gen_ocl_get_"#name,                                         \
-                             IntegerType::getInt32Ty(module->getContext()),                 \
-                             NULL)))
-
-    BUILD_CALL_INST(group_id2);
-    BUILD_CALL_INST(group_id1);
-    BUILD_CALL_INST(group_id0);
-    BUILD_CALL_INST(global_size2);
-    BUILD_CALL_INST(global_size1);
-    BUILD_CALL_INST(global_size0);
-    BUILD_CALL_INST(local_id2);
-    BUILD_CALL_INST(local_id1);
-    BUILD_CALL_INST(local_id0);
-    BUILD_CALL_INST(local_size2);
-    BUILD_CALL_INST(local_size1);
-    BUILD_CALL_INST(local_size0);
-
-#undef BUILD_CALL_INST
-
     Value* op0 = NULL;
     Value* val = NULL;
-    /* calculate offset for later usage.
-       offset = ((local_id2 + local_size2 * group_id2) * (global_size1 * global_size0)
-       + (local_id1 + local_size1 * group_id1) * global_size0
-       + (local_id0 + local_size0 * group_id0)) * sizeof(type)  */
-
-    // local_size2 * group_id2
-    val = builder->CreateMul(local_size2, group_id2);
-    // local_id2 + local_size2 * group_id2
-    val = builder->CreateAdd(local_id2, val);
-    // global_size1 * global_size0
-    op0 = builder->CreateMul(global_size1, global_size0);
-    // (local_id2 + local_size2 * group_id2) * (global_size1 * global_size0)
-    Value* offset1 = builder->CreateMul(val, op0);
-    // local_size1 * group_id1
-    val = builder->CreateMul(local_size1, group_id1);
-    // local_id1 + local_size1 * group_id1
-    val = builder->CreateAdd(local_id1, val);
-    // (local_id1 + local_size1 * group_id1) * global_size_0
-    Value* offset2 = builder->CreateMul(val, global_size0);
-    // local_size0 * group_id0
-    val = builder->CreateMul(local_size0, group_id0);
-    // local_id0 + local_size0 * group_id0
-    val = builder->CreateAdd(local_id0, val);
-    // The total sum
-    val = builder->CreateAdd(val, offset1);
-    Value* offset = builder->CreateAdd(val, offset2);
 
     /////////////////////////////////////////////////////
     /* calculate index address.
-       index_addr = (index_offset + offset )* sizeof(int) + index_buf_ptr
+       index_addr = (index_offset + wg_offset )* sizeof(int) + index_buf_ptr
        index_offset = global_size2 * global_size1 * global_size0 * printf_num */
 
-    // global_size2 * global_size1
-    op0 = builder->CreateMul(global_size2, global_size1);
-    // global_size2 * global_size1 * global_size0
-    Value* glXg2Xg3 = builder->CreateMul(op0, global_size0);
-    Value* index_offset = builder->CreateMul(glXg2Xg3, ConstantInt::get(intTy, printf_num));
+    Value* index_offset = builder->CreateMul(g1Xg2Xg3, ConstantInt::get(intTy, printf_num));
     // index_offset + offset
-    op0 = builder->CreateAdd(index_offset, offset);
+    op0 = builder->CreateAdd(index_offset, wg_offset);
     // (index_offset + offset)* sizeof(int)
     op0 = builder->CreateMul(op0, ConstantInt::get(intTy, sizeof(int)));
     // Final index address = index_buf_ptr + (index_offset + offset)* sizeof(int)
     op0 = builder->CreateAdd(index_buf_ptr, op0);
     Value* index_addr = builder->CreateIntToPtr(op0, Type::getInt32PtrTy(module->getContext(), 1));
-    builder->CreateStore(ConstantInt::get(intTy, 1), index_addr);// The flag
+    // Load the printf num first, printf may be in loop.
+    Value* loop_num = builder->CreateLoad(index_addr);
+    val = builder->CreateAdd(loop_num, ConstantInt::get(intTy, 1));
+    builder->CreateStore(val, index_addr);// The loop number.
 
     int i = 1;
     Value* data_addr = NULL;
-    for (auto &s : *printf_fmt) {
+    for (auto &s : *pInfo.printf_fmt) {
       if (s.type == PRINTF_SLOT_TYPE_STRING)
         continue;
 
-      assert(i < static_cast<int>(call->getNumOperands()) - 1);
+      assert(i < static_cast<int>(pInfo.call->getNumOperands()) - 1);
 
-      Value *out_arg = call->getOperand(i);
+      Value *out_arg = pInfo.call->getOperand(i);
       Type *dst_type = NULL;
       int sizeof_size = 0;
       if (!generateOneParameterInst(s, out_arg, dst_type, sizeof_size)) {
@@ -499,16 +429,23 @@ error:
 
       /////////////////////////////////////////////////////
       /* Calculate the data address.
-      data_addr = data_offset + pbuf_ptr + offset * sizeof(specify)
+      data_addr = (data_offset + pbuf_ptr + offset * sizeof(specify)) +
+               totalSizeofSize * global_size2 * global_size1 * global_size0 * loop_num
       data_offset = global_size2 * global_size1 * global_size0 * out_buf_sizeof_offset
 
       //global_size2 * global_size1 * global_size0 * out_buf_sizeof_offset */
-      op0 = builder->CreateMul(glXg2Xg3, ConstantInt::get(intTy, out_buf_sizeof_offset));
+      op0 = builder->CreateMul(g1Xg2Xg3, ConstantInt::get(intTy, out_buf_sizeof_offset));
       //offset * sizeof(specify)
-      val = builder->CreateMul(offset, ConstantInt::get(intTy, sizeof_size));
+      val = builder->CreateMul(wg_offset, ConstantInt::get(intTy, sizeof_size));
       //data_offset + pbuf_ptr
       op0 = builder->CreateAdd(pbuf_ptr, op0);
       op0 = builder->CreateAdd(op0, val);
+      //totalSizeofSize * global_size2 * global_size1 * global_size0
+      val = builder->CreateMul(g1Xg2Xg3, ConstantInt::get(intTy, totalSizeofSize));
+      //totalSizeofSize * global_size2 * global_size1 * global_size0 * loop_num
+      val = builder->CreateMul(val, loop_num);
+      //final
+      op0 = builder->CreateAdd(op0, val);
       data_addr = builder->CreateIntToPtr(op0, dst_type);
       builder->CreateStore(out_arg, data_addr);
 
@@ -520,14 +457,101 @@ error:
                               "__gen_ocl_printf", Type::getVoidTy(module->getContext()),
                               NULL)));
     assert(printfs[printf_inst] == NULL);
-    printfs[printf_inst] = printf_fmt;
+    printfs[printf_inst] = pInfo.printf_fmt;
     printf_num++;
     return true;
   }
 
+  bool PrintfParser::parseOnePrintfInstruction(CallInst * call, PrintfParserInfo& info, int& sizeof_size)
+  {
+    CallSite CS(call);
+    CallSite::arg_iterator CI_FMT = CS.arg_begin();
+    int param_num = 0;
+
+    llvm::Constant* arg0 = dyn_cast<llvm::ConstantExpr>(*CI_FMT);
+    llvm::Constant* arg0_ptr = dyn_cast<llvm::Constant>(arg0->getOperand(0));
+    if (!arg0_ptr) {
+      return false;
+    }
+
+    ConstantDataSequential* fmt_arg = dyn_cast<ConstantDataSequential>(arg0_ptr->getOperand(0));
+    if (!fmt_arg || !fmt_arg->isCString()) {
+      return false;
+    }
+
+    std::string fmt = fmt_arg->getAsCString();
+
+    PrintfSet::PrintfFmt* printf_fmt = NULL;
+
+    if (!(printf_fmt = parser_printf_fmt((char *)fmt.c_str(), param_num))) {//at lease print something
+      return false;
+    }
+
+    /* iff parameter more than %, error. */
+    /* str_fmt arg0 arg1 ... NULL */
+    if (param_num + 2 < static_cast<int>(call->getNumOperands())) {
+      delete printf_fmt;
+      return false;
+    }
+
+    info.call = call;
+    info.printf_fmt = printf_fmt;
+
+    sizeof_size = 0;
+    int i = 1;
+    for (auto &s : *printf_fmt) {
+      int sz = 0;
+      if (s.type == PRINTF_SLOT_TYPE_STRING)
+        continue;
+
+      assert(i < static_cast<int>(call->getNumOperands()) - 1);
+
+      switch (s.state->conversion_specifier) {
+        case PRINTF_CONVERSION_I:
+        case PRINTF_CONVERSION_D:
+        case PRINTF_CONVERSION_O:
+        case PRINTF_CONVERSION_U:
+        case PRINTF_CONVERSION_x:
+        case PRINTF_CONVERSION_X:
+        case PRINTF_CONVERSION_P:
+          if (s.state->length_modifier == PRINTF_LM_L)
+            sz = sizeof(int64_t);
+          else
+            sz = sizeof(int);
+          break;
+        case PRINTF_CONVERSION_C:
+          sz = sizeof(char);
+          break;
+        case PRINTF_CONVERSION_F:
+        case PRINTF_CONVERSION_f:
+        case PRINTF_CONVERSION_E:
+        case PRINTF_CONVERSION_e:
+        case PRINTF_CONVERSION_G:
+        case PRINTF_CONVERSION_g:
+        case PRINTF_CONVERSION_A:
+        case PRINTF_CONVERSION_a:
+          sz = sizeof(float);
+          break;
+        default:
+          sz = 0;
+          break;
+      }
+
+      if (s.state->vector_n) {
+        sz = sz * s.state->vector_n;
+      }
+
+      sizeof_size += ((sz + 3) / 4) * 4;
+    }
+
+    return true;
+  }
+
+
   bool PrintfParser::runOnFunction(llvm::Function &F)
   {
     bool changed = false;
+    bool hasPrintf = false;
     switch (F.getCallingConv()) {
 #if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR <= 2
       case CallingConv::PTX_Device:
@@ -541,6 +565,8 @@ error:
         GBE_ASSERTM(false, "Unsupported calling convention");
     }
 
+    std::vector<PrintfParserInfo> infoVect;
+    totalSizeofSize = 0;
     module = F.getParent();
     intTy = IntegerType::get(module->getContext(), 32);
 
@@ -550,11 +576,14 @@ error:
 
     builder = new IRBuilder<>(module->getContext());
 
-    /* Iter the function and find printf. */
+    /* First find printfs and caculate all slots size of one loop. */
     for (llvm::Function::iterator B = F.begin(), BE = F.end(); B != BE; B++) {
       for (BasicBlock::iterator instI = B->begin(),
            instE = B->end(); instI != instE; ++instI) {
 
+        PrintfParserInfo pInfo;
+        int sizeof_size = 0;
+
         llvm::CallInst* call = dyn_cast<llvm::CallInst>(instI);
         if (!call) {
           continue;
@@ -569,26 +598,107 @@ error:
         if (fnName != "__gen_ocl_printf_stub")
           continue;
 
-        changed = true;
-
-        builder->SetInsertPoint(call);
-
-        if (!pbuf_ptr) {
-          /* alloc a new buffer ptr to collect the print output. */
-          Type *ptrTy = Type::getInt32PtrTy(module->getContext());
-          llvm::Constant * pBuf = module->getOrInsertGlobal(StringRef("__gen_ocl_printf_buf"), ptrTy);
-          pbuf_ptr = builder->CreatePtrToInt(pBuf, Type::getInt32Ty(module->getContext()));
-        }
-        if (!index_buf_ptr) {
-          Type *ptrTy = Type::getInt32PtrTy(module->getContext());
-          llvm::Constant * pBuf = module->getOrInsertGlobal(StringRef("__gen_ocl_printf_index_buf"), ptrTy);
-          index_buf_ptr = builder->CreatePtrToInt(pBuf, Type::getInt32Ty(module->getContext()));
+        if (!parseOnePrintfInstruction(call, pInfo, sizeof_size)) {
+          printf("Parse One printf inst failed, may have some error\n");
+          // Just kill this printf instruction.
+          deadprintfs.push_back(PrintfInst(cast<Instruction>(call),0));
+          continue;
         }
 
-        deadprintfs.push_back(PrintfInst(cast<Instruction>(call),parseOnePrintfInstruction(call)));
+        hasPrintf = true;
+
+        infoVect.push_back(pInfo);
+        totalSizeofSize += sizeof_size;
       }
     }
 
+    if (!hasPrintf)
+      return changed;
+
+    if (!pbuf_ptr) {
+      /* alloc a new buffer ptr to collect the print output. */
+      Type *ptrTy = Type::getInt32PtrTy(module->getContext());
+      llvm::Constant * pBuf = module->getOrInsertGlobal(StringRef("__gen_ocl_printf_buf"), ptrTy);
+      pbuf_ptr = builder->CreatePtrToInt(pBuf, Type::getInt32Ty(module->getContext()));
+    }
+    if (!index_buf_ptr) {
+      Type *ptrTy = Type::getInt32PtrTy(module->getContext());
+      llvm::Constant * pBuf = module->getOrInsertGlobal(StringRef("__gen_ocl_printf_index_buf"), ptrTy);
+      index_buf_ptr = builder->CreatePtrToInt(pBuf, Type::getInt32Ty(module->getContext()));
+    }
+
+    if (!wg_offset || !g1Xg2Xg3) {
+      Value* op0 = NULL;
+      Value* val = NULL;
+
+      builder->SetInsertPoint(F.begin()->begin());// Insert the common var in the begin.
+
+      /* FIXME: Because the OpenCL language do not support va macro, and we do not want
+         to introduce the va_list, va_start and va_end into our code, we just simulate
+         the function calls to caculate the offset caculation here. */
+#define BUILD_CALL_INST(name) \
+	CallInst* name = builder->CreateCall(cast<llvm::Function>(module->getOrInsertFunction( \
+				 "__gen_ocl_get_"#name, 					\
+				 IntegerType::getInt32Ty(module->getContext()), 		\
+				 NULL)))
+
+      BUILD_CALL_INST(group_id2);
+      BUILD_CALL_INST(group_id1);
+      BUILD_CALL_INST(group_id0);
+      BUILD_CALL_INST(global_size2);
+      BUILD_CALL_INST(global_size1);
+      BUILD_CALL_INST(global_size0);
+      BUILD_CALL_INST(local_id2);
+      BUILD_CALL_INST(local_id1);
+      BUILD_CALL_INST(local_id0);
+      BUILD_CALL_INST(local_size2);
+      BUILD_CALL_INST(local_size1);
+      BUILD_CALL_INST(local_size0);
+
+#undef BUILD_CALL_INST
+
+      /* calculate offset for later usage.
+         wg_offset = ((local_id2 + local_size2 * group_id2) * (global_size1 * global_size0)
+         + (local_id1 + local_size1 * group_id1) * global_size0
+         + (local_id0 + local_size0 * group_id0))  */
+
+      // local_size2 * group_id2
+      val = builder->CreateMul(local_size2, group_id2);
+      // local_id2 + local_size2 * group_id2
+      val = builder->CreateAdd(local_id2, val);
+      // global_size1 * global_size0
+      op0 = builder->CreateMul(global_size1, global_size0);
+      // (local_id2 + local_size2 * group_id2) * (global_size1 * global_size0)
+      Value* offset1 = builder->CreateMul(val, op0);
+      // local_size1 * group_id1
+      val = builder->CreateMul(local_size1, group_id1);
+      // local_id1 + local_size1 * group_id1
+      val = builder->CreateAdd(local_id1, val);
+      // (local_id1 + local_size1 * group_id1) * global_size_0
+      Value* offset2 = builder->CreateMul(val, global_size0);
+      // local_size0 * group_id0
+      val = builder->CreateMul(local_size0, group_id0);
+      // local_id0 + local_size0 * group_id0
+      val = builder->CreateAdd(local_id0, val);
+      // The total sum
+      val = builder->CreateAdd(val, offset1);
+      wg_offset = builder->CreateAdd(val, offset2);
+
+      // global_size2 * global_size1
+      op0 = builder->CreateMul(global_size2, global_size1);
+      // global_size2 * global_size1 * global_size0
+      g1Xg2Xg3 = builder->CreateMul(op0, global_size0);
+    }
+
+
+    /* Now generate the instructions. */
+    for (auto pInfo : infoVect) {
+      builder->SetInsertPoint(pInfo.call);
+      deadprintfs.push_back(PrintfInst(cast<Instruction>(pInfo.call), generateOnePrintfInstruction(pInfo)));
+    }
+
+    assert(out_buf_sizeof_offset == totalSizeofSize);
+
     /* Replace the instruction's operand if using printf's return value. */
     for (llvm::Function::iterator B = F.begin(), BE = F.end(); B != BE; B++) {
       for (BasicBlock::iterator instI = B->begin(),
diff --git a/kernels/test_printf.cl b/kernels/test_printf.cl
index c2844f4..0a59e88 100644
--- a/kernels/test_printf.cl
+++ b/kernels/test_printf.cl
@@ -4,6 +4,8 @@ test_printf(void)
   int x = (int)get_global_id(0);
   int y = (int)get_global_id(1);
   int z = (int)get_global_id(2);
+  int g0 = (int)get_global_size(0);
+  int g1 = (int)get_global_size(1);
   uint a = 'x';
   float f = 5.0f;
   int3 vec;
@@ -14,28 +16,31 @@ test_printf(void)
 
   if (x == 0 && y == 0 && z == 0) {
     printf("--- Welcome to the printf test of %s ---\n", "Intel Beignet");
-
     printf("### output a char is %c\n", a);
-
     printf("@@@ A long value is %ld\n", cc);
   }
 
-  if (x % 15 == 0)
-    if (y % 3 == 0)
-      if (z % 7 == 0)
-        printf("######## global_id(x, y, z) = %v3d, global_size(d0, d1, d3) = (%d, %d, %d)\n",
-                vec, get_global_size(0), get_global_size(1), get_global_size(2));
+  for(int i = 0; i < g0/2; i++)
+    for(int j = 0; j < g1/2; j++)
+      if(x == 0 && y == 0 && z == 0)
+        printf("loops: i = %d, j = %d\n", i, j);
 
-  if (x == 1)
+  if (x == 0) {
     if (y == 0) {
       if (z % 2 == 0)
-          printf("#### output a float is %f\n", f);
+          printf("!!! output a float is %f\n", f);
       else
-          printf("#### output a float to int is %d\n", f);
+          printf("!!! output a float to int is %d\n", f);
     }
+  }
+
+  if (x % 15 == 0)
+    if (y % 3 == 0)
+      if (z % 7 == 0)
+        printf("######## global_id(x, y, z) = %v3d, global_size(d0, d1, d3) = (%d, %d, %d)\n",
+                vec, get_global_size(0), get_global_size(1), get_global_size(2));
 
   if (x == 0 && y == 0 && z == 0) {
     printf("--- End to the printf test ---\n");
   }
-
 }
diff --git a/src/cl_command_queue_gen7.c b/src/cl_command_queue_gen7.c
index 330f0f9..08e6b6e 100644
--- a/src/cl_command_queue_gen7.c
+++ b/src/cl_command_queue_gen7.c
@@ -270,7 +270,15 @@ cl_bind_printf(cl_gpgpu gpgpu, cl_kernel ker, void* printf_info, int printf_num,
 
   value = GBE_CURBE_PRINTF_BUF_POINTER;
   offset = interp_kernel_get_curbe_offset(ker->opaque, value, 0);
-  buf_size = interp_get_printf_sizeof_size(printf_info) * global_sz;
+  buf_size = interp_get_printf_sizeof_size(printf_info) * global_sz;  
+  /* because of the printf may exist in a loop, which loop number can not be gotten by
+     static analysis. So we set the data buffer as big as we can. Out of bound printf
+     info will be discarded. */
+  if (buf_size < 1*1024)
+    buf_size = 1*1024*1024;
+  else
+    buf_size = 4*1024*1024; //at most.
+
   if (offset > 0) {
     if (cl_gpgpu_set_printf_buffer(gpgpu, 1, buf_size, offset, interp_get_printf_buf_bti(printf_info)) != 0)
       return -1;
-- 
1.7.9.5



More information about the Beignet mailing list