[Beignet] [PATCH V2] Fit the printf bug in loop

Tue Oct 14 17:04:05 PDT 2014

LGTM, pushed, thanks.

On Tue, Oct 14, 2014 at 03:52:04PM +0800, junyan.he at inbox.com wrote:
> From: Junyan He <junyan.he at linux.intel.com>
> 
> The static analyse for printf can not totally work
> when the printf inst is within the loop and the loop
> can not be unrolled. This causes the printf just to
> print one info for a loop and to lose all the others.
> We now increment the exec number every time the printf
> inst is triggered. The number is stored for output all
> the message later.
> The problem is that we can not caculate the exact loops
> number for each printf inst. The wrong loop number will
> cause the data overwritten. We now assume all the printf
> inst are in loop and store the data like this:
> | PRINTF1_DATA  PRINTF2_DATA ... | PRINTF1_DATA  PRINTF2_DATA ... | ...
> |       DATA_LOOP_ONE            |          DATA_LOOP_TWO         | ...
> Although this may cause some space wasted.
> 
> Another problem is that we need to decide the size of printf buffer
> because the loop upbound can not be caculated. We just set
> it yo 1M for small info slot request and 4M for big one.
> 
> Signed-off-by: Junyan He <junyan.he at linux.intel.com>
> ---
>  backend/src/ir/printf.cpp               |   13 +-
>  backend/src/llvm/llvm_printf_parser.cpp |  376 +++++++++++++++++++++----------
>  kernels/test_printf.cl                  |   27 ++-
>  src/cl_command_queue_gen7.c             |    8 +
>  4 files changed, 283 insertions(+), 141 deletions(-)
> 
> diff --git a/backend/src/ir/printf.cpp b/backend/src/ir/printf.cpp
> index e99aad5..093bfc6 100644
> --- a/backend/src/ir/printf.cpp
> +++ b/backend/src/ir/printf.cpp
> @@ -106,8 +106,9 @@ namespace gbe
>        if (!vec_i)                                                       \
>          pf_str = pf_str + std::string(#conv);                           \
>        printf(pf_str.c_str(),                                            \
> -             ((target_ty *)((char *)buf_addr + slot.state->out_buf_sizeof_offset * \
> -                            global_wk_sz0 * global_wk_sz1 * global_wk_sz2)) \
> +             ((target_ty *)((char *)buf_addr + sizeOfSize * global_wk_sz0 * global_wk_sz1 * global_wk_sz2 * n \
> +                                              + slot.state->out_buf_sizeof_offset * \
> +                                                         global_wk_sz0 * global_wk_sz1 * global_wk_sz2)) \
>               [(k*global_wk_sz0*global_wk_sz1 + j*global_wk_sz0 + i) * vec_num + vec_i]);\
>      } while (0)
>  
> @@ -124,10 +125,9 @@ namespace gbe
>          for (i = 0; i < global_wk_sz0; i++) {
>            for (j = 0; j < global_wk_sz1; j++) {
>              for (k = 0; k < global_wk_sz2; k++) {
> -
> -              int flag = ((int *)index_addr)[stmt*global_wk_sz0*global_wk_sz1*global_wk_sz2
> -                                             + k*global_wk_sz0*global_wk_sz1 + j*global_wk_sz0 + i];
> -              if (flag) {
> +              int loop_num = ((int *)index_addr)[stmt*global_wk_sz0*global_wk_sz1*global_wk_sz2
> +                                                 + k*global_wk_sz0*global_wk_sz1 + j*global_wk_sz0 + i];
> +              for (int n = 0; n < loop_num; n++) {
>                  for (auto &slot : pf) {
>                    pf_str = "";
>                    int vec_num;
> @@ -225,6 +225,7 @@ namespace gbe
>  
>                    pf_str = "";
>                  }
> +
>                }
>              }
>            }
> diff --git a/backend/src/llvm/llvm_printf_parser.cpp b/backend/src/llvm/llvm_printf_parser.cpp
> index 29684ba..bb6c26d 100644
> --- a/backend/src/llvm/llvm_printf_parser.cpp
> +++ b/backend/src/llvm/llvm_printf_parser.cpp
> @@ -330,9 +330,17 @@ error:
>      Type* intTy;
>      Value* pbuf_ptr;
>      Value* index_buf_ptr;
> +    Value* g1Xg2Xg3;
> +    Value* wg_offset;
>      int out_buf_sizeof_offset;
>      static map<CallInst*, PrintfSet::PrintfFmt*> printfs;
>      int printf_num;
> +    int totalSizeofSize;
> +
> +    struct PrintfParserInfo {
> +      llvm::CallInst* call;
> +      PrintfSet::PrintfFmt* printf_fmt;
> +    };
>  
>      PrintfParser(void) : FunctionPass(ID)
>      {
> @@ -343,7 +351,10 @@ error:
>        printfs.clear();
>        pbuf_ptr = NULL;
>        index_buf_ptr = NULL;
> +      g1Xg2Xg3 = NULL;
> +      wg_offset = NULL;
>        printf_num = 0;
> +      totalSizeofSize = 0;
>      }
>  
>      ~PrintfParser(void)
> @@ -355,9 +366,9 @@ error:
>        printfs.clear();
>      }
>  
> -
> -    bool parseOnePrintfInstruction(CallInst *& call);
> +    bool parseOnePrintfInstruction(CallInst * call, PrintfParserInfo& info, int& sizeof_size);
>      bool generateOneParameterInst(PrintfSlot& slot, Value*& arg, Type*& dst_type, int& sizeof_size);
> +    bool generateOnePrintfInstruction(PrintfParserInfo& pInfo);
>  
>      virtual const char *getPassName() const
>      {
> @@ -367,119 +378,38 @@ error:
>      virtual bool runOnFunction(llvm::Function &F);
>    };
>  
> -  bool PrintfParser::parseOnePrintfInstruction(CallInst *& call)
> +  bool PrintfParser::generateOnePrintfInstruction(PrintfParserInfo& pInfo)
>    {
> -    CallSite CS(call);
> -    CallSite::arg_iterator CI_FMT = CS.arg_begin();
> -    int param_num = 0;
> -
> -    llvm::Constant* arg0 = dyn_cast<llvm::ConstantExpr>(*CI_FMT);
> -    llvm::Constant* arg0_ptr = dyn_cast<llvm::Constant>(arg0->getOperand(0));
> -    if (!arg0_ptr) {
> -      return false;
> -    }
> -
> -    ConstantDataSequential* fmt_arg = dyn_cast<ConstantDataSequential>(arg0_ptr->getOperand(0));
> -    if (!fmt_arg || !fmt_arg->isCString()) {
> -      return false;
> -    }
> -
> -    std::string fmt = fmt_arg->getAsCString();
> -
> -    PrintfSet::PrintfFmt* printf_fmt = NULL;
> -
> -    if (!(printf_fmt = parser_printf_fmt((char *)fmt.c_str(), param_num))) {//at lease print something
> -      return false;
> -    }
> -
> -    /* iff parameter more than %, error. */
> -    /* str_fmt arg0 arg1 ... NULL */
> -    if (param_num + 2 < static_cast<int>(call->getNumOperands())) {
> -      delete printf_fmt;
> -      return false;
> -    }
> -
> -    /* FIXME: Because the OpenCL language do not support va macro, and we do not want
> -       to introduce the va_list, va_start and va_end into our code, we just simulate
> -       the function calls to caculate the offset caculation here. */
> -#define BUILD_CALL_INST(name) \
> -    CallInst* name = builder->CreateCall(cast<llvm::Function>(module->getOrInsertFunction( \
> -                             "__gen_ocl_get_"#name,                                         \
> -                             IntegerType::getInt32Ty(module->getContext()),                 \
> -                             NULL)))
> -
> -    BUILD_CALL_INST(group_id2);
> -    BUILD_CALL_INST(group_id1);
> -    BUILD_CALL_INST(group_id0);
> -    BUILD_CALL_INST(global_size2);
> -    BUILD_CALL_INST(global_size1);
> -    BUILD_CALL_INST(global_size0);
> -    BUILD_CALL_INST(local_id2);
> -    BUILD_CALL_INST(local_id1);
> -    BUILD_CALL_INST(local_id0);
> -    BUILD_CALL_INST(local_size2);
> -    BUILD_CALL_INST(local_size1);
> -    BUILD_CALL_INST(local_size0);
> -
> -#undef BUILD_CALL_INST
> -
>      Value* op0 = NULL;
>      Value* val = NULL;
> -    /* calculate offset for later usage.
> -       offset = ((local_id2 + local_size2 * group_id2) * (global_size1 * global_size0)
> -       + (local_id1 + local_size1 * group_id1) * global_size0
> -       + (local_id0 + local_size0 * group_id0)) * sizeof(type)  */
> -
> -    // local_size2 * group_id2
> -    val = builder->CreateMul(local_size2, group_id2);
> -    // local_id2 + local_size2 * group_id2
> -    val = builder->CreateAdd(local_id2, val);
> -    // global_size1 * global_size0
> -    op0 = builder->CreateMul(global_size1, global_size0);
> -    // (local_id2 + local_size2 * group_id2) * (global_size1 * global_size0)
> -    Value* offset1 = builder->CreateMul(val, op0);
> -    // local_size1 * group_id1
> -    val = builder->CreateMul(local_size1, group_id1);
> -    // local_id1 + local_size1 * group_id1
> -    val = builder->CreateAdd(local_id1, val);
> -    // (local_id1 + local_size1 * group_id1) * global_size_0
> -    Value* offset2 = builder->CreateMul(val, global_size0);
> -    // local_size0 * group_id0
> -    val = builder->CreateMul(local_size0, group_id0);
> -    // local_id0 + local_size0 * group_id0
> -    val = builder->CreateAdd(local_id0, val);
> -    // The total sum
> -    val = builder->CreateAdd(val, offset1);
> -    Value* offset = builder->CreateAdd(val, offset2);
>  
>      /////////////////////////////////////////////////////
>      /* calculate index address.
> -       index_addr = (index_offset + offset )* sizeof(int) + index_buf_ptr
> +       index_addr = (index_offset + wg_offset )* sizeof(int) + index_buf_ptr
>         index_offset = global_size2 * global_size1 * global_size0 * printf_num */
>  
> -    // global_size2 * global_size1
> -    op0 = builder->CreateMul(global_size2, global_size1);
> -    // global_size2 * global_size1 * global_size0
> -    Value* glXg2Xg3 = builder->CreateMul(op0, global_size0);
> -    Value* index_offset = builder->CreateMul(glXg2Xg3, ConstantInt::get(intTy, printf_num));
> +    Value* index_offset = builder->CreateMul(g1Xg2Xg3, ConstantInt::get(intTy, printf_num));
>      // index_offset + offset
> -    op0 = builder->CreateAdd(index_offset, offset);
> +    op0 = builder->CreateAdd(index_offset, wg_offset);
>      // (index_offset + offset)* sizeof(int)
>      op0 = builder->CreateMul(op0, ConstantInt::get(intTy, sizeof(int)));
>      // Final index address = index_buf_ptr + (index_offset + offset)* sizeof(int)
>      op0 = builder->CreateAdd(index_buf_ptr, op0);
>      Value* index_addr = builder->CreateIntToPtr(op0, Type::getInt32PtrTy(module->getContext(), 1));
> -    builder->CreateStore(ConstantInt::get(intTy, 1), index_addr);// The flag
> +    // Load the printf num first, printf may be in loop.
> +    Value* loop_num = builder->CreateLoad(index_addr);
> +    val = builder->CreateAdd(loop_num, ConstantInt::get(intTy, 1));
> +    builder->CreateStore(val, index_addr);// The loop number.
>  
>      int i = 1;
>      Value* data_addr = NULL;
> -    for (auto &s : *printf_fmt) {
> +    for (auto &s : *pInfo.printf_fmt) {
>        if (s.type == PRINTF_SLOT_TYPE_STRING)
>          continue;
>  
> -      assert(i < static_cast<int>(call->getNumOperands()) - 1);
> +      assert(i < static_cast<int>(pInfo.call->getNumOperands()) - 1);
>  
> -      Value *out_arg = call->getOperand(i);
> +      Value *out_arg = pInfo.call->getOperand(i);
>        Type *dst_type = NULL;
>        int sizeof_size = 0;
>        if (!generateOneParameterInst(s, out_arg, dst_type, sizeof_size)) {
> @@ -499,16 +429,23 @@ error:
>  
>        /////////////////////////////////////////////////////
>        /* Calculate the data address.
> -      data_addr = data_offset + pbuf_ptr + offset * sizeof(specify)
> +      data_addr = (data_offset + pbuf_ptr + offset * sizeof(specify)) +
> +               totalSizeofSize * global_size2 * global_size1 * global_size0 * loop_num
>        data_offset = global_size2 * global_size1 * global_size0 * out_buf_sizeof_offset
>  
>        //global_size2 * global_size1 * global_size0 * out_buf_sizeof_offset */
> -      op0 = builder->CreateMul(glXg2Xg3, ConstantInt::get(intTy, out_buf_sizeof_offset));
> +      op0 = builder->CreateMul(g1Xg2Xg3, ConstantInt::get(intTy, out_buf_sizeof_offset));
>        //offset * sizeof(specify)
> -      val = builder->CreateMul(offset, ConstantInt::get(intTy, sizeof_size));
> +      val = builder->CreateMul(wg_offset, ConstantInt::get(intTy, sizeof_size));
>        //data_offset + pbuf_ptr
>        op0 = builder->CreateAdd(pbuf_ptr, op0);
>        op0 = builder->CreateAdd(op0, val);
> +      //totalSizeofSize * global_size2 * global_size1 * global_size0
> +      val = builder->CreateMul(g1Xg2Xg3, ConstantInt::get(intTy, totalSizeofSize));
> +      //totalSizeofSize * global_size2 * global_size1 * global_size0 * loop_num
> +      val = builder->CreateMul(val, loop_num);
> +      //final
> +      op0 = builder->CreateAdd(op0, val);
>        data_addr = builder->CreateIntToPtr(op0, dst_type);
>        builder->CreateStore(out_arg, data_addr);
>  
> @@ -520,14 +457,101 @@ error:
>                                "__gen_ocl_printf", Type::getVoidTy(module->getContext()),
>                                NULL)));
>      assert(printfs[printf_inst] == NULL);
> -    printfs[printf_inst] = printf_fmt;
> +    printfs[printf_inst] = pInfo.printf_fmt;
>      printf_num++;
>      return true;
>    }
>  
> +  bool PrintfParser::parseOnePrintfInstruction(CallInst * call, PrintfParserInfo& info, int& sizeof_size)
> +  {
> +    CallSite CS(call);
> +    CallSite::arg_iterator CI_FMT = CS.arg_begin();
> +    int param_num = 0;
> +
> +    llvm::Constant* arg0 = dyn_cast<llvm::ConstantExpr>(*CI_FMT);
> +    llvm::Constant* arg0_ptr = dyn_cast<llvm::Constant>(arg0->getOperand(0));
> +    if (!arg0_ptr) {
> +      return false;
> +    }
> +
> +    ConstantDataSequential* fmt_arg = dyn_cast<ConstantDataSequential>(arg0_ptr->getOperand(0));
> +    if (!fmt_arg || !fmt_arg->isCString()) {
> +      return false;
> +    }
> +
> +    std::string fmt = fmt_arg->getAsCString();
> +
> +    PrintfSet::PrintfFmt* printf_fmt = NULL;
> +
> +    if (!(printf_fmt = parser_printf_fmt((char *)fmt.c_str(), param_num))) {//at lease print something
> +      return false;
> +    }
> +
> +    /* iff parameter more than %, error. */
> +    /* str_fmt arg0 arg1 ... NULL */
> +    if (param_num + 2 < static_cast<int>(call->getNumOperands())) {
> +      delete printf_fmt;
> +      return false;
> +    }
> +
> +    info.call = call;
> +    info.printf_fmt = printf_fmt;
> +
> +    sizeof_size = 0;
> +    int i = 1;
> +    for (auto &s : *printf_fmt) {
> +      int sz = 0;
> +      if (s.type == PRINTF_SLOT_TYPE_STRING)
> +        continue;
> +
> +      assert(i < static_cast<int>(call->getNumOperands()) - 1);
> +
> +      switch (s.state->conversion_specifier) {
> +        case PRINTF_CONVERSION_I:
> +        case PRINTF_CONVERSION_D:
> +        case PRINTF_CONVERSION_O:
> +        case PRINTF_CONVERSION_U:
> +        case PRINTF_CONVERSION_x:
> +        case PRINTF_CONVERSION_X:
> +        case PRINTF_CONVERSION_P:
> +          if (s.state->length_modifier == PRINTF_LM_L)
> +            sz = sizeof(int64_t);
> +          else
> +            sz = sizeof(int);
> +          break;
> +        case PRINTF_CONVERSION_C:
> +          sz = sizeof(char);
> +          break;
> +        case PRINTF_CONVERSION_F:
> +        case PRINTF_CONVERSION_f:
> +        case PRINTF_CONVERSION_E:
> +        case PRINTF_CONVERSION_e:
> +        case PRINTF_CONVERSION_G:
> +        case PRINTF_CONVERSION_g:
> +        case PRINTF_CONVERSION_A:
> +        case PRINTF_CONVERSION_a:
> +          sz = sizeof(float);
> +          break;
> +        default:
> +          sz = 0;
> +          break;
> +      }
> +
> +      if (s.state->vector_n) {
> +        sz = sz * s.state->vector_n;
> +      }
> +
> +      sizeof_size += ((sz + 3) / 4) * 4;
> +    }
> +
> +    return true;
> +  }
> +
> +
>    bool PrintfParser::runOnFunction(llvm::Function &F)
>    {
>      bool changed = false;
> +    bool hasPrintf = false;
>      switch (F.getCallingConv()) {
>  #if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR <= 2
>        case CallingConv::PTX_Device:
> @@ -541,6 +565,8 @@ error:
>          GBE_ASSERTM(false, "Unsupported calling convention");
>      }
>  
> +    std::vector<PrintfParserInfo> infoVect;
> +    totalSizeofSize = 0;
>      module = F.getParent();
>      intTy = IntegerType::get(module->getContext(), 32);
>  
> @@ -550,11 +576,14 @@ error:
>  
>      builder = new IRBuilder<>(module->getContext());
>  
> -    /* Iter the function and find printf. */
> +    /* First find printfs and caculate all slots size of one loop. */
>      for (llvm::Function::iterator B = F.begin(), BE = F.end(); B != BE; B++) {
>        for (BasicBlock::iterator instI = B->begin(),
>             instE = B->end(); instI != instE; ++instI) {
>  
> +        PrintfParserInfo pInfo;
> +        int sizeof_size = 0;
> +
>          llvm::CallInst* call = dyn_cast<llvm::CallInst>(instI);
>          if (!call) {
>            continue;
> @@ -569,26 +598,107 @@ error:
>          if (fnName != "__gen_ocl_printf_stub")
>            continue;
>  
> -        changed = true;
> -
> -        builder->SetInsertPoint(call);
> -
> -        if (!pbuf_ptr) {
> -          /* alloc a new buffer ptr to collect the print output. */
> -          Type *ptrTy = Type::getInt32PtrTy(module->getContext());
> -          llvm::Constant * pBuf = module->getOrInsertGlobal(StringRef("__gen_ocl_printf_buf"), ptrTy);
> -          pbuf_ptr = builder->CreatePtrToInt(pBuf, Type::getInt32Ty(module->getContext()));
> -        }
> -        if (!index_buf_ptr) {
> -          Type *ptrTy = Type::getInt32PtrTy(module->getContext());
> -          llvm::Constant * pBuf = module->getOrInsertGlobal(StringRef("__gen_ocl_printf_index_buf"), ptrTy);
> -          index_buf_ptr = builder->CreatePtrToInt(pBuf, Type::getInt32Ty(module->getContext()));
> +        if (!parseOnePrintfInstruction(call, pInfo, sizeof_size)) {
> +          printf("Parse One printf inst failed, may have some error\n");
> +          // Just kill this printf instruction.
> +          deadprintfs.push_back(PrintfInst(cast<Instruction>(call),0));
> +          continue;
>          }
>  
> -        deadprintfs.push_back(PrintfInst(cast<Instruction>(call),parseOnePrintfInstruction(call)));
> +        hasPrintf = true;
> +
> +        infoVect.push_back(pInfo);
> +        totalSizeofSize += sizeof_size;
>        }
>      }
>  
> +    if (!hasPrintf)
> +      return changed;
> +
> +    if (!pbuf_ptr) {
> +      /* alloc a new buffer ptr to collect the print output. */
> +      Type *ptrTy = Type::getInt32PtrTy(module->getContext());
> +      llvm::Constant * pBuf = module->getOrInsertGlobal(StringRef("__gen_ocl_printf_buf"), ptrTy);
> +      pbuf_ptr = builder->CreatePtrToInt(pBuf, Type::getInt32Ty(module->getContext()));
> +    }
> +    if (!index_buf_ptr) {
> +      Type *ptrTy = Type::getInt32PtrTy(module->getContext());
> +      llvm::Constant * pBuf = module->getOrInsertGlobal(StringRef("__gen_ocl_printf_index_buf"), ptrTy);
> +      index_buf_ptr = builder->CreatePtrToInt(pBuf, Type::getInt32Ty(module->getContext()));
> +    }
> +
> +    if (!wg_offset || !g1Xg2Xg3) {
> +      Value* op0 = NULL;
> +      Value* val = NULL;
> +
> +      builder->SetInsertPoint(F.begin()->begin());// Insert the common var in the begin.
> +
> +      /* FIXME: Because the OpenCL language do not support va macro, and we do not want
> +         to introduce the va_list, va_start and va_end into our code, we just simulate
> +         the function calls to caculate the offset caculation here. */
> +#define BUILD_CALL_INST(name) \
> +	CallInst* name = builder->CreateCall(cast<llvm::Function>(module->getOrInsertFunction( \
> +				 "__gen_ocl_get_"#name, 					\
> +				 IntegerType::getInt32Ty(module->getContext()), 		\
> +				 NULL)))
> +
> +      BUILD_CALL_INST(group_id2);
> +      BUILD_CALL_INST(group_id1);
> +      BUILD_CALL_INST(group_id0);
> +      BUILD_CALL_INST(global_size2);
> +      BUILD_CALL_INST(global_size1);
> +      BUILD_CALL_INST(global_size0);
> +      BUILD_CALL_INST(local_id2);
> +      BUILD_CALL_INST(local_id1);
> +      BUILD_CALL_INST(local_id0);
> +      BUILD_CALL_INST(local_size2);
> +      BUILD_CALL_INST(local_size1);
> +      BUILD_CALL_INST(local_size0);
> +
> +#undef BUILD_CALL_INST
> +
> +      /* calculate offset for later usage.
> +         wg_offset = ((local_id2 + local_size2 * group_id2) * (global_size1 * global_size0)
> +         + (local_id1 + local_size1 * group_id1) * global_size0
> +         + (local_id0 + local_size0 * group_id0))  */
> +
> +      // local_size2 * group_id2
> +      val = builder->CreateMul(local_size2, group_id2);
> +      // local_id2 + local_size2 * group_id2
> +      val = builder->CreateAdd(local_id2, val);
> +      // global_size1 * global_size0
> +      op0 = builder->CreateMul(global_size1, global_size0);
> +      // (local_id2 + local_size2 * group_id2) * (global_size1 * global_size0)
> +      Value* offset1 = builder->CreateMul(val, op0);
> +      // local_size1 * group_id1
> +      val = builder->CreateMul(local_size1, group_id1);
> +      // local_id1 + local_size1 * group_id1
> +      val = builder->CreateAdd(local_id1, val);
> +      // (local_id1 + local_size1 * group_id1) * global_size_0
> +      Value* offset2 = builder->CreateMul(val, global_size0);
> +      // local_size0 * group_id0
> +      val = builder->CreateMul(local_size0, group_id0);
> +      // local_id0 + local_size0 * group_id0
> +      val = builder->CreateAdd(local_id0, val);
> +      // The total sum
> +      val = builder->CreateAdd(val, offset1);
> +      wg_offset = builder->CreateAdd(val, offset2);
> +
> +      // global_size2 * global_size1
> +      op0 = builder->CreateMul(global_size2, global_size1);
> +      // global_size2 * global_size1 * global_size0
> +      g1Xg2Xg3 = builder->CreateMul(op0, global_size0);
> +    }
> +
> +
> +    /* Now generate the instructions. */
> +    for (auto pInfo : infoVect) {
> +      builder->SetInsertPoint(pInfo.call);
> +      deadprintfs.push_back(PrintfInst(cast<Instruction>(pInfo.call), generateOnePrintfInstruction(pInfo)));
> +    }
> +
> +    assert(out_buf_sizeof_offset == totalSizeofSize);
> +
>      /* Replace the instruction's operand if using printf's return value. */
>      for (llvm::Function::iterator B = F.begin(), BE = F.end(); B != BE; B++) {
>        for (BasicBlock::iterator instI = B->begin(),
> @@ -775,6 +885,7 @@ error:
>          bool sign = false;
>  
>          if (vec_num != slot.state->vector_n) {
> +          printf("Error The printf vector number is not match!\n");
>            return false;
>          }
>  
> @@ -785,26 +896,37 @@ error:
>            case PRINTF_CONVERSION_O:
>            case PRINTF_CONVERSION_U:
>            case PRINTF_CONVERSION_x:
> -          case PRINTF_CONVERSION_X:
> -            if (elt_type->getTypeID() != Type::IntegerTyID)
> +          case PRINTF_CONVERSION_X: {
> +            if (elt_type->getTypeID() != Type::IntegerTyID) {
> +              printf("Do not support type conversion between float and int in vector printf!\n");
>                return false;
> +            }
> +
> +            Type* elt_dst_type = NULL;
> +            if (slot.state->length_modifier == PRINTF_LM_L) {
> +              elt_dst_type = Type::getInt64Ty(elt_type->getContext());
> +            } else {
> +              elt_dst_type = Type::getInt32Ty(elt_type->getContext());
> +            }
>  
>              /* If the bits change, we need to consider the signed. */
> -            if (elt_type != Type::getInt32Ty(elt_type->getContext())) {
> +            if (elt_type != elt_dst_type) {
>                Value *II = NULL;
>                for (int i = 0; i < vec_num; i++) {
> -                Value *vec = II ? II : UndefValue::get(VectorType::get(Type::getInt32Ty(elt_type->getContext()), vec_num));
> +                Value *vec = II ? II : UndefValue::get(VectorType::get(elt_dst_type, vec_num));
>                  Value *cv = ConstantInt::get(Type::getInt32Ty(elt_type->getContext()), i);
>                  Value *org = builder->CreateExtractElement(arg, cv);
> -                Value *cvt = builder->CreateIntCast(org, Type::getInt32Ty(module->getContext()), sign);
> +                Value *cvt = builder->CreateIntCast(org, elt_dst_type, sign);
>                  II = builder->CreateInsertElement(vec, cvt, cv);
>                }
>                arg = II;
>              }
>  
>              dst_type = arg->getType()->getPointerTo(1);
> -            sizeof_size = sizeof(int) * vec_num;
> +            sizeof_size = (elt_dst_type == Type::getInt32Ty(elt_type->getContext()) ?
> +                           sizeof(int) * vec_num  : sizeof(int64_t) * vec_num);
>              return true;
> +          }
>  
>            case PRINTF_CONVERSION_F:
>            case PRINTF_CONVERSION_f:
> @@ -814,8 +936,10 @@ error:
>            case PRINTF_CONVERSION_g:
>            case PRINTF_CONVERSION_A:
>            case PRINTF_CONVERSION_a:
> -            if (elt_type->getTypeID() != Type::DoubleTyID && elt_type->getTypeID() != Type::FloatTyID)
> +            if (elt_type->getTypeID() != Type::DoubleTyID && elt_type->getTypeID() != Type::FloatTyID) {
> +              printf("Do not support type conversion between float and int in vector printf!\n");
>                return false;
> +            }
>  
>              if (elt_type->getTypeID() != Type::FloatTyID) {
>                Value *II = NULL;
> @@ -828,10 +952,14 @@ error:
>                }
>                arg = II;
>              }
> +
> +            dst_type = arg->getType()->getPointerTo(1);
> +            sizeof_size = sizeof(int) * vec_num;
> +            return true;
> +
> +          default:
> +            return false;
>          }
> -        dst_type = arg->getType()->getPointerTo(1);
> -        sizeof_size = sizeof(int) * vec_num;
> -        return true;
>        }
>  
>        default:
> diff --git a/kernels/test_printf.cl b/kernels/test_printf.cl
> index c2844f4..0a59e88 100644
> --- a/kernels/test_printf.cl
> +++ b/kernels/test_printf.cl
> @@ -4,6 +4,8 @@ test_printf(void)
>    int x = (int)get_global_id(0);
>    int y = (int)get_global_id(1);
>    int z = (int)get_global_id(2);
> +  int g0 = (int)get_global_size(0);
> +  int g1 = (int)get_global_size(1);
>    uint a = 'x';
>    float f = 5.0f;
>    int3 vec;
> @@ -14,28 +16,31 @@ test_printf(void)
>  
>    if (x == 0 && y == 0 && z == 0) {
>      printf("--- Welcome to the printf test of %s ---\n", "Intel Beignet");
> -
>      printf("### output a char is %c\n", a);
> -
>      printf("@@@ A long value is %ld\n", cc);
>    }
>  
> -  if (x % 15 == 0)
> -    if (y % 3 == 0)
> -      if (z % 7 == 0)
> -        printf("######## global_id(x, y, z) = %v3d, global_size(d0, d1, d3) = (%d, %d, %d)\n",
> -                vec, get_global_size(0), get_global_size(1), get_global_size(2));
> +  for(int i = 0; i < g0/2; i++)
> +    for(int j = 0; j < g1/2; j++)
> +      if(x == 0 && y == 0 && z == 0)
> +        printf("loops: i = %d, j = %d\n", i, j);
>  
> -  if (x == 1)
> +  if (x == 0) {
>      if (y == 0) {
>        if (z % 2 == 0)
> -          printf("#### output a float is %f\n", f);
> +          printf("!!! output a float is %f\n", f);
>        else
> -          printf("#### output a float to int is %d\n", f);
> +          printf("!!! output a float to int is %d\n", f);
>      }
> +  }
> +
> +  if (x % 15 == 0)
> +    if (y % 3 == 0)
> +      if (z % 7 == 0)
> +        printf("######## global_id(x, y, z) = %v3d, global_size(d0, d1, d3) = (%d, %d, %d)\n",
> +                vec, get_global_size(0), get_global_size(1), get_global_size(2));
>  
>    if (x == 0 && y == 0 && z == 0) {
>      printf("--- End to the printf test ---\n");
>    }
> -
>  }
> diff --git a/src/cl_command_queue_gen7.c b/src/cl_command_queue_gen7.c
> index 330f0f9..b020540 100644
> --- a/src/cl_command_queue_gen7.c
> +++ b/src/cl_command_queue_gen7.c
> @@ -271,6 +271,14 @@ cl_bind_printf(cl_gpgpu gpgpu, cl_kernel ker, void* printf_info, int printf_num,
>    value = GBE_CURBE_PRINTF_BUF_POINTER;
>    offset = interp_kernel_get_curbe_offset(ker->opaque, value, 0);
>    buf_size = interp_get_printf_sizeof_size(printf_info) * global_sz;
> +  /* because of the printf may exist in a loop, which loop number can not be gotten by
> +     static analysis. So we set the data buffer as big as we can. Out of bound printf
> +     info will be discarded. */
> +  if (buf_size < 1*1024)
> +    buf_size = 1*1024*1024;
> +  else
> +    buf_size = 4*1024*1024; //at most.
> +
>    if (offset > 0) {
>      if (cl_gpgpu_set_printf_buffer(gpgpu, 1, buf_size, offset, interp_get_printf_buf_bti(printf_info)) != 0)
>        return -1;
> -- 
> 1.7.9.5
> 
> _______________________________________________
> Beignet mailing list
> Beignet at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/beignet