From steve at snewbury.org.uk  Sun Mar  1 07:09:23 2015
From: steve at snewbury.org.uk (Steven Newbury)
Date: Sun, 01 Mar 2015 15:09:23 +0000
Subject: [Beignet] [PATCH 2/3] Add llvm3.6 build support.
In-Reply-To: <1423729781-17667-2-git-send-email-rong.r.yang@intel.com>
References: <1423729781-17667-1-git-send-email-rong.r.yang@intel.com>
 <1423729781-17667-2-git-send-email-rong.r.yang@intel.com>
Message-ID: <1425222563.26181.4.camel@snewbury.org.uk>

I now have another build failure with llvm3.6 (release); I'm not sure whether
it's due to an LLVM change or a flag change.  It would build with
"-fpermissive".  See in-line.

On Thu, 2015-02-12 at 16:29 +0800, Yang Rong wrote:
> There are some changes from llvm3.5:
> 1. Some functions return std::unique_ptr instead of pointer.
> 2. MetaNode to Value and Value to MetaNode.
> 
> V2: Fix llvm3.5 build error.
> V3: Print link and function materialize message.
> Signed-off-by: Yang Rong <rong.r.yang at intel.com>
> ---
>  backend/src/backend/gen_program.cpp    | 10 ++++++++++
>  backend/src/backend/program.cpp        |  4 ++++
>  backend/src/llvm/llvm_bitcode_link.cpp | 25 
> ++++++++++++++++++++++++-
>  backend/src/llvm/llvm_gen_backend.cpp  | 22 +++++++++++++++++++++-
>  backend/src/llvm/llvm_passes.cpp       |  4 ++++
>  backend/src/llvm/llvm_to_gen.cpp       | 16 +++++++++++++---
>  backend/src/llvm/llvm_unroll.cpp       | 14 ++++++++++++++
>  7 files changed, 90 insertions(+), 5 deletions(-)
> 
> diff --git a/backend/src/backend/gen_program.cpp 
> b/backend/src/backend/gen_program.cpp
> index a4019fe..65a7ba2 100644
> --- a/backend/src/backend/gen_program.cpp
> +++ b/backend/src/backend/gen_program.cpp
> @@ -252,9 +252,15 @@ namespace gbe {
>      llvm::StringRef llvm_bin_str(binary_content);
>      llvm::LLVMContext& c = llvm::getGlobalContext();
>      llvm::SMDiagnostic Err;
> +#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 6
> +    std::unique_ptr<llvm::MemoryBuffer> memory_buffer = 
> llvm::MemoryBuffer::getMemBuffer(llvm_bin_str, "llvm_bin_str");
> +    acquireLLVMContextLock();
> +    llvm::Module* module = llvm::parseIR(memory_buffer-
> >getMemBufferRef(), Err, c).release();
> +#else
>      llvm::MemoryBuffer* memory_buffer = 
> llvm::MemoryBuffer::getMemBuffer(llvm_bin_str, "llvm_bin_str");
>      acquireLLVMContextLock();
>      llvm::Module* module = llvm::ParseIR(memory_buffer, Err, c);
> +#endif
>      releaseLLVMContextLock();
>      if(module == NULL){
>        GBE_ASSERT(0);
> @@ -382,7 +388,11 @@ namespace gbe {
>        llvm::Module* src = (llvm::Module*)((GenProgram*)src_program)-
> >module;
>        llvm::Module* dst = (llvm::Module*)((GenProgram*)dst_program)-
> >module;
>  
> +#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR <= 5
>        if (LLVMLinkModules(wrap(dst), wrap(src), 
> LLVMLinkerPreserveSource, &errMsg)) {
> +#else
> +      if (LLVMLinkModules(wrap(dst), wrap(src), 0, &errMsg)) {

I get a failure to convert from int to LLVMLinkerMode from the 3rd 
argument here.  The constant '0' needs to be cast to LLVMLinkerMode to 
keep LLVMLinkModules() happy since it doesn't accept an int argument.

> +#endif
>          if (err != NULL && errSize != NULL && stringSize > 0u) {
>            if(strlen(errMsg) < stringSize )
>              stringSize = strlen(errMsg);
> diff --git a/backend/src/backend/program.cpp 
> b/backend/src/backend/program.cpp
> index 38ce9c8..06810bd 100644
> --- a/backend/src/backend/program.cpp
> +++ b/backend/src/backend/program.cpp
> @@ -621,7 +621,11 @@ namespace gbe {
>      if (!retVal)
>        return false;
>  
> +#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR <= 5
>      llvm::Module *module = Act->takeModule();
> +#else
> +    llvm::Module *module = Act->takeModule().release();
> +#endif
>  
>      *out_module = module;
>      return true;
> diff --git a/backend/src/llvm/llvm_bitcode_link.cpp 
> b/backend/src/llvm/llvm_bitcode_link.cpp
> index 8eb6dd5..229e3bb 100644
> --- a/backend/src/llvm/llvm_bitcode_link.cpp
> +++ b/backend/src/llvm/llvm_bitcode_link.cpp
> @@ -63,7 +63,11 @@ namespace gbe
>      }
>      assert(findBC);
>  
> +#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR <= 5
>      oclLib = getLazyIRFileModule(FilePath, Err, ctx);
> +#else
> +    oclLib = getLazyIRFileModule(FilePath, Err, ctx).release();
> +#endif
>      if (!oclLib) {
>        printf("Fatal Error: ocl lib can not be opened\n");
>        return NULL;
> @@ -114,12 +118,18 @@ namespace gbe
>  
>          std::string ErrInfo;// = "Not Materializable";
>          if (!fromSrc && newMF->isMaterializable()) {
> +#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR <= 5
>            if (newMF->Materialize(&ErrInfo)) {
>              printf("Can not materialize the function: %s, because 
> %s\n", fnName.c_str(), ErrInfo.c_str());
>              return false;
>            }
> +#else
> +          if (std::error_code EC = newMF->materialize()) {
> +            printf("Can not materialize the function: %s, because 
> %s\n", fnName.c_str(), EC.message().c_str());
> +            return false;
> +          }
> +#endif
>          }
> -
>          if (!materializedFuncCall(src, lib, *newMF, MFS))
>            return false;
>  
> @@ -205,12 +215,21 @@ namespace gbe
>        }
>        std::string ErrInfo;// = "Not Materializable";
>        if (newMF->isMaterializable()) {
> +#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR <= 5
>          if (newMF->Materialize(&ErrInfo)) {
>            printf("Can not materialize the function: %s, because 
> %s\n", fnName.c_str(), ErrInfo.c_str());
>            delete clonedLib;
>            return NULL;
>          }
>        }
> +#else
> +        if (std::error_code EC = newMF->materialize()) {
> +          printf("Can not materialize the function: %s, because 
> %s\n", fnName.c_str(), EC.message().c_str();
> +          delete clonedLib;
> +          return NULL;
> +        }
> +      }
> +#endif
>  
>        if (!materializedFuncCall(*mod, *clonedLib, *newMF, 
> materializedFuncs)) {
>          delete clonedLib;
> @@ -223,7 +242,11 @@ namespace gbe
>      /* We use beignet's bitcode as dst because it will have a lot of
>         lazy functions which will not be loaded. */
>      char* errorMsg;
> +#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR <= 5
>      if(LLVMLinkModules(wrap(clonedLib), wrap(mod), 
> LLVMLinkerDestroySource, &errorMsg)) {
> +#else
> +    if(LLVMLinkModules(wrap(clonedLib), wrap(mod), 0, &errorMsg)) {

Same here.

> +#endif
>        delete clonedLib;
>        printf("Fatal Error: link the bitcode error:\n%s\n", 
> errorMsg);
>        return NULL;
> diff --git a/backend/src/llvm/llvm_gen_backend.cpp 
> b/backend/src/llvm/llvm_gen_backend.cpp
> index d47721a..c67a880 100644
> --- a/backend/src/llvm/llvm_gen_backend.cpp
> +++ b/backend/src/llvm/llvm_gen_backend.cpp
> @@ -1467,7 +1467,12 @@ error:
>      /* First find the meta data belong to this function. */
>      for(uint i = 0; i < clKernelMetaDatas->getNumOperands(); i++) {
>        node = clKernelMetaDatas->getOperand(i);
> +#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR <= 5
>        if (node->getOperand(0) == &F) break;
> +#else
> +      auto *V = cast<ValueAsMetadata>(node->getOperand(0));
> +      if (V && V->getValue() == &F) break;
> +#endif
>        node = NULL;
>      }
>  
> @@ -1484,9 +1489,15 @@ error:
>  
>        if (attrName->getString() == "reqd_work_group_size") {
>          GBE_ASSERT(attrNode->getNumOperands() == 4);
> +#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR <= 5
>          ConstantInt *x = dyn_cast<ConstantInt>(attrNode-
> >getOperand(1));
>          ConstantInt *y = dyn_cast<ConstantInt>(attrNode-
> >getOperand(2));
>          ConstantInt *z = dyn_cast<ConstantInt>(attrNode-
> >getOperand(3));
> +#else
> +        ConstantInt *x = mdconst::extract<ConstantInt>(attrNode-
> >getOperand(1));
> +        ConstantInt *y = mdconst::extract<ConstantInt>(attrNode-
> >getOperand(2));
> +        ConstantInt *z = mdconst::extract<ConstantInt>(attrNode-
> >getOperand(3));
> +#endif
>          GBE_ASSERT(x && y && z);
>          reqd_wg_sz[0] = x->getZExtValue();
>          reqd_wg_sz[1] = y->getZExtValue();
> @@ -1521,9 +1532,15 @@ error:
>          functionAttributes += " ";
>        } else if (attrName->getString() == "work_group_size_hint") {
>          GBE_ASSERT(attrNode->getNumOperands() == 4);
> +#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR <= 5
>          ConstantInt *x = dyn_cast<ConstantInt>(attrNode-
> >getOperand(1));
>          ConstantInt *y = dyn_cast<ConstantInt>(attrNode-
> >getOperand(2));
>          ConstantInt *z = dyn_cast<ConstantInt>(attrNode-
> >getOperand(3));
> +#else
> +        ConstantInt *x = mdconst::extract<ConstantInt>(attrNode-
> >getOperand(1));
> +        ConstantInt *y = mdconst::extract<ConstantInt>(attrNode-
> >getOperand(2));
> +        ConstantInt *z = mdconst::extract<ConstantInt>(attrNode-
> >getOperand(3));
> +#endif
>          GBE_ASSERT(x && y && z);
>          hint_wg_sz[0] = x->getZExtValue();
>          hint_wg_sz[1] = y->getZExtValue();
> @@ -1561,8 +1578,11 @@ error:
>        for (; I != E; ++I, ++argID) {
>          const std::string &argName = I->getName().str();
>          Type *type = I->getType();
> -
> +#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR <= 5
>          llvmInfo.addrSpace = (cast<ConstantInt>(addrSpaceNode-
> >getOperand(1 + argID)))->getZExtValue();
> +#else
> +        llvmInfo.addrSpace = 
> (mdconst::extract<ConstantInt>(addrSpaceNode->getOperand(1 + 
> argID)))->getZExtValue();
> +#endif
>          llvmInfo.typeName = (cast<MDString>(typeNameNode-
> >getOperand(1 + argID)))->getString();
>          llvmInfo.accessQual = (cast<MDString>(accessQualNode-
> >getOperand(1 + argID)))->getString();
>          llvmInfo.typeQual = (cast<MDString>(typeQualNode-
> >getOperand(1 + argID)))->getString();
> diff --git a/backend/src/llvm/llvm_passes.cpp 
> b/backend/src/llvm/llvm_passes.cpp
> index 5c0a2e0..1b0e4f8 100644
> --- a/backend/src/llvm/llvm_passes.cpp
> +++ b/backend/src/llvm/llvm_passes.cpp
> @@ -119,7 +119,11 @@ namespace gbe
>        uint32_t ops = md.getNumOperands();
>        for(uint32_t x = 0; x < ops; x++) {
>          MDNode* node = md.getOperand(x);
> +#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR <= 5
>          Value * op = node->getOperand(0);
> +#else
> +        Value * op = cast<ValueAsMetadata>(node->getOperand(0))-
> >getValue();
> +#endif
>          if(op == &F) bKernel = true;
>        }
>      }
> diff --git a/backend/src/llvm/llvm_to_gen.cpp 
> b/backend/src/llvm/llvm_to_gen.cpp
> index c2c015a..b1dc686 100644
> --- a/backend/src/llvm/llvm_to_gen.cpp
> +++ b/backend/src/llvm/llvm_to_gen.cpp
> @@ -81,7 +81,9 @@ namespace gbe
>    {
>      FunctionPassManager FPM(&mod);
>  
> -#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 5
> +#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 6
> +    FPM.add(new DataLayoutPass());
> +#elif LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR == 5
>      FPM.add(new DataLayoutPass(DL));
>  #else
>      FPM.add(new DataLayout(DL));
> @@ -112,7 +114,9 @@ namespace gbe
>    {
>      llvm::PassManager MPM;
>  
> -#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 5
> +#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 6
> +    MPM.add(new DataLayoutPass());
> +#elif LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR == 5
>      MPM.add(new DataLayoutPass(DL));
>  #else
>      MPM.add(new DataLayout(DL));
> @@ -231,7 +235,11 @@ namespace gbe
>        cl_mod = reinterpret_cast<Module*>(const_cast<void*>(module));
>      } else if (fileName){
>        llvm::LLVMContext& c = llvm::getGlobalContext();
> +#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 6
> +      cl_mod = parseIRFile(fileName, Err, c).release();
> +#else
>        cl_mod = ParseIRFile(fileName, Err, c);
> +#endif
>      }
>  
>      if (!cl_mod) return false;
> @@ -259,7 +267,9 @@ namespace gbe
>      runFuntionPass(mod, libraryInfo, DL);
>      runModulePass(mod, libraryInfo, DL, optLevel, strictMath);
>      llvm::PassManager passes;
> -#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 5
> +#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 6
> +    passes.add(new DataLayoutPass());
> +#elif LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR == 5
>      passes.add(new DataLayoutPass(DL));
>  #else
>      passes.add(new DataLayout(DL));
> diff --git a/backend/src/llvm/llvm_unroll.cpp 
> b/backend/src/llvm/llvm_unroll.cpp
> index 172e724..5d3fad8 100644
> --- a/backend/src/llvm/llvm_unroll.cpp
> +++ b/backend/src/llvm/llvm_unroll.cpp
> @@ -95,7 +95,11 @@ namespace gbe {
>            if (Name.equals(S->getString())) {
>              assert(MD->getNumOperands() == 2 &&
>                     "Unroll hint metadata should have two 
> operands.");
> +#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 6
> +            return mdconst::extract<ConstantInt>(MD->getOperand(1));
> +#else
>              return cast<ConstantInt>(MD->getOperand(1));
> +#endif
>            }
>          }
>          return nullptr;
> @@ -105,6 +109,15 @@ namespace gbe {
>          if (!enable && disabledLoops.find(L) != disabledLoops.end())
>             return;
>          LLVMContext &Context = L->getHeader()->getContext();
> +#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 6
> +        SmallVector<Metadata *, 2> forceUnroll;
> +        forceUnroll.push_back(MDString::get(Context, 
> "llvm.loop.unroll.enable"));
> +        
> forceUnroll.push_back(ConstantAsMetadata::get(ConstantInt::get(Type::getInt1Ty(Context), 
> enable)));
> +        MDNode *forceUnrollNode = MDNode::get(Context, forceUnroll);
> +        SmallVector<Metadata *, 4> Vals;
> +        Vals.push_back(NULL);
> +        Vals.push_back(forceUnrollNode);
> +#else
>          SmallVector<Value *, 2> forceUnroll;
>          forceUnroll.push_back(MDString::get(Context, 
> "llvm.loop.unroll.enable"));
>          
> forceUnroll.push_back(ConstantInt::get(Type::getInt1Ty(Context), 
> enable));
> @@ -112,6 +125,7 @@ namespace gbe {
>          SmallVector<Value *, 4> Vals;
>          Vals.push_back(NULL);
>          Vals.push_back(forceUnrollNode);
> +#endif
>          MDNode *NewLoopID = MDNode::get(Context, Vals);
>          // Set operand 0 to refer to the loop id itself.
>          NewLoopID->replaceOperandWith(0, NewLoopID);
-------------- next part --------------
A non-text attachment was scrubbed...
Name: signature.asc
Type: application/pgp-signature
Size: 181 bytes
Desc: This is a digitally signed message part
URL: <http://lists.freedesktop.org/archives/beignet/attachments/20150301/59ddd885/attachment.sig>

From zhigang.gong at intel.com  Sun Mar  1 17:19:26 2015
From: zhigang.gong at intel.com (Zhigang Gong)
Date: Mon,  2 Mar 2015 09:19:26 +0800
Subject: [Beignet] [PATCH] GBE: remove the unecessary type check for SEL
	instructio.
Message-ID: <1425259166-12738-1-git-send-email-zhigang.gong@intel.com>

The backend SEL instruction can support the bool type
since we changed the bool representation to the normal
S16 data type. Now let us remove this assertion
check.

Signed-off-by: Zhigang Gong <zhigang.gong at intel.com>
---
 backend/src/ir/instruction.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/backend/src/ir/instruction.cpp b/backend/src/ir/instruction.cpp
index 928e365..1e1b040 100644
--- a/backend/src/ir/instruction.cpp
+++ b/backend/src/ir/instruction.cpp
@@ -874,7 +874,6 @@ namespace ir {
         if (UNLIKELY(checkRegisterData(family, regID, fn, whyNot) == false))
           return false;
       }
-      CHECK_TYPE(this->type, allButBool);
       return true;
     }
 
-- 
1.9.1


From zhigang.gong at intel.com  Sun Mar  1 20:29:15 2015
From: zhigang.gong at intel.com (Zhigang Gong)
Date: Mon,  2 Mar 2015 12:29:15 +0800
Subject: [Beignet] [PATCH] GBE: support compare two bool variables.
Message-ID: <1425270555-26043-1-git-send-email-zhigang.gong@intel.com>

LLVM 3.6 may generate the following instructions:

%Pivot = icmp slt i1 %trunc49, false

when doing the switch lowering pass.
To support it we must use GEN_TYPE_W to represent B rather
than GEN_TYPE_UW and we also need to remove the corresponding
assertions.

Signed-off-by: Zhigang Gong <zhigang.gong at intel.com>
---
 backend/src/backend/gen_insn_selection.cpp | 4 ++--
 backend/src/ir/instruction.cpp             | 1 -
 backend/src/llvm/llvm_gen_backend.cpp      | 2 --
 3 files changed, 2 insertions(+), 5 deletions(-)

diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp
index 106ea82..d100f80 100644
--- a/backend/src/backend/gen_insn_selection.cpp
+++ b/backend/src/backend/gen_insn_selection.cpp
@@ -115,7 +115,7 @@ namespace gbe
   uint32_t getGenType(ir::Type type) {
     using namespace ir;
     switch (type) {
-      case TYPE_BOOL: return GEN_TYPE_UW;
+      case TYPE_BOOL: return GEN_TYPE_W;
       case TYPE_S8: return GEN_TYPE_B;
       case TYPE_U8: return GEN_TYPE_UB;
       case TYPE_S16: return GEN_TYPE_W;
@@ -1956,7 +1956,7 @@ namespace gbe
       case TYPE_U8:  return GenRegister::immuw(imm.getIntegerValue() * sign);
       case TYPE_S8:  return GenRegister::immw((int8_t)imm.getIntegerValue() * sign);
       case TYPE_DOUBLE: return GenRegister::immdf(imm.getDoubleValue() * sign);
-      case TYPE_BOOL: return GenRegister::immuw(-imm.getIntegerValue());  //return 0xffff when true
+      case TYPE_BOOL: return GenRegister::immw((imm.getIntegerValue() == 0) ? 0 : -1);  //return 0xffff when true
       default: NOT_SUPPORTED; return GenRegister::immuw(0);
     }
   }
diff --git a/backend/src/ir/instruction.cpp b/backend/src/ir/instruction.cpp
index 1e1b040..a2bc875 100644
--- a/backend/src/ir/instruction.cpp
+++ b/backend/src/ir/instruction.cpp
@@ -889,7 +889,6 @@ namespace ir {
       for (uint32_t srcID = 0; srcID < 2; ++srcID)
         if (UNLIKELY(checkRegisterData(family, src[srcID], fn, whyNot) == false))
           return false;
-      CHECK_TYPE(this->type, allButBool);
       return true;
     }
 
diff --git a/backend/src/llvm/llvm_gen_backend.cpp b/backend/src/llvm/llvm_gen_backend.cpp
index 43b50e7..238f572 100644
--- a/backend/src/llvm/llvm_gen_backend.cpp
+++ b/backend/src/llvm/llvm_gen_backend.cpp
@@ -2151,8 +2151,6 @@ namespace gbe
   }
 
   void GenWriter::emitICmpInst(ICmpInst &I) {
-    GBE_ASSERT(I.getOperand(0)->getType() != Type::getInt1Ty(I.getContext()));
-
     // Get the element type and the number of elements
     Type *operandType = I.getOperand(0)->getType();
     const ir::Type type = getType(ctx, operandType);
-- 
1.9.1


From ruiling.song at intel.com  Sun Mar  1 23:47:54 2015
From: ruiling.song at intel.com (Song, Ruiling)
Date: Mon, 2 Mar 2015 07:47:54 +0000
Subject: [Beignet] [PATCH] GBE: remove the unecessary type check for
	SEL	instructio.
In-Reply-To: <1425259166-12738-1-git-send-email-zhigang.gong@intel.com>
References: <1425259166-12738-1-git-send-email-zhigang.gong@intel.com>
Message-ID: <148B1B7A67D1C24B9EF0BE42EA4977062B7DC0B7@SHSMSX101.ccr.corp.intel.com>

LGTM

> -----Original Message-----
> From: Beignet [mailto:beignet-bounces at lists.freedesktop.org] On Behalf Of
> Zhigang Gong
> Sent: Monday, March 02, 2015 9:19 AM
> To: beignet at lists.freedesktop.org
> Cc: Gong, Zhigang
> Subject: [Beignet] [PATCH] GBE: remove the unecessary type check for SEL
> instructio.
> 
> The backend SEL instruction could support bool type since we change the
> bool representation to normal
> S16 data type. Now let us remove this assertion check.
> 
> Signed-off-by: Zhigang Gong <zhigang.gong at intel.com>
> ---
>  backend/src/ir/instruction.cpp | 1 -
>  1 file changed, 1 deletion(-)
> 
> diff --git a/backend/src/ir/instruction.cpp b/backend/src/ir/instruction.cpp
> index 928e365..1e1b040 100644
> --- a/backend/src/ir/instruction.cpp
> +++ b/backend/src/ir/instruction.cpp
> @@ -874,7 +874,6 @@ namespace ir {
>          if (UNLIKELY(checkRegisterData(family, regID, fn, whyNot) ==
> false))
>            return false;
>        }
> -      CHECK_TYPE(this->type, allButBool);
>        return true;
>      }
> 
> --
> 1.9.1
> 
> _______________________________________________
> Beignet mailing list
> Beignet at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/beignet

From rong.r.yang at intel.com  Mon Mar  2 00:10:32 2015
From: rong.r.yang at intel.com (Yang, Rong R)
Date: Mon, 2 Mar 2015 08:10:32 +0000
Subject: [Beignet] [PATCH] GBE: support compare two bool variables.
In-Reply-To: <1425270555-26043-1-git-send-email-zhigang.gong@intel.com>
References: <1425270555-26043-1-git-send-email-zhigang.gong@intel.com>
Message-ID: <7597C9376C272A4AB2D29E91550B7B0901404045@shsmsx102.ccr.corp.intel.com>

For the instruction:
%Pivot = icmp slt i1 %trunc49, false

And slt: interprets the operands as signed values and yields true if op1 is less than op2.

But %trunc49 has only 1 bit. How should %trunc49 be interpreted as a signed value? As only a sign bit?

When %trunc49's value is true, what should this instruction's result be?

> -----Original Message-----
> From: Beignet [mailto:beignet-bounces at lists.freedesktop.org] On Behalf Of
> Zhigang Gong
> Sent: Monday, March 2, 2015 12:29
> To: beignet at lists.freedesktop.org
> Cc: Gong, Zhigang
> Subject: [Beignet] [PATCH] GBE: support compare two bool variables.
> 
> LLVM 3.6 may generate the following instructions:
> 
> %Pivot = icmp slt i1 %trunc49, false
> 
> when do siwth lowering pass.
> To support it we must use GEN_TYPE_W to represent B rather than
> GEN_TYPE_UW and we also need to remove the corresponding assertions.
> 
> Signed-off-by: Zhigang Gong <zhigang.gong at intel.com>
> ---
>  backend/src/backend/gen_insn_selection.cpp | 4 ++--
>  backend/src/ir/instruction.cpp             | 1 -
>  backend/src/llvm/llvm_gen_backend.cpp      | 2 --
>  3 files changed, 2 insertions(+), 5 deletions(-)
> 
> diff --git a/backend/src/backend/gen_insn_selection.cpp
> b/backend/src/backend/gen_insn_selection.cpp
> index 106ea82..d100f80 100644
> --- a/backend/src/backend/gen_insn_selection.cpp
> +++ b/backend/src/backend/gen_insn_selection.cpp
> @@ -115,7 +115,7 @@ namespace gbe
>    uint32_t getGenType(ir::Type type) {
>      using namespace ir;
>      switch (type) {
> -      case TYPE_BOOL: return GEN_TYPE_UW;
> +      case TYPE_BOOL: return GEN_TYPE_W;
>        case TYPE_S8: return GEN_TYPE_B;
>        case TYPE_U8: return GEN_TYPE_UB;
>        case TYPE_S16: return GEN_TYPE_W; @@ -1956,7 +1956,7 @@
> namespace gbe
>        case TYPE_U8:  return GenRegister::immuw(imm.getIntegerValue() *
> sign);
>        case TYPE_S8:  return GenRegister::immw((int8_t)imm.getIntegerValue()
> * sign);
>        case TYPE_DOUBLE: return GenRegister::immdf(imm.getDoubleValue() *
> sign);
> -      case TYPE_BOOL: return GenRegister::immuw(-imm.getIntegerValue());
> //return 0xffff when true
> +      case TYPE_BOOL: return GenRegister::immw((imm.getIntegerValue()
> + == 0) ? 0 : -1);  //return 0xffff when true
>        default: NOT_SUPPORTED; return GenRegister::immuw(0);
>      }
>    }
> diff --git a/backend/src/ir/instruction.cpp b/backend/src/ir/instruction.cpp
> index 1e1b040..a2bc875 100644
> --- a/backend/src/ir/instruction.cpp
> +++ b/backend/src/ir/instruction.cpp
> @@ -889,7 +889,6 @@ namespace ir {
>        for (uint32_t srcID = 0; srcID < 2; ++srcID)
>          if (UNLIKELY(checkRegisterData(family, src[srcID], fn, whyNot) == false))
>            return false;
> -      CHECK_TYPE(this->type, allButBool);
>        return true;
>      }
> 
> diff --git a/backend/src/llvm/llvm_gen_backend.cpp
> b/backend/src/llvm/llvm_gen_backend.cpp
> index 43b50e7..238f572 100644
> --- a/backend/src/llvm/llvm_gen_backend.cpp
> +++ b/backend/src/llvm/llvm_gen_backend.cpp
> @@ -2151,8 +2151,6 @@ namespace gbe
>    }
> 
>    void GenWriter::emitICmpInst(ICmpInst &I) {
> -    GBE_ASSERT(I.getOperand(0)->getType() !=
> Type::getInt1Ty(I.getContext()));
> -
>      // Get the element type and the number of elements
>      Type *operandType = I.getOperand(0)->getType();
>      const ir::Type type = getType(ctx, operandType);
> --
> 1.9.1
> 
> _______________________________________________
> Beignet mailing list
> Beignet at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/beignet

From zhigang.gong at intel.com  Mon Mar  2 00:13:37 2015
From: zhigang.gong at intel.com (Gong, Zhigang)
Date: Mon, 2 Mar 2015 08:13:37 +0000
Subject: [Beignet] [PATCH] GBE: support compare two bool variables.
In-Reply-To: <7597C9376C272A4AB2D29E91550B7B0901404045@shsmsx102.ccr.corp.intel.com>
References: <1425270555-26043-1-git-send-email-zhigang.gong@intel.com>
 <7597C9376C272A4AB2D29E91550B7B0901404045@shsmsx102.ccr.corp.intel.com>
Message-ID: <A526C4BAF156B74EA0033E8D9A36E913011BF8BB@SHSMSX103.ccr.corp.intel.com>

That's the tricky part. As far as I know, i1 1 is actually -1 and thus is less than 0.
And you can see it is consistent when you extend i1 to i16, which is a sign extension.

> -----Original Message-----
> From: Yang, Rong R
> Sent: Monday, March 2, 2015 4:11 PM
> To: Gong, Zhigang; beignet at lists.freedesktop.org
> Cc: Gong, Zhigang
> Subject: RE: [Beignet] [PATCH] GBE: support compare two bool variables.
> 
> For the instruction:
> %Pivot = icmp slt i1 %trunc49, false
> 
> And slt is: interprets the operands as signed values and yields true if op1 is less
> than op2.
> 
> But %trunc49 only has 1 bit, How to interprets %trunc49 as signed values? Only
> signed bit?
> 
> When %trunc49's value is true, what should be this instruction's result?
> 
> > -----Original Message-----
> > From: Beignet [mailto:beignet-bounces at lists.freedesktop.org] On Behalf
> > Of Zhigang Gong
> > Sent: Monday, March 2, 2015 12:29
> > To: beignet at lists.freedesktop.org
> > Cc: Gong, Zhigang
> > Subject: [Beignet] [PATCH] GBE: support compare two bool variables.
> >
> > LLVM 3.6 may generate the following instructions:
> >
> > %Pivot = icmp slt i1 %trunc49, false
> >
> > when do siwth lowering pass.
> > To support it we must use GEN_TYPE_W to represent B rather than
> > GEN_TYPE_UW and we also need to remove the corresponding assertions.
> >
> > Signed-off-by: Zhigang Gong <zhigang.gong at intel.com>
> > ---
> >  backend/src/backend/gen_insn_selection.cpp | 4 ++--
> >  backend/src/ir/instruction.cpp             | 1 -
> >  backend/src/llvm/llvm_gen_backend.cpp      | 2 --
> >  3 files changed, 2 insertions(+), 5 deletions(-)
> >
> > diff --git a/backend/src/backend/gen_insn_selection.cpp
> > b/backend/src/backend/gen_insn_selection.cpp
> > index 106ea82..d100f80 100644
> > --- a/backend/src/backend/gen_insn_selection.cpp
> > +++ b/backend/src/backend/gen_insn_selection.cpp
> > @@ -115,7 +115,7 @@ namespace gbe
> >    uint32_t getGenType(ir::Type type) {
> >      using namespace ir;
> >      switch (type) {
> > -      case TYPE_BOOL: return GEN_TYPE_UW;
> > +      case TYPE_BOOL: return GEN_TYPE_W;
> >        case TYPE_S8: return GEN_TYPE_B;
> >        case TYPE_U8: return GEN_TYPE_UB;
> >        case TYPE_S16: return GEN_TYPE_W; @@ -1956,7 +1956,7 @@
> > namespace gbe
> >        case TYPE_U8:  return
> GenRegister::immuw(imm.getIntegerValue()
> > * sign);
> >        case TYPE_S8:  return
> > GenRegister::immw((int8_t)imm.getIntegerValue()
> > * sign);
> >        case TYPE_DOUBLE: return
> > GenRegister::immdf(imm.getDoubleValue() * sign);
> > -      case TYPE_BOOL: return
> GenRegister::immuw(-imm.getIntegerValue());
> > //return 0xffff when true
> > +      case TYPE_BOOL: return
> GenRegister::immw((imm.getIntegerValue()
> > + == 0) ? 0 : -1);  //return 0xffff when true
> >        default: NOT_SUPPORTED; return GenRegister::immuw(0);
> >      }
> >    }
> > diff --git a/backend/src/ir/instruction.cpp
> > b/backend/src/ir/instruction.cpp index 1e1b040..a2bc875 100644
> > --- a/backend/src/ir/instruction.cpp
> > +++ b/backend/src/ir/instruction.cpp
> > @@ -889,7 +889,6 @@ namespace ir {
> >        for (uint32_t srcID = 0; srcID < 2; ++srcID)
> >          if (UNLIKELY(checkRegisterData(family, src[srcID], fn, whyNot) ==
> false))
> >            return false;
> > -      CHECK_TYPE(this->type, allButBool);
> >        return true;
> >      }
> >
> > diff --git a/backend/src/llvm/llvm_gen_backend.cpp
> > b/backend/src/llvm/llvm_gen_backend.cpp
> > index 43b50e7..238f572 100644
> > --- a/backend/src/llvm/llvm_gen_backend.cpp
> > +++ b/backend/src/llvm/llvm_gen_backend.cpp
> > @@ -2151,8 +2151,6 @@ namespace gbe
> >    }
> >
> >    void GenWriter::emitICmpInst(ICmpInst &I) {
> > -    GBE_ASSERT(I.getOperand(0)->getType() !=
> > Type::getInt1Ty(I.getContext()));
> > -
> >      // Get the element type and the number of elements
> >      Type *operandType = I.getOperand(0)->getType();
> >      const ir::Type type = getType(ctx, operandType);
> > --
> > 1.9.1
> >
> > _______________________________________________
> > Beignet mailing list
> > Beignet at lists.freedesktop.org
> > http://lists.freedesktop.org/mailman/listinfo/beignet

From rong.r.yang at intel.com  Mon Mar  2 00:17:11 2015
From: rong.r.yang at intel.com (Yang, Rong R)
Date: Mon, 2 Mar 2015 08:17:11 +0000
Subject: [Beignet] [PATCH] Build: use -Bsymbolic to fix conflicts with
 other	LLVM users.
In-Reply-To: <1425023845-10956-1-git-send-email-zhigang.gong@intel.com>
References: <1425023845-10956-1-git-send-email-zhigang.gong@intel.com>
Message-ID: <7597C9376C272A4AB2D29E91550B7B090140406C@shsmsx102.ccr.corp.intel.com>

LGTM.

> -----Original Message-----
> From: Beignet [mailto:beignet-bounces at lists.freedesktop.org] On Behalf Of
> Zhigang Gong
> Sent: Friday, February 27, 2015 15:57
> To: beignet at lists.freedesktop.org
> Cc: Gong, Zhigang
> Subject: [Beignet] [PATCH] Build: use -Bsymbolic to fix conflicts with other
> LLVM users.
> 
> As there may be some other LLVM users such as mesa, and they may link to
> different LLVM library. To avoid such type of conflicts, we use -Bsymbolic to
> disable the symbol preemption.
> 
> This patch should fix the build bug at:
> https://bugs.freedesktop.org/show_bug.cgi?id=89325
> 
> Signed-off-by: Zhigang Gong <zhigang.gong at intel.com>
> ---
>  CMakeLists.txt | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/CMakeLists.txt b/CMakeLists.txt index b731973..88ff872 100644
> --- a/CMakeLists.txt
> +++ b/CMakeLists.txt
> @@ -86,7 +86,7 @@ ELSE (USE_STANDALONE_GBE_COMPILER STREQUAL
> "true")  ENDIF (USE_STANDALONE_GBE_COMPILER STREQUAL "true")
> 
> 
> -set (CMAKE_SHARED_LINKER_FLAGS "-Wl,--no-undefined
> ${LLVM_LDFLAGS}")
> +set (CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS}
> +-Wl,-Bsymbolic -Wl,--no-undefined ${LLVM_LDFLAGS}")
> 
>  # XLib
>  Find_Package(X11)
> --
> 1.9.1
> 
> _______________________________________________
> Beignet mailing list
> Beignet at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/beignet

From zhigang.gong at intel.com  Sun Mar  1 23:23:57 2015
From: zhigang.gong at intel.com (Zhigang Gong)
Date: Mon,  2 Mar 2015 15:23:57 +0800
Subject: [Beignet] [PATCH] GBE: add fastcall support.
Message-ID: <1425281037-22378-1-git-send-email-zhigang.gong@intel.com>

I found that some optimization passes may add the fastcall attribute to some
builtin functions. We need to add the corresponding support.

Signed-off-by: Zhigang Gong <zhigang.gong at intel.com>
---
 backend/src/llvm/llvm_gen_backend.cpp   | 4 +++-
 backend/src/llvm/llvm_printf_parser.cpp | 1 +
 backend/src/llvm/llvm_scalarize.cpp     | 4 +++-
 3 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/backend/src/llvm/llvm_gen_backend.cpp b/backend/src/llvm/llvm_gen_backend.cpp
index 238f572..aad638f 100644
--- a/backend/src/llvm/llvm_gen_backend.cpp
+++ b/backend/src/llvm/llvm_gen_backend.cpp
@@ -2021,9 +2021,11 @@ namespace gbe
       case CallingConv::PTX_Kernel:
 #else
       case CallingConv::C:
+      case CallingConv::Fast:
 #endif
         break;
-      default: GBE_ASSERTM(false, "Unsupported calling convention");
+      default:
+        GBE_ASSERTM(false, "Unsupported calling convention");
     }
 
     ctx.startFunction(F.getName());
diff --git a/backend/src/llvm/llvm_printf_parser.cpp b/backend/src/llvm/llvm_printf_parser.cpp
index 52da2e5..8e662b3 100644
--- a/backend/src/llvm/llvm_printf_parser.cpp
+++ b/backend/src/llvm/llvm_printf_parser.cpp
@@ -564,6 +564,7 @@ error:
       case CallingConv::PTX_Kernel:
 #else
       case CallingConv::C:
+      case CallingConv::Fast:
 #endif
         break;
       default:
diff --git a/backend/src/llvm/llvm_scalarize.cpp b/backend/src/llvm/llvm_scalarize.cpp
index 4df849f..97a7615 100644
--- a/backend/src/llvm/llvm_scalarize.cpp
+++ b/backend/src/llvm/llvm_scalarize.cpp
@@ -788,9 +788,11 @@ namespace gbe {
     case CallingConv::PTX_Kernel:
 #else
     case CallingConv::C:
+    case CallingConv::Fast:
 #endif
       break;
-    default: GBE_ASSERTM(false, "Unsupported calling convention");
+    default:
+      GBE_ASSERTM(false, "Unsupported calling convention");
     }
 
     // As we inline all function calls, so skip non-kernel functions
-- 
1.9.1


From rong.r.yang at intel.com  Mon Mar  2 00:23:55 2015
From: rong.r.yang at intel.com (Yang, Rong R)
Date: Mon, 2 Mar 2015 08:23:55 +0000
Subject: [Beignet] [PATCH] GBE: support compare two bool variables.
In-Reply-To: <A526C4BAF156B74EA0033E8D9A36E913011BF8BB@SHSMSX103.ccr.corp.intel.com>
References: <1425270555-26043-1-git-send-email-zhigang.gong@intel.com>
 <7597C9376C272A4AB2D29E91550B7B0901404045@shsmsx102.ccr.corp.intel.com>
 <A526C4BAF156B74EA0033E8D9A36E913011BF8BB@SHSMSX103.ccr.corp.intel.com>
Message-ID: <7597C9376C272A4AB2D29E91550B7B0901404085@shsmsx102.ccr.corp.intel.com>

OK, the patch is good to me.

> -----Original Message-----
> From: Gong, Zhigang
> Sent: Monday, March 2, 2015 16:14
> To: Yang, Rong R; beignet at lists.freedesktop.org
> Subject: RE: [Beignet] [PATCH] GBE: support compare two bool variables.
> 
> That's the tricky part. As far as I know, i1 1 is actually -1 and thus is less than 0.
> And you can see it is consistent when you extend i1 to i16, which is a signed
> extension.
> 
> > -----Original Message-----
> > From: Yang, Rong R
> > Sent: Monday, March 2, 2015 4:11 PM
> > To: Gong, Zhigang; beignet at lists.freedesktop.org
> > Cc: Gong, Zhigang
> > Subject: RE: [Beignet] [PATCH] GBE: support compare two bool variables.
> >
> > For the instruction:
> > %Pivot = icmp slt i1 %trunc49, false
> >
> > And slt is: interprets the operands as signed values and yields true
> > if op1 is less than op2.
> >
> > But %trunc49 only has 1 bit. How should %trunc49 be interpreted as a signed
> > value? As only a sign bit?
> >
> > When %trunc49's value is true, what should be this instruction's result?
> >
> > > -----Original Message-----
> > > From: Beignet [mailto:beignet-bounces at lists.freedesktop.org] On
> > > Behalf Of Zhigang Gong
> > > Sent: Monday, March 2, 2015 12:29
> > > To: beignet at lists.freedesktop.org
> > > Cc: Gong, Zhigang
> > > Subject: [Beignet] [PATCH] GBE: support compare two bool variables.
> > >
> > > LLVM 3.6 may generate the following instructions:
> > >
> > > %Pivot = icmp slt i1 %trunc49, false
> > >
> > > > when doing the switch lowering pass.
> > > To support it we must use GEN_TYPE_W to represent B rather than
> > > GEN_TYPE_UW and we also need to remove the corresponding
> assertions.
> > >
> > > Signed-off-by: Zhigang Gong <zhigang.gong at intel.com>
> > > ---
> > >  backend/src/backend/gen_insn_selection.cpp | 4 ++--
> > >  backend/src/ir/instruction.cpp             | 1 -
> > >  backend/src/llvm/llvm_gen_backend.cpp      | 2 --
> > >  3 files changed, 2 insertions(+), 5 deletions(-)
> > >
> > > diff --git a/backend/src/backend/gen_insn_selection.cpp
> > > b/backend/src/backend/gen_insn_selection.cpp
> > > index 106ea82..d100f80 100644
> > > --- a/backend/src/backend/gen_insn_selection.cpp
> > > +++ b/backend/src/backend/gen_insn_selection.cpp
> > > @@ -115,7 +115,7 @@ namespace gbe
> > >    uint32_t getGenType(ir::Type type) {
> > >      using namespace ir;
> > >      switch (type) {
> > > -      case TYPE_BOOL: return GEN_TYPE_UW;
> > > +      case TYPE_BOOL: return GEN_TYPE_W;
> > >        case TYPE_S8: return GEN_TYPE_B;
> > >        case TYPE_U8: return GEN_TYPE_UB;
> > >        case TYPE_S16: return GEN_TYPE_W; @@ -1956,7 +1956,7 @@
> > > namespace gbe
> > >        case TYPE_U8:  return
> > GenRegister::immuw(imm.getIntegerValue()
> > > * sign);
> > >        case TYPE_S8:  return
> > > GenRegister::immw((int8_t)imm.getIntegerValue()
> > > * sign);
> > >        case TYPE_DOUBLE: return
> > > GenRegister::immdf(imm.getDoubleValue() * sign);
> > > -      case TYPE_BOOL: return
> > GenRegister::immuw(-imm.getIntegerValue());
> > > //return 0xffff when true
> > > +      case TYPE_BOOL: return
> > GenRegister::immw((imm.getIntegerValue()
> > > + == 0) ? 0 : -1);  //return 0xffff when true
> > >        default: NOT_SUPPORTED; return GenRegister::immuw(0);
> > >      }
> > >    }
> > > diff --git a/backend/src/ir/instruction.cpp
> > > b/backend/src/ir/instruction.cpp index 1e1b040..a2bc875 100644
> > > --- a/backend/src/ir/instruction.cpp
> > > +++ b/backend/src/ir/instruction.cpp
> > > @@ -889,7 +889,6 @@ namespace ir {
> > >        for (uint32_t srcID = 0; srcID < 2; ++srcID)
> > >          if (UNLIKELY(checkRegisterData(family, src[srcID], fn,
> > > whyNot) ==
> > false))
> > >            return false;
> > > -      CHECK_TYPE(this->type, allButBool);
> > >        return true;
> > >      }
> > >
> > > diff --git a/backend/src/llvm/llvm_gen_backend.cpp
> > > b/backend/src/llvm/llvm_gen_backend.cpp
> > > index 43b50e7..238f572 100644
> > > --- a/backend/src/llvm/llvm_gen_backend.cpp
> > > +++ b/backend/src/llvm/llvm_gen_backend.cpp
> > > @@ -2151,8 +2151,6 @@ namespace gbe
> > >    }
> > >
> > >    void GenWriter::emitICmpInst(ICmpInst &I) {
> > > -    GBE_ASSERT(I.getOperand(0)->getType() !=
> > > Type::getInt1Ty(I.getContext()));
> > > -
> > >      // Get the element type and the number of elements
> > >      Type *operandType = I.getOperand(0)->getType();
> > >      const ir::Type type = getType(ctx, operandType);
> > > --
> > > 1.9.1
> > >
> > > _______________________________________________
> > > Beignet mailing list
> > > Beignet at lists.freedesktop.org
> > > http://lists.freedesktop.org/mailman/listinfo/beignet

From rong.r.yang at intel.com  Mon Mar  2 00:34:12 2015
From: rong.r.yang at intel.com (Yang, Rong R)
Date: Mon, 2 Mar 2015 08:34:12 +0000
Subject: [Beignet] [PATCH] GBE: add fastcall support.
In-Reply-To: <1425281037-22378-1-git-send-email-zhigang.gong@intel.com>
References: <1425281037-22378-1-git-send-email-zhigang.gong@intel.com>
Message-ID: <7597C9376C272A4AB2D29E91550B7B09014040A8@shsmsx102.ccr.corp.intel.com>

LGTM, thanks.

> -----Original Message-----
> From: Beignet [mailto:beignet-bounces at lists.freedesktop.org] On Behalf Of
> Zhigang Gong
> Sent: Monday, March 2, 2015 15:24
> To: beignet at lists.freedesktop.org
> Cc: Gong, Zhigang
> Subject: [Beignet] [PATCH] GBE: add fastcall support.
> 
> I found some optimization pass may add fastcall attribute to some builtin
> functions. We need to add the corresponding support.
> 
> Signed-off-by: Zhigang Gong <zhigang.gong at intel.com>
> ---
>  backend/src/llvm/llvm_gen_backend.cpp   | 4 +++-
>  backend/src/llvm/llvm_printf_parser.cpp | 1 +
>  backend/src/llvm/llvm_scalarize.cpp     | 4 +++-
>  3 files changed, 7 insertions(+), 2 deletions(-)
> 
> diff --git a/backend/src/llvm/llvm_gen_backend.cpp
> b/backend/src/llvm/llvm_gen_backend.cpp
> index 238f572..aad638f 100644
> --- a/backend/src/llvm/llvm_gen_backend.cpp
> +++ b/backend/src/llvm/llvm_gen_backend.cpp
> @@ -2021,9 +2021,11 @@ namespace gbe
>        case CallingConv::PTX_Kernel:
>  #else
>        case CallingConv::C:
> +      case CallingConv::Fast:
>  #endif
>          break;
> -      default: GBE_ASSERTM(false, "Unsupported calling convention");
> +      default:
> +        GBE_ASSERTM(false, "Unsupported calling convention");
>      }
> 
>      ctx.startFunction(F.getName());
> diff --git a/backend/src/llvm/llvm_printf_parser.cpp
> b/backend/src/llvm/llvm_printf_parser.cpp
> index 52da2e5..8e662b3 100644
> --- a/backend/src/llvm/llvm_printf_parser.cpp
> +++ b/backend/src/llvm/llvm_printf_parser.cpp
> @@ -564,6 +564,7 @@ error:
>        case CallingConv::PTX_Kernel:
>  #else
>        case CallingConv::C:
> +      case CallingConv::Fast:
>  #endif
>          break;
>        default:
> diff --git a/backend/src/llvm/llvm_scalarize.cpp
> b/backend/src/llvm/llvm_scalarize.cpp
> index 4df849f..97a7615 100644
> --- a/backend/src/llvm/llvm_scalarize.cpp
> +++ b/backend/src/llvm/llvm_scalarize.cpp
> @@ -788,9 +788,11 @@ namespace gbe {
>      case CallingConv::PTX_Kernel:
>  #else
>      case CallingConv::C:
> +    case CallingConv::Fast:
>  #endif
>        break;
> -    default: GBE_ASSERTM(false, "Unsupported calling convention");
> +    default:
> +      GBE_ASSERTM(false, "Unsupported calling convention");
>      }
> 
>      // As we inline all function calls, so skip non-kernel functions
> --
> 1.9.1
> 
> _______________________________________________
> Beignet mailing list
> Beignet at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/beignet

From jeff.mcgee at intel.com  Mon Mar  2 15:37:32 2015
From: jeff.mcgee at intel.com (jeff.mcgee at intel.com)
Date: Mon,  2 Mar 2015 15:37:32 -0800
Subject: [Beignet] [PATCH] drm/i915: Export total subslice and EU counts
Message-ID: <1425339452-18875-1-git-send-email-jeff.mcgee@intel.com>

From: Jeff McGee <jeff.mcgee at intel.com>

Setup new I915_GETPARAM ioctl entries for subslice total and
EU total. Userspace drivers need these values when constructing
GPGPU commands. This kernel query method is intended to replace
the PCI ID-based tables that userspace drivers currently maintain.
The kernel driver can employ fuse register reads as needed to
ensure the most accurate determination of GT config attributes.
This first became important with Cherryview in which the config
could differ between devices with the same PCI ID.

The kernel detection of these values is device-specific and not
included in this patch. Because zero is not a valid value for any of
these parameters, a value of zero is interpreted as unknown for the
device. Userspace drivers should continue to maintain ID-based tables
for older devices not supported by the new query method.

For: VIZ-4636
Signed-off-by: Jeff McGee <jeff.mcgee at intel.com>
---
 drivers/gpu/drm/i915/i915_dma.c | 10 ++++++++++
 include/uapi/drm/i915_drm.h     |  2 ++
 2 files changed, 12 insertions(+)

diff --git a/drivers/gpu/drm/i915/i915_dma.c b/drivers/gpu/drm/i915/i915_dma.c
index 053e178..9350ea2 100644
--- a/drivers/gpu/drm/i915/i915_dma.c
+++ b/drivers/gpu/drm/i915/i915_dma.c
@@ -150,6 +150,16 @@ static int i915_getparam(struct drm_device *dev, void *data,
 	case I915_PARAM_MMAP_VERSION:
 		value = 1;
 		break;
+	case I915_PARAM_SUBSLICE_TOTAL:
+		value = INTEL_INFO(dev)->subslice_total;
+		if (!value)
+			return -ENODEV;
+		break;
+	case I915_PARAM_EU_TOTAL:
+		value = INTEL_INFO(dev)->eu_total;
+		if (!value)
+			return -ENODEV;
+		break;
 	default:
 		DRM_DEBUG("Unknown parameter %d\n", param->param);
 		return -EINVAL;
diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
index 6eed16b..8672efc 100644
--- a/include/uapi/drm/i915_drm.h
+++ b/include/uapi/drm/i915_drm.h
@@ -347,6 +347,8 @@ typedef struct drm_i915_irq_wait {
 #define I915_PARAM_HAS_COHERENT_PHYS_GTT 29
 #define I915_PARAM_MMAP_VERSION          30
 #define I915_PARAM_HAS_BSD2		 31
+#define I915_PARAM_SUBSLICE_TOTAL	 32
+#define I915_PARAM_EU_TOTAL		 33
 
 typedef struct drm_i915_getparam {
 	int param;
-- 
2.3.0


From jeff.mcgee at intel.com  Mon Mar  2 15:39:27 2015
From: jeff.mcgee at intel.com (jeff.mcgee at intel.com)
Date: Mon,  2 Mar 2015 15:39:27 -0800
Subject: [Beignet] [PATCH] intel: Export total subslice and EU counts
Message-ID: <1425339567-18933-1-git-send-email-jeff.mcgee@intel.com>

From: Jeff McGee <jeff.mcgee at intel.com>

Update kernel interface with new I915_GETPARAM ioctl entries for
subslice total and EU total. Add a wrapping function for each
parameter. Userspace drivers need these values when constructing
GPGPU commands. This kernel query method is intended to replace
the PCI ID-based tables that userspace drivers currently maintain.
The kernel driver can employ fuse register reads as needed to
ensure the most accurate determination of GT config attributes.
This first became important with Cherryview in which the config
could differ between devices with the same PCI ID.

The kernel detection of these values is device-specific. Userspace
drivers should continue to maintain ID-based tables for older
devices which return ENODEV when using this query.

For: VIZ-4636
Signed-off-by: Jeff McGee <jeff.mcgee at intel.com>
---
 include/drm/i915_drm.h   |  2 ++
 intel/intel_bufmgr.h     |  4 ++++
 intel/intel_bufmgr_gem.c | 31 +++++++++++++++++++++++++++++++
 3 files changed, 37 insertions(+)

diff --git a/include/drm/i915_drm.h b/include/drm/i915_drm.h
index 15dd01d..e34f5b2 100644
--- a/include/drm/i915_drm.h
+++ b/include/drm/i915_drm.h
@@ -340,6 +340,8 @@ typedef struct drm_i915_irq_wait {
 #define I915_PARAM_HAS_EXEC_HANDLE_LUT   26
 #define I915_PARAM_HAS_WT     	 	 27
 #define I915_PARAM_CMD_PARSER_VERSION	 28
+#define I915_PARAM_SUBSLICE_TOTAL	 32
+#define I915_PARAM_EU_TOTAL		 33
 
 typedef struct drm_i915_getparam {
 	int param;
diff --git a/intel/intel_bufmgr.h b/intel/intel_bufmgr.h
index be83a56..4b2472e 100644
--- a/intel/intel_bufmgr.h
+++ b/intel/intel_bufmgr.h
@@ -37,6 +37,7 @@
 #include <stdio.h>
 #include <stdint.h>
 #include <stdio.h>
+#include <stdbool.h>
 
 struct drm_clip_rect;
 
@@ -264,6 +265,9 @@ int drm_intel_get_reset_stats(drm_intel_context *ctx,
 			      uint32_t *active,
 			      uint32_t *pending);
 
+int drm_intel_get_subslice_total(int fd, unsigned int *subslice_total);
+int drm_intel_get_eu_total(int fd, unsigned int *eu_total);
+
 /** @{ Compatibility defines to keep old code building despite the symbol rename
  * from dri_* to drm_intel_*
  */
diff --git a/intel/intel_bufmgr_gem.c b/intel/intel_bufmgr_gem.c
index 78875fd..2d77f32 100644
--- a/intel/intel_bufmgr_gem.c
+++ b/intel/intel_bufmgr_gem.c
@@ -3292,6 +3292,37 @@ drm_intel_reg_read(drm_intel_bufmgr *bufmgr,
 	return ret;
 }
 
+drm_public int
+drm_intel_get_subslice_total(int fd, unsigned int *subslice_total)
+{
+	drm_i915_getparam_t gp;
+	int ret;
+
+	memclear(gp);
+	gp.value = (int*)subslice_total;
+	gp.param = I915_PARAM_SUBSLICE_TOTAL;
+	ret = drmIoctl(fd, DRM_IOCTL_I915_GETPARAM, &gp);
+	if (ret)
+		return -errno;
+
+	return 0;
+}
+
+drm_public int
+drm_intel_get_eu_total(int fd, unsigned int *eu_total)
+{
+	drm_i915_getparam_t gp;
+	int ret;
+
+	memclear(gp);
+	gp.value = (int*)eu_total;
+	gp.param = I915_PARAM_EU_TOTAL;
+	ret = drmIoctl(fd, DRM_IOCTL_I915_GETPARAM, &gp);
+	if (ret)
+		return -errno;
+
+	return 0;
+}
 
 /**
  * Annotate the given bo for use in aub dumping.
-- 
2.3.0


From jeff.mcgee at intel.com  Mon Mar  2 15:40:42 2015
From: jeff.mcgee at intel.com (jeff.mcgee at intel.com)
Date: Mon,  2 Mar 2015 15:40:42 -0800
Subject: [Beignet] [PATCH] tests/core_getparams: Create new test
	core_getparams
Message-ID: <1425339642-18988-1-git-send-email-jeff.mcgee@intel.com>

From: Jeff McGee <jeff.mcgee at intel.com>

New test core_getparams consists of 2 subtests, each one testing
the ability of userspace to query the correct value of a GT config
attribute: subslice total or EU total. drm/i915 implementation of
these queries is required for Cherryview and Gen9+ devices (non-
simulated).

For: VIZ-4636
Signed-off-by: Jeff McGee <jeff.mcgee at intel.com>
---
 tests/.gitignore       |   1 +
 tests/Makefile.sources |   1 +
 tests/core_getparams.c | 145 +++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 147 insertions(+)
 create mode 100644 tests/core_getparams.c

diff --git a/tests/.gitignore b/tests/.gitignore
index 7b4dd94..39b4e28 100644
--- a/tests/.gitignore
+++ b/tests/.gitignore
@@ -1,6 +1,7 @@
 # Please keep sorted alphabetically
 core_get_client_auth
 core_getclient
+core_getparams
 core_getstats
 core_getversion
 drm_import_export
diff --git a/tests/Makefile.sources b/tests/Makefile.sources
index 51e8376..999c8f8 100644
--- a/tests/Makefile.sources
+++ b/tests/Makefile.sources
@@ -15,6 +15,7 @@ NOUVEAU_TESTS_M = \
 
 TESTS_progs_M = \
 	core_get_client_auth \
+	core_getparams \
 	drv_suspend \
 	drv_hangman \
 	gem_bad_reloc \
diff --git a/tests/core_getparams.c b/tests/core_getparams.c
new file mode 100644
index 0000000..37a4f63
--- /dev/null
+++ b/tests/core_getparams.c
@@ -0,0 +1,145 @@
+/*
+ * Copyright © 2014 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Jeff McGee <jeff.mcgee at intel.com>
+ *
+ */
+
+#include <unistd.h>
+#include <errno.h>
+#include <xf86drm.h>
+#include "drmtest.h"
+#include "intel_chipset.h"
+#include "intel_bufmgr.h"
+
+int drm_fd;
+int devid;
+
+static void
+init(void)
+{
+	drm_fd = drm_open_any();
+	devid = intel_get_drm_devid(drm_fd);
+}
+
+static void
+deinit(void)
+{
+	close(drm_fd);
+}
+
+static void
+subslice_total(void)
+{
+	unsigned int subslice_total = 0;
+	int ret;
+
+	ret = drm_intel_get_subslice_total(drm_fd, &subslice_total);
+
+	if (ret) {
+		/*
+		 * These devices are not required to implement the
+		 * interface. If they do not, -ENODEV must be returned.
+		*/
+		if ((intel_gen(devid) < 8) ||
+		    IS_BROADWELL(devid) ||
+		    igt_run_in_simulation()) {
+			igt_assert(ret == -ENODEV);
+			igt_info("subslice total: unknown\n");
+		/*
+		 * All other devices must implement the interface, so
+		 * fail them if we are here.
+		*/
+		} else {
+			igt_assert(ret != EINVAL); /* request not recognized? */
+			igt_assert(ret != ENODEV); /* device not supported? */
+			igt_assert(ret == 0); /* other error? */
+		}
+	} else {
+		/*
+		 * On success, just make sure the returned count value is
+		 * non-zero. The validity of the count value for the given
+		 * device is not checked.
+		*/
+		igt_assert(subslice_total != 0);
+		igt_info("subslice total: %u\n", subslice_total);
+	}
+}
+
+static void
+eu_total(void)
+{
+	unsigned int eu_total = 0;
+	int ret;
+
+	ret = drm_intel_get_eu_total(drm_fd, &eu_total);
+
+	if (ret) {
+		/*
+		 * These devices are not required to implement the
+		 * interface. If they do not, -ENODEV must be returned.
+		*/
+		if ((intel_gen(devid) < 8) ||
+		    IS_BROADWELL(devid) ||
+		    igt_run_in_simulation()) {
+			igt_assert(ret == -ENODEV);
+			igt_info("EU total: unknown\n");
+		/*
+		 * All other devices must implement the interface, so
+		 * fail them if we are here.
+		*/
+		} else {
+			igt_assert(ret != EINVAL); /* request not recognized? */
+			igt_assert(ret != ENODEV); /* device not supported? */
+			igt_assert(ret == 0); /* other error? */
+		}
+	} else {
+		/*
+		 * On success, just make sure the returned count value is
+		 * non-zero. The validity of the count value for the given
+		 * device is not checked.
+		*/
+		igt_assert(eu_total != 0);
+		igt_info("EU total: %u\n", eu_total);
+	}
+}
+
+static void
+exit_handler(int sig)
+{
+	deinit();
+}
+
+igt_main
+{
+	igt_fixture {
+		igt_install_exit_handler(exit_handler);
+		init();
+	}
+
+	igt_subtest("subslice-total")
+		subslice_total();
+
+	igt_subtest("eu-total")
+		eu_total();
+}
-- 
2.3.0


From jeff.mcgee at intel.com  Mon Mar  2 15:42:38 2015
From: jeff.mcgee at intel.com (jeff.mcgee at intel.com)
Date: Mon,  2 Mar 2015 15:42:38 -0800
Subject: [Beignet] [PATCH 1/2] Add driver callback for updating device info
Message-ID: <1425339759-19027-1-git-send-email-jeff.mcgee@intel.com>

From: Jeff McGee <jeff.mcgee at intel.com>

We need to update some fields of the device's cl_device_id
struct at runtime using driver-specific methods. It is best to
group all such updates into a single driver callback to avoid
opening/initing and deiniting/closing the device multiple times.

Signed-off-by: Jeff McGee <jeff.mcgee at intel.com>
---
 src/cl_device_id.c       | 20 ++------------------
 src/cl_driver.h          |  4 ++++
 src/cl_driver_defs.c     |  1 +
 src/intel/intel_driver.c | 36 ++++++++++++++++++++++++++++++++++++
 4 files changed, 43 insertions(+), 18 deletions(-)

diff --git a/src/cl_device_id.c b/src/cl_device_id.c
index 4e01c9f..fefcef3 100644
--- a/src/cl_device_id.c
+++ b/src/cl_device_id.c
@@ -506,24 +506,8 @@ skl_gt4_break:
     ret->profile_sz = strlen(ret->profile) + 1;
   }
 
-#ifdef HAS_USERPTR
-  cl_driver dummy = cl_driver_new(NULL);
-  cl_buffer_mgr bufmgr = cl_driver_get_bufmgr(dummy);
-
-  const size_t sz = 4096;
-  void* host_ptr = cl_aligned_malloc(sz, 4096);;
-  if (host_ptr != NULL) {
-    cl_buffer bo = cl_buffer_alloc_userptr(bufmgr, "CL memory object", host_ptr, sz, 0);
-    if (bo == NULL)
-      ret->host_unified_memory = CL_FALSE;
-    else
-      cl_buffer_unreference(bo);
-    cl_free(host_ptr);
-  }
-  else
-    ret->host_unified_memory = CL_FALSE;
-  cl_driver_delete(dummy);
-#endif
+  /* Apply any driver-dependent updates to the device info */
+  cl_driver_update_device_info(ret);
 
   struct sysinfo info;
   if (sysinfo(&info) == 0) {
diff --git a/src/cl_driver.h b/src/cl_driver.h
index 16f8bba..3f54a27 100644
--- a/src/cl_driver.h
+++ b/src/cl_driver.h
@@ -376,6 +376,10 @@ extern cl_buffer_get_tiling_align_cb *cl_buffer_get_tiling_align;
 typedef int (cl_driver_get_device_id_cb)(void);
 extern cl_driver_get_device_id_cb *cl_driver_get_device_id;
 
+/* Update the device info */
+typedef void (cl_driver_update_device_info_cb)(cl_device_id device);
+extern cl_driver_update_device_info_cb *cl_driver_update_device_info;
+
 /**************************************************************************
  * cl_khr_gl_sharing.
  **************************************************************************/
diff --git a/src/cl_driver_defs.c b/src/cl_driver_defs.c
index 2b68539..9a47210 100644
--- a/src/cl_driver_defs.c
+++ b/src/cl_driver_defs.c
@@ -26,6 +26,7 @@ LOCAL cl_driver_delete_cb *cl_driver_delete = NULL;
 LOCAL cl_driver_get_bufmgr_cb *cl_driver_get_bufmgr = NULL;
 LOCAL cl_driver_get_ver_cb *cl_driver_get_ver = NULL;
 LOCAL cl_driver_get_device_id_cb *cl_driver_get_device_id = NULL;
+LOCAL cl_driver_update_device_info_cb *cl_driver_update_device_info = NULL;
 
 /* Buffer */
 LOCAL cl_buffer_alloc_cb *cl_buffer_alloc = NULL;
diff --git a/src/intel/intel_driver.c b/src/intel/intel_driver.c
index ff0cf27..d61988c 100644
--- a/src/intel/intel_driver.c
+++ b/src/intel/intel_driver.c
@@ -754,6 +754,41 @@ static int intel_buffer_set_tiling(cl_buffer bo,
   return ret;
 }
 
+static void
+intel_update_device_info(cl_device_id device)
+{
+#ifdef HAS_USERPTR
+  intel_driver_t *driver;
+  const size_t sz = 4096;
+  void *host_ptr;
+
+  driver = intel_driver_new();
+  assert(driver != NULL);
+  if (intel_driver_open(driver, NULL) != CL_SUCCESS) {
+    intel_driver_delete(driver);
+    return;
+  }
+
+  host_ptr = cl_aligned_malloc(sz, 4096);
+  if (host_ptr != NULL) {
+    cl_buffer bo = intel_buffer_alloc_userptr((cl_buffer_mgr)driver->bufmgr,
+      "CL memory object", host_ptr, sz, 0);
+    if (bo == NULL)
+      device->host_unified_memory = CL_FALSE;
+    else
+      drm_intel_bo_unreference((drm_intel_bo*)bo);
+    cl_free(host_ptr);
+  }
+  else
+    device->host_unified_memory = CL_FALSE;
+
+  intel_driver_context_destroy(driver);
+  intel_driver_close(driver);
+  intel_driver_terminate(driver);
+  intel_driver_delete(driver);
+#endif
+}
+
 LOCAL void
 intel_setup_callbacks(void)
 {
@@ -762,6 +797,7 @@ intel_setup_callbacks(void)
   cl_driver_get_ver = (cl_driver_get_ver_cb *) intel_driver_get_ver;
   cl_driver_get_bufmgr = (cl_driver_get_bufmgr_cb *) intel_driver_get_bufmgr;
   cl_driver_get_device_id = (cl_driver_get_device_id_cb *) intel_get_device_id;
+  cl_driver_update_device_info = (cl_driver_update_device_info_cb *) intel_update_device_info;
   cl_buffer_alloc = (cl_buffer_alloc_cb *) drm_intel_bo_alloc;
   cl_buffer_alloc_userptr = (cl_buffer_alloc_userptr_cb*) intel_buffer_alloc_userptr;
   cl_buffer_set_tiling = (cl_buffer_set_tiling_cb *) intel_buffer_set_tiling;
-- 
2.3.0


From jeff.mcgee at intel.com  Mon Mar  2 15:42:39 2015
From: jeff.mcgee at intel.com (jeff.mcgee at intel.com)
Date: Mon,  2 Mar 2015 15:42:39 -0800
Subject: [Beignet] [PATCH 2/2] Query the driver directly for compute units
	and subslice
In-Reply-To: <1425339759-19027-1-git-send-email-jeff.mcgee@intel.com>
References: <1425339759-19027-1-git-send-email-jeff.mcgee@intel.com>
Message-ID: <1425339759-19027-2-git-send-email-jeff.mcgee@intel.com>

From: Jeff McGee <jeff.mcgee at intel.com>

Values of device max compute units and max subslice obtained
directly from the driver should be more accurate than our own
ID-based lookup values. This is particularly important when a
single device ID may encompass more than one configuration. If
the driver cannot provide a valid value for the given device,
we fallback on the ID-based lookup value.

Signed-off-by: Jeff McGee <jeff.mcgee at intel.com>
---
 src/intel/intel_driver.c | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/src/intel/intel_driver.c b/src/intel/intel_driver.c
index d61988c..d99fea9 100644
--- a/src/intel/intel_driver.c
+++ b/src/intel/intel_driver.c
@@ -757,10 +757,8 @@ static int intel_buffer_set_tiling(cl_buffer bo,
 static void
 intel_update_device_info(cl_device_id device)
 {
-#ifdef HAS_USERPTR
   intel_driver_t *driver;
-  const size_t sz = 4096;
-  void *host_ptr;
+  unsigned int eu_total, subslice_total;
 
   driver = intel_driver_new();
   assert(driver != NULL);
@@ -769,6 +767,10 @@ intel_update_device_info(cl_device_id device)
     return;
   }
 
+#ifdef HAS_USERPTR
+  const size_t sz = 4096;
+  void *host_ptr;
+
   host_ptr = cl_aligned_malloc(sz, 4096);
   if (host_ptr != NULL) {
     cl_buffer bo = intel_buffer_alloc_userptr((cl_buffer_mgr)driver->bufmgr,
@@ -781,12 +783,18 @@ intel_update_device_info(cl_device_id device)
   }
   else
     device->host_unified_memory = CL_FALSE;
+#endif
+
+  /* Prefer driver-queried value if supported */
+  if (!drm_intel_get_eu_total(driver->fd, &eu_total))
+    device->max_compute_unit = eu_total;
+  if (!drm_intel_get_subslice_total(driver->fd, &subslice_total))
+    device->sub_slice_count = subslice_total;
 
   intel_driver_context_destroy(driver);
   intel_driver_close(driver);
   intel_driver_terminate(driver);
   intel_driver_delete(driver);
-#endif
 }
 
 LOCAL void
-- 
2.3.0


From jeff.mcgee at intel.com  Mon Mar  2 17:26:10 2015
From: jeff.mcgee at intel.com (Jeff McGee)
Date: Mon, 2 Mar 2015 17:26:10 -0800
Subject: [Beignet] [PATCH] drm/i915: Export total subslice and EU counts
In-Reply-To: <1425339452-18875-1-git-send-email-jeff.mcgee@intel.com>
References: <1425339452-18875-1-git-send-email-jeff.mcgee@intel.com>
Message-ID: <20150303012610.GA3263@jeffdesk>

On Mon, Mar 02, 2015 at 03:37:32PM -0800, jeff.mcgee at intel.com wrote:
> From: Jeff McGee <jeff.mcgee at intel.com>
> 
> Setup new I915_GETPARAM ioctl entries for subslice total and
> EU total. Userspace drivers need these values when constructing
> GPGPU commands. This kernel query method is intended to replace
> the PCI ID-based tables that userspace drivers currently maintain.
> The kernel driver can employ fuse register reads as needed to
> ensure the most accurate determination of GT config attributes.
> This first became important with Cherryview in which the config
> could differ between devices with the same PCI ID.
> 
> The kernel detection of these values is device-specific and not
> included in this patch. Because zero is not a valid value for any of
> these parameters, a value of zero is interpreted as unknown for the
> device. Userspace drivers should continue to maintain ID-based tables
> for older devices not supported by the new query method.
> 

We already have total EU detection support for Cherryview but we
need to add detection of total subslice. That support is included
in the below-linked series which has been reviewed but not yet
merged.

http://lists.freedesktop.org/archives/intel-gfx/2015-February/060945.html

Jeff

From rong.r.yang at intel.com  Mon Mar  2 20:53:43 2015
From: rong.r.yang at intel.com (Yang Rong)
Date: Tue,  3 Mar 2015 12:53:43 +0800
Subject: [Beignet] [PATCH] Fix llvm3.6 build error.
Message-ID: <1425358423-19565-1-git-send-email-rong.r.yang@intel.com>

LLVM 3.6 reverted the C API LLVMLinkModules to its LLVM 3.5 form at the last minute. Stay consistent with it.
---
 backend/src/backend/gen_program.cpp    | 4 ----
 backend/src/llvm/llvm_bitcode_link.cpp | 4 ----
 2 files changed, 8 deletions(-)

diff --git a/backend/src/backend/gen_program.cpp b/backend/src/backend/gen_program.cpp
index 2525825..755c60e 100644
--- a/backend/src/backend/gen_program.cpp
+++ b/backend/src/backend/gen_program.cpp
@@ -388,11 +388,7 @@ namespace gbe {
       llvm::Module* src = (llvm::Module*)((GenProgram*)src_program)->module;
       llvm::Module* dst = (llvm::Module*)((GenProgram*)dst_program)->module;
 
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR <= 5
       if (LLVMLinkModules(wrap(dst), wrap(src), LLVMLinkerPreserveSource, &errMsg)) {
-#else
-      if (LLVMLinkModules(wrap(dst), wrap(src), 0, &errMsg)) {
-#endif
         if (err != NULL && errSize != NULL && stringSize > 0u) {
           strncpy(err, errMsg, stringSize-1);
           err[stringSize-1] = '\0';
diff --git a/backend/src/llvm/llvm_bitcode_link.cpp b/backend/src/llvm/llvm_bitcode_link.cpp
index 96f7781..17248c0 100644
--- a/backend/src/llvm/llvm_bitcode_link.cpp
+++ b/backend/src/llvm/llvm_bitcode_link.cpp
@@ -240,11 +240,7 @@ namespace gbe
     /* We use beignet's bitcode as dst because it will have a lot of
        lazy functions which will not be loaded. */
     char* errorMsg;
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR <= 5
     if(LLVMLinkModules(wrap(clonedLib), wrap(mod), LLVMLinkerDestroySource, &errorMsg)) {
-#else
-    if(LLVMLinkModules(wrap(clonedLib), wrap(mod), 0, &errorMsg)) {
-#endif
       delete clonedLib;
       printf("Fatal Error: link the bitcode error:\n%s\n", errorMsg);
       return NULL;
-- 
2.1.0


From rong.r.yang at intel.com  Mon Mar  2 21:00:38 2015
From: rong.r.yang at intel.com (Yang, Rong R)
Date: Tue, 3 Mar 2015 05:00:38 +0000
Subject: [Beignet] [PATCH 2/3] Add llvm3.6 build support.
In-Reply-To: <1425222563.26181.4.camel@snewbury.org.uk>
References: <1423729781-17667-1-git-send-email-rong.r.yang@intel.com>
 <1423729781-17667-2-git-send-email-rong.r.yang@intel.com>
 <1425222563.26181.4.camel@snewbury.org.uk>
Message-ID: <7597C9376C272A4AB2D29E91550B7B0901404213@shsmsx102.ccr.corp.intel.com>

Yes, LLVM 3.6 changed the LLVMLinkModules API at the last minute; I have sent a patch, "Fix llvm3.6 build error.", for it.
Thanks for reporting.

> -----Original Message-----
> From: Steven Newbury [mailto:steve at snewbury.org.uk]
> Sent: Sunday, March 1, 2015 23:09
> To: Yang, Rong R
> Cc: beignet at lists.freedesktop.org
> Subject: Re: [Beignet] [PATCH 2/3] Add llvm3.6 build support.
> 
> I now have another failure to build with llvm3.6 (release) not sure if it's an
> LLVM or flag change.  It would build with "-fpermissive".  See in-line.
> 
> On Thu, 2015-02-12 at 16:29 +0800, Yang Rong wrote:
> > There are some changes from llvm3.5:
> > 1. Some functions return std::unique_ptr instead of pointer.
> > 2. MetaNode to Value and Value to MetaNode.
> >
> > V2: Fix llvm3.5 build error.
> > V3: Print link and function materialize message.
> > Signed-off-by: Yang Rong <rong.r.yang at intel.com>
> > ---
> >  backend/src/backend/gen_program.cpp    | 10 ++++++++++
> >  backend/src/backend/program.cpp        |  4 ++++
> >  backend/src/llvm/llvm_bitcode_link.cpp | 25
> > ++++++++++++++++++++++++-
> >  backend/src/llvm/llvm_gen_backend.cpp  | 22
> +++++++++++++++++++++-
> >  backend/src/llvm/llvm_passes.cpp       |  4 ++++
> >  backend/src/llvm/llvm_to_gen.cpp       | 16 +++++++++++++---
> >  backend/src/llvm/llvm_unroll.cpp       | 14 ++++++++++++++
> >  7 files changed, 90 insertions(+), 5 deletions(-)
> >
> > diff --git a/backend/src/backend/gen_program.cpp
> > b/backend/src/backend/gen_program.cpp
> > index a4019fe..65a7ba2 100644
> > --- a/backend/src/backend/gen_program.cpp
> > +++ b/backend/src/backend/gen_program.cpp
> > @@ -252,9 +252,15 @@ namespace gbe {
> >      llvm::StringRef llvm_bin_str(binary_content);
> >      llvm::LLVMContext& c = llvm::getGlobalContext();
> >      llvm::SMDiagnostic Err;
> > +#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 6
> > +    std::unique_ptr<llvm::MemoryBuffer> memory_buffer =
> > llvm::MemoryBuffer::getMemBuffer(llvm_bin_str, "llvm_bin_str");
> > +    acquireLLVMContextLock();
> > +    llvm::Module* module = llvm::parseIR(memory_buffer-
> > >getMemBufferRef(), Err, c).release();
> > +#else
> >      llvm::MemoryBuffer* memory_buffer =
> > llvm::MemoryBuffer::getMemBuffer(llvm_bin_str, "llvm_bin_str");
> >      acquireLLVMContextLock();
> >      llvm::Module* module = llvm::ParseIR(memory_buffer, Err, c);
> > +#endif
> >      releaseLLVMContextLock();
> >      if(module == NULL){
> >        GBE_ASSERT(0);
> > @@ -382,7 +388,11 @@ namespace gbe {
> >        llvm::Module* src = (llvm::Module*)((GenProgram*)src_program)-
> > >module;
> >        llvm::Module* dst = (llvm::Module*)((GenProgram*)dst_program)-
> > >module;
> >
> > +#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR <= 5
> >        if (LLVMLinkModules(wrap(dst), wrap(src),
> > LLVMLinkerPreserveSource, &errMsg)) {
> > +#else
> > +      if (LLVMLinkModules(wrap(dst), wrap(src), 0, &errMsg)) {
> 
> I get a failure to convert from int to LLVMLinkerMode from the 3rd argument
> here.  The constant '0' needs to be cast to LLVMLinkerMode to keep
> LLVMLinkModules() happy since it doesn't accept an int argument.
> 
> > +#endif
> >          if (err != NULL && errSize != NULL && stringSize > 0u) {
> >            if(strlen(errMsg) < stringSize )
> >              stringSize = strlen(errMsg); diff --git
> > a/backend/src/backend/program.cpp
> b/backend/src/backend/program.cpp
> > index 38ce9c8..06810bd 100644
> > --- a/backend/src/backend/program.cpp
> > +++ b/backend/src/backend/program.cpp
> > @@ -621,7 +621,11 @@ namespace gbe {
> >      if (!retVal)
> >        return false;
> >
> > +#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR <= 5
> >      llvm::Module *module = Act->takeModule();
> > +#else
> > +    llvm::Module *module = Act->takeModule().release(); #endif
> >
> >      *out_module = module;
> >      return true;
> > diff --git a/backend/src/llvm/llvm_bitcode_link.cpp
> > b/backend/src/llvm/llvm_bitcode_link.cpp
> > index 8eb6dd5..229e3bb 100644
> > --- a/backend/src/llvm/llvm_bitcode_link.cpp
> > +++ b/backend/src/llvm/llvm_bitcode_link.cpp
> > @@ -63,7 +63,11 @@ namespace gbe
> >      }
> >      assert(findBC);
> >
> > +#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR <= 5
> >      oclLib = getLazyIRFileModule(FilePath, Err, ctx);
> > +#else
> > +    oclLib = getLazyIRFileModule(FilePath, Err, ctx).release();
> > +#endif
> >      if (!oclLib) {
> >        printf("Fatal Error: ocl lib can not be opened\n");
> >        return NULL;
> > @@ -114,12 +118,18 @@ namespace gbe
> >
> >          std::string ErrInfo;// = "Not Materializable";
> >          if (!fromSrc && newMF->isMaterializable()) {
> > +#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR <= 5
> >            if (newMF->Materialize(&ErrInfo)) {
> >              printf("Can not materialize the function: %s, because
> > %s\n", fnName.c_str(), ErrInfo.c_str());
> >              return false;
> >            }
> > +#else
> > +          if (std::error_code EC = newMF->materialize()) {
> > +            printf("Can not materialize the function: %s, because
> > %s\n", fnName.c_str(), EC.message().c_str());
> > +            return false;
> > +          }
> > +#endif
> >          }
> > -
> >          if (!materializedFuncCall(src, lib, *newMF, MFS))
> >            return false;
> >
> > @@ -205,12 +215,21 @@ namespace gbe
> >        }
> >        std::string ErrInfo;// = "Not Materializable";
> >        if (newMF->isMaterializable()) {
> > +#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR <= 5
> >          if (newMF->Materialize(&ErrInfo)) {
> >            printf("Can not materialize the function: %s, because
> > %s\n", fnName.c_str(), ErrInfo.c_str());
> >            delete clonedLib;
> >            return NULL;
> >          }
> >        }
> > +#else
> > +        if (std::error_code EC = newMF->materialize()) {
> > +          printf("Can not materialize the function: %s, because
> > %s\n", fnName.c_str(), EC.message().c_str();
> > +          delete clonedLib;
> > +          return NULL;
> > +        }
> > +      }
> > +#endif
> >
> >        if (!materializedFuncCall(*mod, *clonedLib, *newMF,
> > materializedFuncs)) {
> >          delete clonedLib;
> > @@ -223,7 +242,11 @@ namespace gbe
> >      /* We use beignet's bitcode as dst because it will have a lot of
> >         lazy functions which will not be loaded. */
> >      char* errorMsg;
> > +#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR <= 5
> >      if(LLVMLinkModules(wrap(clonedLib), wrap(mod),
> > LLVMLinkerDestroySource, &errorMsg)) {
> > +#else
> > +    if(LLVMLinkModules(wrap(clonedLib), wrap(mod), 0, &errorMsg)) {
> 
> Same here.
> 
> > +#endif
> >        delete clonedLib;
> >        printf("Fatal Error: link the bitcode error:\n%s\n", errorMsg);
> >        return NULL;
> > diff --git a/backend/src/llvm/llvm_gen_backend.cpp
> > b/backend/src/llvm/llvm_gen_backend.cpp
> > index d47721a..c67a880 100644
> > --- a/backend/src/llvm/llvm_gen_backend.cpp
> > +++ b/backend/src/llvm/llvm_gen_backend.cpp
> > @@ -1467,7 +1467,12 @@ error:
> >      /* First find the meta data belong to this function. */
> >      for(uint i = 0; i < clKernelMetaDatas->getNumOperands(); i++) {
> >        node = clKernelMetaDatas->getOperand(i);
> > +#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR <= 5
> >        if (node->getOperand(0) == &F) break;
> > +#else
> > +      auto *V = cast<ValueAsMetadata>(node->getOperand(0));
> > +      if (V && V->getValue() == &F) break; #endif
> >        node = NULL;
> >      }
> >
> > @@ -1484,9 +1489,15 @@ error:
> >
> >        if (attrName->getString() == "reqd_work_group_size") {
> >          GBE_ASSERT(attrNode->getNumOperands() == 4);
> > +#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR <= 5
> >          ConstantInt *x = dyn_cast<ConstantInt>(attrNode-
> > >getOperand(1));
> >          ConstantInt *y = dyn_cast<ConstantInt>(attrNode-
> > >getOperand(2));
> >          ConstantInt *z = dyn_cast<ConstantInt>(attrNode-
> > >getOperand(3));
> > +#else
> > +        ConstantInt *x = mdconst::extract<ConstantInt>(attrNode-
> > >getOperand(1));
> > +        ConstantInt *y = mdconst::extract<ConstantInt>(attrNode-
> > >getOperand(2));
> > +        ConstantInt *z = mdconst::extract<ConstantInt>(attrNode-
> > >getOperand(3));
> > +#endif
> >          GBE_ASSERT(x && y && z);
> >          reqd_wg_sz[0] = x->getZExtValue();
> >          reqd_wg_sz[1] = y->getZExtValue(); @@ -1521,9 +1532,15 @@
> > error:
> >          functionAttributes += " ";
> >        } else if (attrName->getString() == "work_group_size_hint") {
> >          GBE_ASSERT(attrNode->getNumOperands() == 4);
> > +#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR <= 5
> >          ConstantInt *x = dyn_cast<ConstantInt>(attrNode-
> > >getOperand(1));
> >          ConstantInt *y = dyn_cast<ConstantInt>(attrNode-
> > >getOperand(2));
> >          ConstantInt *z = dyn_cast<ConstantInt>(attrNode-
> > >getOperand(3));
> > +#else
> > +        ConstantInt *x = mdconst::extract<ConstantInt>(attrNode-
> > >getOperand(1));
> > +        ConstantInt *y = mdconst::extract<ConstantInt>(attrNode-
> > >getOperand(2));
> > +        ConstantInt *z = mdconst::extract<ConstantInt>(attrNode-
> > >getOperand(3));
> > +#endif
> >          GBE_ASSERT(x && y && z);
> >          hint_wg_sz[0] = x->getZExtValue();
> >          hint_wg_sz[1] = y->getZExtValue(); @@ -1561,8 +1578,11 @@
> > error:
> >        for (; I != E; ++I, ++argID) {
> >          const std::string &argName = I->getName().str();
> >          Type *type = I->getType();
> > -
> > +#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR <= 5
> >          llvmInfo.addrSpace = (cast<ConstantInt>(addrSpaceNode-
> > >getOperand(1 + argID)))->getZExtValue();
> > +#else
> > +        llvmInfo.addrSpace =
> > (mdconst::extract<ConstantInt>(addrSpaceNode->getOperand(1 +
> > argID)))->getZExtValue();
> > +#endif
> >          llvmInfo.typeName = (cast<MDString>(typeNameNode-
> > >getOperand(1 + argID)))->getString();
> >          llvmInfo.accessQual = (cast<MDString>(accessQualNode-
> > >getOperand(1 + argID)))->getString();
> >          llvmInfo.typeQual = (cast<MDString>(typeQualNode-
> > >getOperand(1 + argID)))->getString();
> > diff --git a/backend/src/llvm/llvm_passes.cpp
> > b/backend/src/llvm/llvm_passes.cpp
> > index 5c0a2e0..1b0e4f8 100644
> > --- a/backend/src/llvm/llvm_passes.cpp
> > +++ b/backend/src/llvm/llvm_passes.cpp
> > @@ -119,7 +119,11 @@ namespace gbe
> >        uint32_t ops = md.getNumOperands();
> >        for(uint32_t x = 0; x < ops; x++) {
> >          MDNode* node = md.getOperand(x);
> > +#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR <= 5
> >          Value * op = node->getOperand(0);
> > +#else
> > +        Value * op = cast<ValueAsMetadata>(node->getOperand(0))-
> > >getValue();
> > +#endif
> >          if(op == &F) bKernel = true;
> >        }
> >      }
> > diff --git a/backend/src/llvm/llvm_to_gen.cpp
> > b/backend/src/llvm/llvm_to_gen.cpp
> > index c2c015a..b1dc686 100644
> > --- a/backend/src/llvm/llvm_to_gen.cpp
> > +++ b/backend/src/llvm/llvm_to_gen.cpp
> > @@ -81,7 +81,9 @@ namespace gbe
> >    {
> >      FunctionPassManager FPM(&mod);
> >
> > -#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 5
> > +#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 6
> > +    FPM.add(new DataLayoutPass());
> > +#elif LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR == 5
> >      FPM.add(new DataLayoutPass(DL));
> >  #else
> >      FPM.add(new DataLayout(DL));
> > @@ -112,7 +114,9 @@ namespace gbe
> >    {
> >      llvm::PassManager MPM;
> >
> > -#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 5
> > +#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 6
> > +    MPM.add(new DataLayoutPass());
> > +#elif LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR == 5
> >      MPM.add(new DataLayoutPass(DL));
> >  #else
> >      MPM.add(new DataLayout(DL));
> > @@ -231,7 +235,11 @@ namespace gbe
> >        cl_mod = reinterpret_cast<Module*>(const_cast<void*>(module));
> >      } else if (fileName){
> >        llvm::LLVMContext& c = llvm::getGlobalContext();
> > +#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 6
> > +      cl_mod = parseIRFile(fileName, Err, c).release(); #else
> >        cl_mod = ParseIRFile(fileName, Err, c);
> > +#endif
> >      }
> >
> >      if (!cl_mod) return false;
> > @@ -259,7 +267,9 @@ namespace gbe
> >      runFuntionPass(mod, libraryInfo, DL);
> >      runModulePass(mod, libraryInfo, DL, optLevel, strictMath);
> >      llvm::PassManager passes;
> > -#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 5
> > +#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 6
> > +    passes.add(new DataLayoutPass()); #elif LLVM_VERSION_MAJOR == 3
> > +&& LLVM_VERSION_MINOR == 5
> >      passes.add(new DataLayoutPass(DL));  #else
> >      passes.add(new DataLayout(DL));
> > diff --git a/backend/src/llvm/llvm_unroll.cpp
> > b/backend/src/llvm/llvm_unroll.cpp
> > index 172e724..5d3fad8 100644
> > --- a/backend/src/llvm/llvm_unroll.cpp
> > +++ b/backend/src/llvm/llvm_unroll.cpp
> > @@ -95,7 +95,11 @@ namespace gbe {
> >            if (Name.equals(S->getString())) {
> >              assert(MD->getNumOperands() == 2 &&
> >                     "Unroll hint metadata should have two operands.");
> > +#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 6
> > +            return mdconst::extract<ConstantInt>(MD->getOperand(1));
> > +#else
> >              return cast<ConstantInt>(MD->getOperand(1));
> > +#endif
> >            }
> >          }
> >          return nullptr;
> > @@ -105,6 +109,15 @@ namespace gbe {
> >          if (!enable && disabledLoops.find(L) != disabledLoops.end())
> >             return;
> >          LLVMContext &Context = L->getHeader()->getContext();
> > +#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 6
> > +        SmallVector<Metadata *, 2> forceUnroll;
> > +        forceUnroll.push_back(MDString::get(Context,
> > "llvm.loop.unroll.enable"));
> > +
> > forceUnroll.push_back(ConstantAsMetadata::get(ConstantInt::get(Type::g
> > etInt1Ty(Context),
> > enable)));
> > +        MDNode *forceUnrollNode = MDNode::get(Context, forceUnroll);
> > +        SmallVector<Metadata *, 4> Vals;
> > +        Vals.push_back(NULL);
> > +        Vals.push_back(forceUnrollNode); #else
> >          SmallVector<Value *, 2> forceUnroll;
> >          forceUnroll.push_back(MDString::get(Context,
> > "llvm.loop.unroll.enable"));
> >
> > forceUnroll.push_back(ConstantInt::get(Type::getInt1Ty(Context),
> > enable));
> > @@ -112,6 +125,7 @@ namespace gbe {
> >          SmallVector<Value *, 4> Vals;
> >          Vals.push_back(NULL);
> >          Vals.push_back(forceUnrollNode);
> > +#endif
> >          MDNode *NewLoopID = MDNode::get(Context, Vals);
> >          // Set operand 0 to refer to the loop id itself.
> >          NewLoopID->replaceOperandWith(0, NewLoopID);

From daniel at ffwll.ch  Tue Mar  3 00:54:39 2015
From: daniel at ffwll.ch (Daniel Vetter)
Date: Tue, 3 Mar 2015 09:54:39 +0100
Subject: [Beignet] [PATCH] drm/i915: Export total subslice and EU counts
In-Reply-To: <1425339452-18875-1-git-send-email-jeff.mcgee@intel.com>
References: <1425339452-18875-1-git-send-email-jeff.mcgee@intel.com>
Message-ID: <20150303085439.GJ18775@phenom.ffwll.local>

On Mon, Mar 02, 2015 at 03:37:32PM -0800, jeff.mcgee at intel.com wrote:
> From: Jeff McGee <jeff.mcgee at intel.com>
> 
> Setup new I915_GETPARAM ioctl entries for subslice total and
> EU total. Userspace drivers need these values when constructing
> GPGPU commands. This kernel query method is intended to replace
> the PCI ID-based tables that userspace drivers currently maintain.
> The kernel driver can employ fuse register reads as needed to
> ensure the most accurate determination of GT config attributes.
> This first became important with Cherryview in which the config
> could differ between devices with the same PCI ID.
> 
> The kernel detection of these values is device-specific and not
> included in this patch. Because zero is not a valid value for any of
> these parameters, a value of zero is interpreted as unknown for the
> device. Userspace drivers should continue to maintain ID-based tables
> for older devices not supported by the new query method.
> 
> For: VIZ-4636
> Signed-off-by: Jeff McGee <jeff.mcgee at intel.com>
> ---
>  drivers/gpu/drm/i915/i915_dma.c | 10 ++++++++++
>  include/uapi/drm/i915_drm.h     |  2 ++
>  2 files changed, 12 insertions(+)
> 
> diff --git a/drivers/gpu/drm/i915/i915_dma.c b/drivers/gpu/drm/i915/i915_dma.c
> index 053e178..9350ea2 100644
> --- a/drivers/gpu/drm/i915/i915_dma.c
> +++ b/drivers/gpu/drm/i915/i915_dma.c
> @@ -150,6 +150,16 @@ static int i915_getparam(struct drm_device *dev, void *data,
>  	case I915_PARAM_MMAP_VERSION:
>  		value = 1;
>  		break;
> +	case I915_PARAM_SUBSLICE_TOTAL:
> +		value = INTEL_INFO(dev)->subslice_total;
> +		if (!value)
> +			return -ENODEV;
> +		break;
> +	case I915_PARAM_EU_TOTAL:
> +		value = INTEL_INFO(dev)->eu_total;
> +		if (!value)
> +			return -ENODEV;

I need the corresponding userspace support before I can merge this.

Thanks, Daniel

> +		break;
>  	default:
>  		DRM_DEBUG("Unknown parameter %d\n", param->param);
>  		return -EINVAL;
> diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
> index 6eed16b..8672efc 100644
> --- a/include/uapi/drm/i915_drm.h
> +++ b/include/uapi/drm/i915_drm.h
> @@ -347,6 +347,8 @@ typedef struct drm_i915_irq_wait {
>  #define I915_PARAM_HAS_COHERENT_PHYS_GTT 29
>  #define I915_PARAM_MMAP_VERSION          30
>  #define I915_PARAM_HAS_BSD2		 31
> +#define I915_PARAM_SUBSLICE_TOTAL	 32
> +#define I915_PARAM_EU_TOTAL		 33
>  
>  typedef struct drm_i915_getparam {
>  	int param;
> -- 
> 2.3.0
> 
> _______________________________________________
> dri-devel mailing list
> dri-devel at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/dri-devel

-- 
Daniel Vetter
Software Engineer, Intel Corporation
+41 (0) 79 365 57 48 - http://blog.ffwll.ch

From daniel at ffwll.ch  Tue Mar  3 00:56:49 2015
From: daniel at ffwll.ch (Daniel Vetter)
Date: Tue, 3 Mar 2015 09:56:49 +0100
Subject: [Beignet] [PATCH] drm/i915: Export total subslice and EU counts
In-Reply-To: <20150303085439.GJ18775@phenom.ffwll.local>
References: <1425339452-18875-1-git-send-email-jeff.mcgee@intel.com>
 <20150303085439.GJ18775@phenom.ffwll.local>
Message-ID: <20150303085649.GK18775@phenom.ffwll.local>

On Tue, Mar 03, 2015 at 09:54:39AM +0100, Daniel Vetter wrote:
> On Mon, Mar 02, 2015 at 03:37:32PM -0800, jeff.mcgee at intel.com wrote:
> > From: Jeff McGee <jeff.mcgee at intel.com>
> > 
> > Setup new I915_GETPARAM ioctl entries for subslice total and
> > EU total. Userspace drivers need these values when constructing
> > GPGPU commands. This kernel query method is intended to replace
> > the PCI ID-based tables that userspace drivers currently maintain.
> > The kernel driver can employ fuse register reads as needed to
> > ensure the most accurate determination of GT config attributes.
> > This first became important with Cherryview in which the config
> > could differ between devices with the same PCI ID.
> > 
> > The kernel detection of these values is device-specific and not
> > included in this patch. Because zero is not a valid value for any of
> > these parameters, a value of zero is interpreted as unknown for the
> > device. Userspace drivers should continue to maintain ID-based tables
> > for older devices not supported by the new query method.
> > 
> > For: VIZ-4636
> > Signed-off-by: Jeff McGee <jeff.mcgee at intel.com>
> > ---
> >  drivers/gpu/drm/i915/i915_dma.c | 10 ++++++++++
> >  include/uapi/drm/i915_drm.h     |  2 ++
> >  2 files changed, 12 insertions(+)
> > 
> > diff --git a/drivers/gpu/drm/i915/i915_dma.c b/drivers/gpu/drm/i915/i915_dma.c
> > index 053e178..9350ea2 100644
> > --- a/drivers/gpu/drm/i915/i915_dma.c
> > +++ b/drivers/gpu/drm/i915/i915_dma.c
> > @@ -150,6 +150,16 @@ static int i915_getparam(struct drm_device *dev, void *data,
> >  	case I915_PARAM_MMAP_VERSION:
> >  		value = 1;
> >  		break;
> > +	case I915_PARAM_SUBSLICE_TOTAL:
> > +		value = INTEL_INFO(dev)->subslice_total;
> > +		if (!value)
> > +			return -ENODEV;
> > +		break;
> > +	case I915_PARAM_EU_TOTAL:
> > +		value = INTEL_INFO(dev)->eu_total;
> > +		if (!value)
> > +			return -ENODEV;
> 
> > I need the corresponding userspace support before I can merge this.

Strike that, I've missed the beignet support. As soon as beignet
maintainers have that reviewed I can pull this in.
-Daniel
-- 
Daniel Vetter
Software Engineer, Intel Corporation
+41 (0) 79 365 57 48 - http://blog.ffwll.ch

From ruiling.song at intel.com  Wed Mar  4 00:24:17 2015
From: ruiling.song at intel.com (Song, Ruiling)
Date: Wed, 4 Mar 2015 08:24:17 +0000
Subject: [Beignet] [PATCH] Fix llvm3.6 build error.
In-Reply-To: <1425358423-19565-1-git-send-email-rong.r.yang@intel.com>
References: <1425358423-19565-1-git-send-email-rong.r.yang@intel.com>
Message-ID: <148B1B7A67D1C24B9EF0BE42EA4977062B7DC966@SHSMSX101.ccr.corp.intel.com>

LGTM. I also tried it, and it works well with LLVM 3.6.

> -----Original Message-----
> From: Beignet [mailto:beignet-bounces at lists.freedesktop.org] On Behalf Of
> Yang Rong
> Sent: Tuesday, March 03, 2015 12:54 PM
> To: beignet at lists.freedesktop.org
> Cc: Yang, Rong R
> Subject: [Beignet] [PATCH] Fix llvm3.6 build error.
> 
> LLVM 3.6 reverted the C API LLVMLinkModules to its LLVM 3.5 form at the
> last minute. Stay consistent with it.
> ---
>  backend/src/backend/gen_program.cpp    | 4 ----
>  backend/src/llvm/llvm_bitcode_link.cpp | 4 ----
>  2 files changed, 8 deletions(-)
> 
> diff --git a/backend/src/backend/gen_program.cpp
> b/backend/src/backend/gen_program.cpp
> index 2525825..755c60e 100644
> --- a/backend/src/backend/gen_program.cpp
> +++ b/backend/src/backend/gen_program.cpp
> @@ -388,11 +388,7 @@ namespace gbe {
>        llvm::Module* src =
> (llvm::Module*)((GenProgram*)src_program)->module;
>        llvm::Module* dst =
> (llvm::Module*)((GenProgram*)dst_program)->module;
> 
> -#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR <= 5
>        if (LLVMLinkModules(wrap(dst), wrap(src),
> LLVMLinkerPreserveSource, &errMsg)) { -#else
> -      if (LLVMLinkModules(wrap(dst), wrap(src), 0, &errMsg)) {
> -#endif
>          if (err != NULL && errSize != NULL && stringSize > 0u) {
>            strncpy(err, errMsg, stringSize-1);
>            err[stringSize-1] = '\0';
> diff --git a/backend/src/llvm/llvm_bitcode_link.cpp
> b/backend/src/llvm/llvm_bitcode_link.cpp
> index 96f7781..17248c0 100644
> --- a/backend/src/llvm/llvm_bitcode_link.cpp
> +++ b/backend/src/llvm/llvm_bitcode_link.cpp
> @@ -240,11 +240,7 @@ namespace gbe
>      /* We use beignet's bitcode as dst because it will have a lot of
>         lazy functions which will not be loaded. */
>      char* errorMsg;
> -#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR <= 5
>      if(LLVMLinkModules(wrap(clonedLib), wrap(mod),
> LLVMLinkerDestroySource, &errorMsg)) { -#else
> -    if(LLVMLinkModules(wrap(clonedLib), wrap(mod), 0, &errorMsg)) {
> -#endif
>        delete clonedLib;
>        printf("Fatal Error: link the bitcode error:\n%s\n", errorMsg);
>        return NULL;
> --
> 2.1.0
> 
> _______________________________________________
> Beignet mailing list
> Beignet at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/beignet

From junyan.he at inbox.com  Wed Mar  4 02:43:51 2015
From: junyan.he at inbox.com (junyan.he at inbox.com)
Date: Wed,  4 Mar 2015 18:43:51 +0800
Subject: [Beignet] [PATCH 1/6] Add the indirect fields and functions for gen
	register.
Message-ID: <1425465831-20732-1-git-send-email-junyan.he@inbox.com>

From: Junyan He <junyan.he at linux.intel.com>

Add a0_subnr and addr_imm to GenRegister in order to
represent an indirect register, which may be at some
immediate offset from the a0.x subregister's base address.
Also add the to_indirect1xN helper function to convert a
register to an indirect 1xN register.

Signed-off-by: Junyan He <junyan.he at linux.intel.com>
---
 backend/src/backend/gen_register.hpp |   30 +++++++++++++++++++++++++++++-
 1 file changed, 29 insertions(+), 1 deletion(-)

diff --git a/backend/src/backend/gen_register.hpp b/backend/src/backend/gen_register.hpp
index 08c7277..3b40b67 100644
--- a/backend/src/backend/gen_register.hpp
+++ b/backend/src/backend/gen_register.hpp
@@ -205,6 +205,8 @@ namespace gbe
       this->quarter = 0;
       this->nr = this->subnr = 0;
       this->address_mode = GEN_ADDRESS_DIRECT;
+      this->a0_subnr = 0;
+      this->addr_imm = 0;
     }
 
     /*! For specific physical registers only */
@@ -229,6 +231,8 @@ namespace gbe
       this->hstride = hstride;
       this->quarter = 0;
       this->address_mode = GEN_ADDRESS_DIRECT;
+      this->a0_subnr = 0;
+      this->addr_imm = 0;
     }
 
     /*! Return the IR virtual register */
@@ -258,6 +262,8 @@ namespace gbe
     uint32_t hstride:2;      //!< Horizontal stride
     uint32_t quarter:1;      //!< To choose which part we want (Q1 / Q2)
     uint32_t address_mode:1; //!< direct or indirect
+    uint32_t a0_subnr:4;     //!< In indirect mode, use a0.nr as the base.
+    int32_t addr_imm:10;     //!< In indirect mode, the imm as address offset from a0.
 
     static INLINE GenRegister offset(GenRegister reg, int nr, int subnr = 0) {
       GenRegister r = reg;
@@ -835,6 +841,28 @@ namespace gbe
       return reg;
     }
 
+    /*! convert one register to indirectly mode */
+    static INLINE GenRegister to_indirect1xN(GenRegister reg, uint32_t base_addr,
+                                          int32_t imm_off = 4096, int a0_subnr = 0) {
+      GenRegister r = reg;
+      int32_t offset;
+      if (imm_off > 4095) {
+        offset = (r.nr*32 + r.subnr) - base_addr;
+      } else {
+        offset = imm_off;
+      }
+
+      GBE_ASSERT(offset <= 511 && offset>=-512);
+      r.a0_subnr = a0_subnr;
+      r.addr_imm = offset;
+      r.address_mode = GEN_ADDRESS_REGISTER_INDIRECT_REGISTER;
+
+      r.width = GEN_WIDTH_1;
+      r.vstride = GEN_VERTICAL_STRIDE_ONE_DIMENSIONAL;
+      r.hstride = GEN_HORIZONTAL_STRIDE_0;
+      return r;
+    }
+
     static INLINE GenRegister vec16(uint32_t file, uint32_t nr, uint32_t subnr) {
       return GenRegister(file,
                          nr,
@@ -953,7 +981,7 @@ namespace gbe
     }
 
     static INLINE GenRegister uw1(uint32_t file, uint32_t nr, uint32_t subnr) {
-      return suboffset(retype(vec1(file, nr, 0), GEN_TYPE_UW), subnr);
+      return offset(retype(vec1(file, nr, 0), GEN_TYPE_UW), 0, typeSize(GEN_TYPE_UW)*subnr);
     }
 
     static INLINE GenRegister ub16(uint32_t file, uint32_t nr, uint32_t subnr) {
-- 
1.7.9.5


From junyan.he at inbox.com  Wed Mar  4 02:43:58 2015
From: junyan.he at inbox.com (junyan.he at inbox.com)
Date: Wed,  4 Mar 2015 18:43:58 +0800
Subject: [Beignet] [PATCH 2/6] Delete bswap logic in the llvm_to_gen stage.
Message-ID: <1425465838-20766-1-git-send-email-junyan.he@inbox.com>

From: Junyan He <junyan.he at linux.intel.com>

Delete the bswap expansion from the llvm_to_gen stage and add
a BSWAP instruction to handle it. The bswap will be handled
in the backend as a special instruction.

Signed-off-by: Junyan He <junyan.he at linux.intel.com>
---
 backend/src/ir/instruction.hpp        |    2 +
 backend/src/ir/instruction.hxx        |    1 +
 backend/src/llvm/llvm_gen_backend.cpp |   85 +--------------------------------
 3 files changed, 5 insertions(+), 83 deletions(-)

diff --git a/backend/src/ir/instruction.hpp b/backend/src/ir/instruction.hpp
index 6963111..24d27aa 100644
--- a/backend/src/ir/instruction.hpp
+++ b/backend/src/ir/instruction.hpp
@@ -620,6 +620,8 @@ namespace ir {
   Instruction RNDU(Type type, Register dst, Register src);
   /*! rndz.type dst src */
   Instruction RNDZ(Type type, Register dst, Register src);
+  /*! bswap.type dst src */
+  Instruction BSWAP(Type type, Register dst, Register src);
   /*! pow.type dst src0 src1 */
   Instruction POW(Type type, Register dst, Register src0, Register src1);
   /*! mul.type dst src0 src1 */
diff --git a/backend/src/ir/instruction.hxx b/backend/src/ir/instruction.hxx
index b52673e..de4abfb 100644
--- a/backend/src/ir/instruction.hxx
+++ b/backend/src/ir/instruction.hxx
@@ -40,6 +40,7 @@ DECL_INSN(RNDU, UnaryInstruction)
 DECL_INSN(RNDZ, UnaryInstruction)
 DECL_INSN(SIMD_ANY, UnaryInstruction)
 DECL_INSN(SIMD_ALL, UnaryInstruction)
+DECL_INSN(BSWAP, UnaryInstruction)
 DECL_INSN(POW, BinaryInstruction)
 DECL_INSN(MUL, BinaryInstruction)
 DECL_INSN(ADD, BinaryInstruction)
diff --git a/backend/src/llvm/llvm_gen_backend.cpp b/backend/src/llvm/llvm_gen_backend.cpp
index aad638f..74c80ee 100644
--- a/backend/src/llvm/llvm_gen_backend.cpp
+++ b/backend/src/llvm/llvm_gen_backend.cpp
@@ -2943,89 +2943,6 @@ namespace gbe
           case Intrinsic::umul_with_overflow:
           NOT_IMPLEMENTED;
           break;
-          case Intrinsic::bswap:
-          {
-            // FIXME, this is an unoptimized version, could be optimized by
-            // leveraging GEN's register region/indirect address feature.
-            Type *llvmDstType = I.getType();
-            uint32_t elementSize = getTypeByteSize(unit, llvmDstType);
-
-            const ir::Register dst0  = this->getRegister(&I);
-            const ir::Register src0 = this->getRegister(I.getOperand(0));
-            switch(elementSize)
-            {
-              case 2:
-                {
-                  ir::Type srcType = getUnsignedType(ctx, llvmDstType);
-                  ir::Register tmp1 = ctx.reg(getFamily(srcType));
-                  ir::Register tmp2 = ctx.reg(getFamily(srcType));
-
-                  ir::Register regWMask = ctx.reg( ir::FAMILY_WORD );
-                  const ir::ImmediateIndex wMask = ctx.newIntegerImmediate(0x00FF, ir::TYPE_S16);
-                  ir::Register regShift = ctx.reg( ir::FAMILY_WORD );
-                  const ir::ImmediateIndex shift = ctx.newIntegerImmediate(8, ir::TYPE_S16);
-
-                  ctx.LOADI(ir::TYPE_S16, regWMask, wMask);
-                  ctx.AND(srcType, tmp1, src0, regWMask);
-
-                  ctx.LOADI(ir::TYPE_S16, regShift, shift);
-                  ctx.SHL(srcType, tmp2, tmp1, regShift);
-
-                  ir::Register tmp3 = ctx.reg( getFamily(srcType) );
-                  ctx.SHR(srcType, tmp3, src0, regShift);
-
-                  ctx.OR(srcType, dst0, tmp2, tmp3);
-                }
-                break;
-              case 4:
-                {
-                  ir::Type srcType = getType(ctx, llvmDstType);
-                  ir::Register tmp1 = ctx.reg(getFamily(srcType));
-                  ir::Register tmp2 = ctx.reg(getFamily(srcType));
-                  ir::Register tmp3 = ctx.reg(getFamily(srcType));
-                  ir::Register tmp4 = ctx.reg(getFamily(srcType));
-                  ir::Register tmp5 = ctx.reg(getFamily(srcType));
-                  ir::Register tmp6 = ctx.reg(getFamily(srcType));
-
-                  ir::Register regDWMask = ctx.reg( ir::FAMILY_DWORD );
-                  ir::Register regShift_8 = ctx.reg( ir::FAMILY_DWORD );
-                  ir::Register regShift_24 = ctx.reg( ir::FAMILY_DWORD );
-                  ir::ImmediateIndex wMask_L = ctx.newIntegerImmediate(0x0000FF00, ir::TYPE_S32);
-                  ir::ImmediateIndex wMask_H = ctx.newIntegerImmediate(0x00FF0000, ir::TYPE_S32);
-                  ir::ImmediateIndex shift_8 = ctx.newIntegerImmediate(8, ir::TYPE_S32);
-                  ir::ImmediateIndex shift_24 = ctx.newIntegerImmediate(24, ir::TYPE_S32);
-
-                  ctx.LOADI(ir::TYPE_S32, regShift_24, shift_24);
-                  ctx.SHL(srcType, tmp1, src0, regShift_24);
-
-                  ctx.LOADI(ir::TYPE_S32, regDWMask, wMask_L);
-                  ctx.AND(srcType, tmp2, src0, regDWMask);
-                  ctx.LOADI(ir::TYPE_S32, regShift_8, shift_8);
-                  ctx.SHL(srcType, tmp3, tmp2, regShift_8);
-
-                  ctx.LOADI(ir::TYPE_S32, regDWMask, wMask_H);
-                  ctx.AND(srcType, tmp4, src0, regDWMask);
-                  ctx.LOADI(ir::TYPE_S32, regShift_8, shift_8);
-                  ctx.SHR(makeTypeUnsigned(srcType), tmp5, tmp4, regShift_8);
-
-                  ctx.LOADI(ir::TYPE_S32, regShift_24, shift_24);
-                  ctx.SHR(makeTypeUnsigned(srcType), tmp6, src0, regShift_24);
-
-                  ir::Register tmp7 = ctx.reg(getFamily(srcType));
-                  ir::Register tmp8 = ctx.reg(getFamily(srcType));
-                  ctx.OR(srcType, tmp7, tmp1, tmp3);
-                  ctx.OR(srcType, tmp8, tmp5, tmp6);
-                  ctx.OR(srcType, dst0, tmp7, tmp8);
-                }
-                break;
-              case 8:
-                NOT_IMPLEMENTED;
-                break;
-              default:
-                GBE_ASSERT(0);
-            }
-          }
-          break;
           case Intrinsic::ctlz:
           {
             Type *llvmDstType = I.getType();
@@ -3085,6 +3002,8 @@ namespace gbe
           case Intrinsic::cos: this->emitUnaryCallInst(I,CS,ir::OP_COS); break;
           case Intrinsic::log2: this->emitUnaryCallInst(I,CS,ir::OP_LOG); break;
           case Intrinsic::exp2: this->emitUnaryCallInst(I,CS,ir::OP_EXP); break;
+          case Intrinsic::bswap:
+            this->emitUnaryCallInst(I,CS,ir::OP_BSWAP, getUnsignedType(ctx, I.getType())); break;
           default: NOT_IMPLEMENTED;
         }
       } else {
-- 
1.7.9.5


From junyan.he at inbox.com  Wed Mar  4 02:44:04 2015
From: junyan.he at inbox.com (junyan.he at inbox.com)
Date: Wed,  4 Mar 2015 18:44:04 +0800
Subject: [Beignet] [PATCH 3/6] Add functions to set a0 register.
Message-ID: <1425465844-20800-1-git-send-email-junyan.he@inbox.com>

From: Junyan He <junyan.he at linux.intel.com>

a0, as an address register, plays a very important role in
indirect mode access. We add an auxiliary function to set
its content correctly and efficiently.

Signed-off-by: Junyan He <junyan.he at linux.intel.com>
---
 backend/src/backend/gen_context.cpp |   42 +++++++++++++++++++++++++++++++++++
 backend/src/backend/gen_context.hpp |    3 +++
 2 files changed, 45 insertions(+)

diff --git a/backend/src/backend/gen_context.cpp b/backend/src/backend/gen_context.cpp
index f8748ad..6856510 100644
--- a/backend/src/backend/gen_context.cpp
+++ b/backend/src/backend/gen_context.cpp
@@ -1801,6 +1801,48 @@ namespace gbe
     p->TYPED_WRITE(header, true, bti);
   }
 
+  void GenContext::setA0Content(uint16_t new_a0[16], uint16_t max_offset, int sz) {
+    int16_t diff = new_a0[0] - this->a0[0];
+    if (sz == 0)
+      sz = 8;
+    GBE_ASSERT(sz%2 == 0);
+    GBE_ASSERT(new_a0[0] >= 0 && new_a0[0] < 4096);
+    bool need_reset = false;
+    for (int i = 1; i < sz; i++) {
+      GBE_ASSERT(new_a0[i] >= 0 && new_a0[0] < 4096);
+      int16_t d = new_a0[i] - this->a0[i];
+      if (diff != d) {
+        need_reset = true;
+        break;
+      }
+    }
+
+    GBE_ASSERT(a0[0] + diff < 4096 && a0[0] + diff >= 0);
+    if (!need_reset && diff >= -512 && diff + max_offset <= 511) {
+      return;
+    } else if (!need_reset) {
+      p->push();
+      p->curr.execWidth = 8;
+      p->curr.predicate = GEN_PREDICATE_NONE;
+      p->curr.noMask = 1;
+      p->ADD(GenRegister::retype(GenRegister::addr8(0), GEN_TYPE_W),
+          GenRegister::retype(GenRegister::addr8(0), GEN_TYPE_W), GenRegister::immw(diff));
+      p->pop();
+    } else {
+      p->push();
+      p->curr.execWidth = 1;
+      p->curr.predicate = GEN_PREDICATE_NONE;
+      p->curr.noMask = 1;
+      for (int i = 0; i < sz/2; i++) {
+        p->MOV(GenRegister::retype(GenRegister::addr1(i*2), GEN_TYPE_UD),
+            GenRegister::immud(new_a0[i*2 + 1] << 16 | new_a0[i*2]));
+      }
+      p->pop();
+    }
+    memcpy(this->a0, new_a0, sizeof(char)*8);
+    memset(&this->a0[8], 0, sizeof(char)*8);
+  }
+
   BVAR(OCL_OUTPUT_REG_ALLOC, false);
   BVAR(OCL_OUTPUT_ASM, false);
 
diff --git a/backend/src/backend/gen_context.hpp b/backend/src/backend/gen_context.hpp
index f64b916..6ca88db 100644
--- a/backend/src/backend/gen_context.hpp
+++ b/backend/src/backend/gen_context.hpp
@@ -208,6 +208,9 @@ namespace gbe
     /*! allocate a new curbe register and insert to curbe pool. */
     void allocCurbeReg(ir::Register reg, gbe_curbe_type value, uint32_t subValue = 0);
 
+    uint16_t a0[16];
+    virtual void setA0Content(uint16_t new_a0[16], uint16_t max_offset = 0, int sz = 0);
+
   private:
     CompileErrorCode errCode;
     bool ifEndifFix;
-- 
1.7.9.5


From junyan.he at inbox.com  Wed Mar  4 02:44:10 2015
From: junyan.he at inbox.com (junyan.he at inbox.com)
Date: Wed,  4 Mar 2015 18:44:10 +0800
Subject: [Beignet] [PATCH 4/6] Correct indirect mode encoder setting for Gen7
Message-ID: <1425465850-20835-1-git-send-email-junyan.he@inbox.com>

From: Junyan He <junyan.he at linux.intel.com>

Signed-off-by: Junyan He <junyan.he at linux.intel.com>
---
 backend/src/backend/gen7_encoder.cpp |   14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/backend/src/backend/gen7_encoder.cpp b/backend/src/backend/gen7_encoder.cpp
index ecf5b39..a7d132c 100644
--- a/backend/src/backend/gen7_encoder.cpp
+++ b/backend/src/backend/gen7_encoder.cpp
@@ -118,14 +118,14 @@ namespace gbe
     } else {
       gen7_insn->bits1.ia1.src0_reg_file = GEN_GENERAL_REGISTER_FILE;
       gen7_insn->bits1.ia1.src0_reg_type = reg.type;
-      gen7_insn->bits2.ia1.src0_subreg_nr = 0;
-      gen7_insn->bits2.ia1.src0_indirect_offset = 0;
-      gen7_insn->bits2.ia1.src0_abs = 0;
-      gen7_insn->bits2.ia1.src0_negate = 0;
+      gen7_insn->bits2.ia1.src0_subreg_nr = reg.a0_subnr;
+      gen7_insn->bits2.ia1.src0_indirect_offset = reg.addr_imm;
+      gen7_insn->bits2.ia1.src0_abs = reg.absolute;
+      gen7_insn->bits2.ia1.src0_negate = reg.negation;
       gen7_insn->bits2.ia1.src0_address_mode = reg.address_mode;
-      gen7_insn->bits2.ia1.src0_horiz_stride = GEN_HORIZONTAL_STRIDE_0;
-      gen7_insn->bits2.ia1.src0_width = GEN_WIDTH_1;
-      gen7_insn->bits2.ia1.src0_vert_stride = GEN_VERTICAL_STRIDE_ONE_DIMENSIONAL;
+      gen7_insn->bits2.ia1.src0_horiz_stride = reg.hstride;
+      gen7_insn->bits2.ia1.src0_width = reg.width;
+      gen7_insn->bits2.ia1.src0_vert_stride = reg.vstride;
     }
   }
 
-- 
1.7.9.5


From junyan.he at inbox.com  Wed Mar  4 02:44:17 2015
From: junyan.he at inbox.com (junyan.he at inbox.com)
Date: Wed,  4 Mar 2015 18:44:17 +0800
Subject: [Beignet] [PATCH 5/6] Handle the bswap using indirect mode access.
Message-ID: <1425465857-20869-1-git-send-email-junyan.he@inbox.com>

From: Junyan He <junyan.he at linux.intel.com>

The swap for short will be like:
mov(1)   a0<1>:UD        0xe600e61UD            { align1 WE_all };
mov(1)   a0.1<1>:UD      0xe620e63UD            { align1 WE_all };
mov(1)   a0.2<1>:UD      0xe640e65UD            { align1 WE_all };
mov(1)   a0.3<1>:UD      0xe660e67UD            { align1 WE_all };
mov(8)   g114<1>:UB      g[a0]<VxH,1,0>:UB      { align1 WE_all 1Q };
mov(8)   g114.8<1>:UB    g[a0 8]<VxH,1,0>:UB    { align1 WE_all 1Q };
mov(8)   g114.16<1>:UB   g[a0 16]<VxH,1,0>:UB   { align1 WE_all 1Q };
mov(8)   g114.24<1>:UB   g[a0 24]<VxH,1,0>:UB   { align1 WE_all 1Q };
mov(16)  g113<1>:UW      g114<8,8,1>:UW         { align1 WE_normal 1H };

Signed-off-by: Junyan He <junyan.he at linux.intel.com>
---
 backend/src/backend/gen_context.cpp        |  112 ++++++++++++++++++++++++++++
 backend/src/backend/gen_insn_selection.cpp |    9 +++
 backend/src/backend/gen_insn_selection.hxx |    1 +
 3 files changed, 122 insertions(+)

diff --git a/backend/src/backend/gen_context.cpp b/backend/src/backend/gen_context.cpp
index 6856510..46b4a06 100644
--- a/backend/src/backend/gen_context.cpp
+++ b/backend/src/backend/gen_context.cpp
@@ -297,6 +297,118 @@ namespace gbe
           p->MOV(dst.top_half(this->simdWidth), GenRegister::immud(0));
         break;
       }
+      case SEL_OP_BSWAP: {
+        uint32_t simd = p->curr.execWidth;
+        GBE_ASSERT(simd == 8 || simd == 16 || simd == 1);
+        uint16_t new_a0[16];
+        memset(new_a0, 0, sizeof(new_a0));
+
+        GBE_ASSERT(src.type == dst.type);
+        uint32_t start_addr = src.nr*32 + src.subnr;
+
+        if (simd == 1) {
+          GBE_ASSERT(src.hstride == GEN_HORIZONTAL_STRIDE_0
+              && dst.hstride == GEN_HORIZONTAL_STRIDE_0);
+          if (src.type == GEN_TYPE_UD || src.type == GEN_TYPE_D) {
+            GBE_ASSERT(start_addr >= 0);
+            new_a0[0] = start_addr + 3;
+            new_a0[1] = start_addr + 2;
+            new_a0[2] = start_addr + 1;
+            new_a0[3] = start_addr;
+            this->setA0Content(new_a0, 0, 4);
+
+            p->push();
+            p->curr.execWidth = 4;
+            p->curr.predicate = GEN_PREDICATE_NONE;
+            p->curr.noMask = 1;
+            GenRegister ind_src = GenRegister::to_indirect1xN(GenRegister::retype(src, GEN_TYPE_UB),
+                a0[0], new_a0[0] - a0[0]);
+            GenRegister dst_ = dst;
+            dst_.type = GEN_TYPE_UB;
+            dst_.hstride = GEN_HORIZONTAL_STRIDE_1;
+            dst_.width = GEN_WIDTH_4;
+            dst_.vstride = GEN_VERTICAL_STRIDE_4;
+            p->MOV(dst_, ind_src);
+            p->pop();
+          } else if (src.type == GEN_TYPE_UW || src.type == GEN_TYPE_W) {
+            p->MOV(GenRegister::retype(dst, GEN_TYPE_UB),
+                GenRegister::retype(GenRegister::offset(src, 0, 1), GEN_TYPE_UB));
+            p->MOV(GenRegister::retype(GenRegister::offset(dst, 0, 1), GEN_TYPE_UB),
+                GenRegister::retype(src, GEN_TYPE_UB));
+          } else {
+            GBE_ASSERT(0);
+          }
+        } else {
+          if (src.type == GEN_TYPE_UD || src.type == GEN_TYPE_D) {
+            GBE_ASSERT(src.subnr == 0);
+            GBE_ASSERT(dst.subnr == 0);
+            GBE_ASSERT(tmp.subnr == 0);
+            GBE_ASSERT(start_addr >= 0);
+            new_a0[0] = start_addr + 3;
+            new_a0[1] = start_addr + 2;
+            new_a0[2] = start_addr + 1;
+            new_a0[3] = start_addr;
+            new_a0[4] = start_addr + 7;
+            new_a0[5] = start_addr + 6;
+            new_a0[6] = start_addr + 5;
+            new_a0[7] = start_addr + 4;
+            this->setA0Content(new_a0, 56);
+
+            p->push();
+            p->curr.execWidth = 8;
+            p->curr.predicate = GEN_PREDICATE_NONE;
+            p->curr.noMask = 1;
+            GenRegister ind_src = GenRegister::to_indirect1xN(GenRegister::retype(src, GEN_TYPE_UB),
+                a0[0], new_a0[0] - a0[0]);
+            p->MOV(GenRegister::retype(tmp, GEN_TYPE_UB), ind_src);
+            for (int i = 1; i < 4; i++) {
+              ind_src.addr_imm += 8;
+              p->MOV(GenRegister::offset(GenRegister::retype(tmp, GEN_TYPE_UB), 0, 8*i), ind_src);
+            }
+            if (simd == 16) {
+              for (int i = 0; i < 4; i++) {
+                ind_src.addr_imm += 8;
+                p->MOV(GenRegister::offset(GenRegister::retype(tmp, GEN_TYPE_UB), 1, 8*i), ind_src);
+              }
+            }
+            p->pop();
+
+            p->MOV(dst, tmp);
+          } else if (src.type == GEN_TYPE_UW || src.type == GEN_TYPE_W) {
+            GBE_ASSERT(src.subnr == 0 || src.subnr == 16);
+            GBE_ASSERT(dst.subnr == 0 || dst.subnr == 16);
+            GBE_ASSERT(tmp.subnr == 0 || tmp.subnr == 16);
+            GBE_ASSERT(start_addr >= 0);
+            new_a0[0] = start_addr + 1;
+            new_a0[1] = start_addr;
+            new_a0[2] = start_addr + 3;
+            new_a0[3] = start_addr + 2;
+            new_a0[4] = start_addr + 5;
+            new_a0[5] = start_addr + 4;
+            new_a0[6] = start_addr + 7;
+            new_a0[7] = start_addr + 6;
+            this->setA0Content(new_a0, 56);
+
+            p->push();
+            p->curr.execWidth = 8;
+            p->curr.predicate = GEN_PREDICATE_NONE;
+            p->curr.noMask = 1;
+            GenRegister ind_src = GenRegister::to_indirect1xN(GenRegister::retype(src, GEN_TYPE_UB),
+                a0[0], new_a0[0] - a0[0]);
+            p->MOV(GenRegister::retype(tmp, GEN_TYPE_UB), ind_src);
+            for (int i = 1; i < (simd == 8 ? 2 : 4); i++) {
+              ind_src.addr_imm += 8;
+              p->MOV(GenRegister::offset(GenRegister::retype(tmp, GEN_TYPE_UB), 0, 8*i), ind_src);
+            }
+            p->pop();
+
+            p->MOV(dst, tmp);
+          } else {
+            GBE_ASSERT(0);
+          }
+        }
+      }
+      break;
       default:
         NOT_IMPLEMENTED;
     }
diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp
index d100f80..2b166b1 100644
--- a/backend/src/backend/gen_insn_selection.cpp
+++ b/backend/src/backend/gen_insn_selection.cpp
@@ -498,6 +498,7 @@ namespace gbe
     ALU1(RNDE)
     ALU1(F16TO32)
     ALU1(F32TO16)
+    ALU1WithTemp(BSWAP)
     ALU2(SEL)
     ALU2(SEL_INT64)
     ALU1(NOT)
@@ -2121,6 +2122,14 @@ namespace gbe
           case ir::OP_SQR: sel.MATH(dst, GEN_MATH_FUNCTION_SQRT, src); break;
           case ir::OP_RSQ: sel.MATH(dst, GEN_MATH_FUNCTION_RSQ, src); break;
           case ir::OP_RCP: sel.MATH(dst, GEN_MATH_FUNCTION_INV, src); break;
+          case ir::OP_BSWAP:
+            {
+              ir::Register tmp = sel.reg(getFamily(insnType));
+              const GenRegister src_ = GenRegister::retype(src, getGenType(insnType));
+              const GenRegister dst_ = GenRegister::retype(dst, getGenType(insnType));
+              sel.BSWAP(dst_, src_, sel.selReg(tmp, insnType));
+              break;
+            }
           case ir::OP_SIMD_ANY:
             {
               const GenRegister constZero = GenRegister::immuw(0);;
diff --git a/backend/src/backend/gen_insn_selection.hxx b/backend/src/backend/gen_insn_selection.hxx
index be1f7ec..09f5aaf 100644
--- a/backend/src/backend/gen_insn_selection.hxx
+++ b/backend/src/backend/gen_insn_selection.hxx
@@ -1,5 +1,6 @@
 DECL_SELECTION_IR(LABEL, LabelInstruction)
 DECL_SELECTION_IR(MOV, UnaryInstruction)
+DECL_SELECTION_IR(BSWAP, UnaryWithTempInstruction)
 DECL_SELECTION_IR(MOV_DF, UnaryWithTempInstruction)
 DECL_SELECTION_IR(LOAD_DF_IMM, UnaryWithTempInstruction)
 DECL_SELECTION_IR(LOAD_INT64_IMM, UnaryInstruction)
-- 
1.7.9.5


From junyan.he at inbox.com  Wed Mar  4 02:44:24 2015
From: junyan.he at inbox.com (junyan.he at inbox.com)
Date: Wed,  4 Mar 2015 18:44:24 +0800
Subject: [Beignet] [PATCH 6/6] Modify the bswap test case.
Message-ID: <1425465864-20903-1-git-send-email-junyan.he@inbox.com>

From: Junyan He <junyan.he at linux.intel.com>

Signed-off-by: Junyan He <junyan.he at linux.intel.com>
---
 kernels/compiler_bswap.cl |   24 +++---
 utests/compiler_bswap.cpp |  203 +++++++++++++++++++++++++++++++--------------
 2 files changed, 156 insertions(+), 71 deletions(-)

diff --git a/kernels/compiler_bswap.cl b/kernels/compiler_bswap.cl
index 97313b1..3a0a373 100644
--- a/kernels/compiler_bswap.cl
+++ b/kernels/compiler_bswap.cl
@@ -1,13 +1,17 @@
-#define TEST_TYPE(TYPE, LENGTH)                                       \
-kernel void compiler_bswap_##TYPE(global TYPE * src, global TYPE * dst){ \
-   dst[get_global_id(0)]= __builtin_bswap##LENGTH(src[get_global_id(0)]); \
-   dst[get_global_id(0)]= __builtin_bswap##LENGTH(dst[get_global_id(0)] -1 ); \
-}
+kernel void compiler_bswap(global uint * src0, global uint * dst0, global ushort * src1, global ushort * dst1,
+    int src2, global int * dst2,  short src3, global short * dst3) {
+  if (get_global_id(0) % 2 == 0) {
+    dst0[get_global_id(0)] = __builtin_bswap32(src0[get_global_id(0)]);
+  } else {
+    dst0[get_global_id(0)] = src0[get_global_id(0)];
+  }
 
+  dst1[get_global_id(0)] = __builtin_bswap16(src1[get_global_id(0)]);
+  if (get_global_id(0) % 2 == 1) {
+    dst1[get_global_id(0)] = __builtin_bswap16(dst1[get_global_id(0)] + 1);
+  }
 
-TEST_TYPE(short, 16)
-TEST_TYPE(ushort, 16)
-TEST_TYPE(int, 32)
-TEST_TYPE(uint, 32)
+  dst2[get_global_id(0)] = __builtin_bswap32(src2);
+  dst3[get_global_id(0)] = __builtin_bswap16(src3);
+}
 
-#undef TEST_TYPE
diff --git a/utests/compiler_bswap.cpp b/utests/compiler_bswap.cpp
index 9475b99..3af9ef5 100644
--- a/utests/compiler_bswap.cpp
+++ b/utests/compiler_bswap.cpp
@@ -1,7 +1,6 @@
 #include "utest_helper.hpp"
 #include "string.h"
 
-namespace {
 #define cpu_htons(A)     ((((uint16_t)(A) & 0xff00) >> 8) | \
     (((uint16_t)(A) & 0x00ff) << 8))
 #define cpu_htonl(A)     ((((uint32_t)(A) & 0xff000000) >> 24) | \
@@ -9,108 +8,190 @@ namespace {
     (((uint32_t)(A) & 0x0000ff00) << 8) | \
     (((uint32_t)(A) & 0x000000ff) << 24))
 
+
+template <typename T> static void gen_rand_val(T & val)
+{
+  val = static_cast<T>(rand());//(0xAABBCCDD);//
+}
+
 template <typename T> static void cpu(int global_id, T *src, T *dst)
 {
-    T f = src[global_id];
-    T g = 0;
-    if(sizeof(T) == sizeof(int16_t))
-      g = cpu_htons(f);
-    else if(sizeof(T) == sizeof(int32_t))
-      g = cpu_htonl(f);
-    dst[global_id] = g;
+  T f = src[global_id];
+  T g = 0;
+  if (sizeof(T) == sizeof(int16_t))
+    g = cpu_htons(f);
+  else if (sizeof(T) == sizeof(int32_t))
+    g = cpu_htonl(f);
+  dst[global_id] = g;
 }
 
-template <typename T> static void gen_rand_val (T & val)
+template <typename T> static void cpu(int global_id, T src, T *dst)
 {
-    val = static_cast<T>(rand() );
+  T f = src;
+  T g = 0;
+  if (sizeof(T) == sizeof(int16_t))
+    g = cpu_htons(f);
+  else if (sizeof(T) == sizeof(int32_t))
+    g = cpu_htonl(f);
+  dst[global_id] = g;
 }
 
-template <typename T>
-inline static void print_data (T& val)
+template <typename T> inline static void print_data(T& val)
 {
-    if(sizeof(T) == sizeof(uint16_t))
-        printf(" %hx", val);
-    else
-        printf(" %x", val);
+  if(sizeof(T) == sizeof(uint16_t))
+    printf(" 0x%hx", val);
+  else
+    printf(" 0x%x", val);
 }
 
-template <typename T> static void dump_data (T* src, T* dst, int n)
+template <typename T> static void dump_data(T* raw, T* cpu, T* gpu, int n)
 {
-    printf("\nRaw: \n");
-    for (int32_t i = 0; i < (int32_t) n; ++i) {
-        print_data(((T *)buf_data[0])[i]);
-    }
+  printf("\nRaw: \n");
+  for (int32_t i = 0; i < (int32_t) n; ++i) {
+    print_data(raw[i]);
+  }
 
-    printf("\nCPU: \n");
-    for (int32_t i = 0; i < (int32_t) n; ++i) {
-        print_data(dst[i]);
-    }
-    printf("\nGPU: \n");
-    for (int32_t i = 0; i < (int32_t) n; ++i) {
-        print_data(((T *)buf_data[1])[i]);
-    }
+  printf("\nCPU: \n");
+  for (int32_t i = 0; i < (int32_t) n; ++i) {
+    print_data(cpu[i]);
+  }
+  printf("\nGPU: \n");
+  for (int32_t i = 0; i < (int32_t) n; ++i) {
+    print_data(gpu[i]);
+  }
 }
 
-template<typename T>
-void test(const char *kernel_name)
+template <typename T> static void dump_data(T raw, T* cpu, T* gpu, int n)
 {
-  const size_t n = 64;
-  T cpu_dst[n];
-  T cpu_src[n];
+  printf("\nRaw: \n");
+  print_data(raw);
+
+  printf("\nCPU: \n");
+  for (int32_t i = 0; i < (int32_t) n; ++i) {
+    print_data(cpu[i]);
+  }
+  printf("\nGPU: \n");
+  for (int32_t i = 0; i < (int32_t) n; ++i) {
+    print_data(gpu[i]);
+  }
+}
+
+void compiler_bswap(void)
+{
+  const size_t n = 32;
+  uint32_t src0[n];
+  uint16_t src1[n];
+  uint32_t dst0[n];
+  uint16_t dst1[n];
+  int32_t src2 = static_cast<int32_t>(rand());
+  int32_t dst2[n];
+  int16_t src3 = static_cast<int16_t>(rand());
+  int16_t dst3[n];
 
   // Setup kernel and buffers
-  OCL_CREATE_KERNEL_FROM_FILE("compiler_bswap", kernel_name);
-  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(T), NULL);
-  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(T), NULL);
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_bswap", "compiler_bswap");
+  OCL_CREATE_BUFFER(buf[0], 0, sizeof(src0), NULL);
   OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_CREATE_BUFFER(buf[1], 0, sizeof(dst0), NULL);
   OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
 
+  OCL_CREATE_BUFFER(buf[2], 0, sizeof(src1), NULL);
+  OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+  OCL_CREATE_BUFFER(buf[3], 0, sizeof(dst1), NULL);
+  OCL_SET_ARG(3, sizeof(cl_mem), &buf[3]);
+
+  OCL_SET_ARG(4, sizeof(int32_t), &src2);
+  OCL_CREATE_BUFFER(buf[4], 0, sizeof(dst2), NULL);
+  OCL_SET_ARG(5, sizeof(cl_mem), &buf[4]);
+
+  OCL_SET_ARG(6, sizeof(int16_t), &src3);
+  OCL_CREATE_BUFFER(buf[5], 0, sizeof(dst3), NULL);
+  OCL_SET_ARG(7, sizeof(cl_mem), &buf[5]);
+
   OCL_MAP_BUFFER(0);
   for (int32_t i = 0; i < (int32_t) n; ++i) {
-    gen_rand_val(cpu_src[i]);
+    gen_rand_val(src0[i]);
   }
-
-  memcpy(buf_data[0], cpu_src, sizeof(T) * n);
+  memcpy(buf_data[0], src0, sizeof(src0));
+  OCL_UNMAP_BUFFER(0);
 
   /* Clear the dst buffer to avoid random data. */
   OCL_MAP_BUFFER(1);
-  memset(buf_data[1], 0, sizeof(T) * n);
+  memset(buf_data[1], 0, sizeof(dst0));
   OCL_UNMAP_BUFFER(1);
 
+  OCL_MAP_BUFFER(2);
+  for (int32_t i = 0; i < (int32_t) n; ++i) {
+    gen_rand_val(src1[i]);
+  }
+  memcpy(buf_data[2], src1, sizeof(src1));
+  OCL_UNMAP_BUFFER(2);
+
+  /* Clear the dst buffer to avoid random data. */
+  OCL_MAP_BUFFER(3);
+  memset(buf_data[3], 0, sizeof(dst1));
+  OCL_UNMAP_BUFFER(3);
+
+  /* Clear the dst buffer to avoid random data. */
+  OCL_MAP_BUFFER(4);
+  memset(buf_data[4], 0, sizeof(dst2));
+  OCL_UNMAP_BUFFER(4);
+
+  /* Clear the dst buffer to avoid random data. */
+  OCL_MAP_BUFFER(5);
+  memset(buf_data[5], 0, sizeof(dst3));
+  OCL_UNMAP_BUFFER(5);
+
   globals[0] = n;
   locals[0] = 16;
   OCL_NDRANGE(1);
 
   // Run on CPU
-  for (int32_t i = 0; i < (int32_t) n; ++i)
-    cpu(i, cpu_src, cpu_dst);
+  for (int32_t i = 0; i < (int32_t) n; ++i) {
+    if (i%2) {
+      dst0[i] = src0[i];
+      continue;
+    }
+    cpu(i, src0, dst0);
+  }
+
+  // Run on CPU
+  for (int32_t i = 0; i < (int32_t) n; ++i) {
+    cpu(i, src1, dst1);
+
+    if (i%2) {
+      dst1[i] = dst1[i] + 1;
+      cpu(i, dst1, dst1);
+    }
+  }
 
+  // Run on CPU
   for (int32_t i = 0; i < (int32_t) n; ++i)
-    cpu_dst[i] = cpu_dst[i] -1;
+    cpu(i, src2, dst2);
 
   // Run on CPU
   for (int32_t i = 0; i < (int32_t) n; ++i)
-    cpu(i, cpu_dst, cpu_dst);
+    cpu(i, src3, dst3);
 
   OCL_MAP_BUFFER(1);
- // dump_data(cpu_src, cpu_dst, n);
+  //dump_data(src0, dst0, (uint32_t *)buf_data[1], n);
+  OCL_ASSERT(!memcmp(buf_data[1], dst0, sizeof(dst0)));
+  OCL_UNMAP_BUFFER(1);
 
-  OCL_ASSERT(!memcmp(buf_data[1], cpu_dst, sizeof(T) * n));
+  OCL_MAP_BUFFER(3);
+  //dump_data(src1, dst1, (uint16_t *)buf_data[3], n);
+  OCL_ASSERT(!memcmp(buf_data[3], dst1, sizeof(dst1)));
+  OCL_UNMAP_BUFFER(3);
 
-  OCL_UNMAP_BUFFER(1);
-  OCL_UNMAP_BUFFER(0);
-}
+  OCL_MAP_BUFFER(4);
+  //dump_data(src2, dst2, (int32_t *)buf_data[4], n);
+  OCL_ASSERT(!memcmp(buf_data[4], dst2, sizeof(dst2)));
+  OCL_UNMAP_BUFFER(4);
 
+  OCL_MAP_BUFFER(5);
+  //dump_data(src3, dst3, (int16_t *)buf_data[5], n);
+  OCL_ASSERT(!memcmp(buf_data[5], dst3, sizeof(dst3)));
+  OCL_UNMAP_BUFFER(5);
 }
 
-#define compiler_bswap(type, kernel) \
-static void compiler_bswap_ ##type(void)\
-{\
-  test<type>(# kernel);\
-}\
-MAKE_UTEST_FROM_FUNCTION(compiler_bswap_ ## type);
-
-compiler_bswap(int16_t, compiler_bswap_short)
-compiler_bswap(uint16_t, compiler_bswap_ushort)
-compiler_bswap(int32_t, compiler_bswap_int)
-compiler_bswap(uint32_t, compiler_bswap_uint)
+MAKE_UTEST_FROM_FUNCTION(compiler_bswap);
-- 
1.7.9.5


From ruiling.song at intel.com  Wed Mar  4 18:56:52 2015
From: ruiling.song at intel.com (Song, Ruiling)
Date: Thu, 5 Mar 2015 02:56:52 +0000
Subject: [Beignet] Preventing zero GPU virtual address allocation
Message-ID: <148B1B7A67D1C24B9EF0BE42EA4977062B7DCBFB@SHSMSX101.ccr.corp.intel.com>

Hi Daniel,

The OpenCL language supports NULL pointers, and using zero as the NULL pointer is the obvious way to implement them. That means address zero will be treated as an invalid address,
which requires that drm never allocate offset zero to a drm buffer. David (on the CC list) has helped us make a patch; please see the attachment. The logic is only for
ppgtt, and he said that offset zero is used under ggtt. My question is: what is offset zero used for under ggtt? Will that guarantee that zero is never allocated to a drm buffer object?

Ruiling
-------------- next part --------------
A non-text attachment was scrubbed...
Name: nozerooffset2.patch
Type: application/octet-stream
Size: 583 bytes
Desc: nozerooffset2.patch
URL: <http://lists.freedesktop.org/archives/beignet/attachments/20150305/db516d4e/attachment.obj>

From zhigang.gong at linux.intel.com  Wed Mar  4 20:35:55 2015
From: zhigang.gong at linux.intel.com (Zhigang Gong)
Date: Thu, 5 Mar 2015 12:35:55 +0800
Subject: [Beignet] [PATCH] drm/i915: Export total subslice and EU counts
In-Reply-To: <1425339452-18875-1-git-send-email-jeff.mcgee@intel.com>
References: <1425339452-18875-1-git-send-email-jeff.mcgee@intel.com>
Message-ID: <20150305043555.GA20578@ivb-gt2-rev4>

There is one minor conflict when applying the KMD patch to the latest
drm-intel-nightly branch. It should be easy to fix.

Another issue is that, IMO, we should bump libdrm's version number
when adding these new APIs. Then in Beignet, we can check the
libdrm version at build time and determine whether to use
these new interfaces. Thus, we can avoid breaking Beignet on
those systems which have an older libdrm/kernel installed.

The other parts of the whole patchset,
including patches for KMD/libdrm/Intel gpu tools and Beignet,
all look good to me.

And I just tested it on BDW and SKL platforms, it works fine.

Thanks,
Zhigang Gong.

On Mon, Mar 02, 2015 at 03:37:32PM -0800, jeff.mcgee at intel.com wrote:
> From: Jeff McGee <jeff.mcgee at intel.com>
> 
> Setup new I915_GETPARAM ioctl entries for subslice total and
> EU total. Userspace drivers need these values when constructing
> GPGPU commands. This kernel query method is intended to replace
> the PCI ID-based tables that userspace drivers currently maintain.
> The kernel driver can employ fuse register reads as needed to
> ensure the most accurate determination of GT config attributes.
> This first became important with Cherryview in which the config
> could differ between devices with the same PCI ID.
> 
> The kernel detection of these values is device-specific and not
> included in this patch. Because zero is not a valid value for any of
> these parameters, a value of zero is interpreted as unknown for the
> device. Userspace drivers should continue to maintain ID-based tables
> for older devices not supported by the new query method.
> 
> For: VIZ-4636
> Signed-off-by: Jeff McGee <jeff.mcgee at intel.com>
> ---
>  drivers/gpu/drm/i915/i915_dma.c | 10 ++++++++++
>  include/uapi/drm/i915_drm.h     |  2 ++
>  2 files changed, 12 insertions(+)
> 
> diff --git a/drivers/gpu/drm/i915/i915_dma.c b/drivers/gpu/drm/i915/i915_dma.c
> index 053e178..9350ea2 100644
> --- a/drivers/gpu/drm/i915/i915_dma.c
> +++ b/drivers/gpu/drm/i915/i915_dma.c
> @@ -150,6 +150,16 @@ static int i915_getparam(struct drm_device *dev, void *data,
>  	case I915_PARAM_MMAP_VERSION:
>  		value = 1;
>  		break;
> +	case I915_PARAM_SUBSLICE_TOTAL:
> +		value = INTEL_INFO(dev)->subslice_total;
> +		if (!value)
> +			return -ENODEV;
> +		break;
> +	case I915_PARAM_EU_TOTAL:
> +		value = INTEL_INFO(dev)->eu_total;
> +		if (!value)
> +			return -ENODEV;
> +		break;
>  	default:
>  		DRM_DEBUG("Unknown parameter %d\n", param->param);
>  		return -EINVAL;
> diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
> index 6eed16b..8672efc 100644
> --- a/include/uapi/drm/i915_drm.h
> +++ b/include/uapi/drm/i915_drm.h
> @@ -347,6 +347,8 @@ typedef struct drm_i915_irq_wait {
>  #define I915_PARAM_HAS_COHERENT_PHYS_GTT 29
>  #define I915_PARAM_MMAP_VERSION          30
>  #define I915_PARAM_HAS_BSD2		 31
> +#define I915_PARAM_SUBSLICE_TOTAL	 32
> +#define I915_PARAM_EU_TOTAL		 33
>  
>  typedef struct drm_i915_getparam {
>  	int param;
> -- 
> 2.3.0
> 
> _______________________________________________
> Beignet mailing list
> Beignet at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/beignet

From daniel at ffwll.ch  Thu Mar  5 04:52:51 2015
From: daniel at ffwll.ch (Daniel Vetter)
Date: Thu, 5 Mar 2015 13:52:51 +0100
Subject: [Beignet] [Intel-gfx] Preventing zero GPU virtual address
	allocation
In-Reply-To: <148B1B7A67D1C24B9EF0BE42EA4977062B7DCBFB@SHSMSX101.ccr.corp.intel.com>
References: <148B1B7A67D1C24B9EF0BE42EA4977062B7DCBFB@SHSMSX101.ccr.corp.intel.com>
Message-ID: <20150305125251.GA18775@phenom.ffwll.local>

On Thu, Mar 05, 2015 at 02:56:52AM +0000, Song, Ruiling wrote:
> Hi Daniel,
> 
> OpenCL language support NULL pointer, using zero as the NULL pointer is
> the obvious way. That is zero will be treated as invalid address.  Then
> it requires drm won't allocate zero to drm buffer. And David in CC
> list has help us make a patch, please see attached. The logic is only
> for ppgtt, and he said zero offset is used under ggtt. My question is
> what is offset zero used under ggtt? Will it make sure zero is not
> allocatable to drm buffer object?

The code in i915_gem_execbuf.c already supports an optional bias to avoid
putting a buffer into the first few kb. See __EXEC_OBJECT_NEEDS_BIAS. I
suggest you expose this to userspace, which also addresses your issue that
you didn't add an abi revision flag.

And since this is abi I need open-source userspace and all that.

Thanks, Daniel
-- 
Daniel Vetter
Software Engineer, Intel Corporation
+41 (0) 79 365 57 48 - http://blog.ffwll.ch

From malcolm.i.w.roberts at gmail.com  Thu Mar  5 06:51:21 2015
From: malcolm.i.w.roberts at gmail.com (Malcolm Roberts)
Date: Thu, 05 Mar 2015 15:51:21 +0100
Subject: [Beignet] double precision support
Message-ID: <54F86D69.9080209@gmail.com>

Hello.

I just installed the latest version of beignet from git 
(74390cabe5d2958fec5806a5099bad44c74798f5) and I notice that cl_khr_fp64 
is not available.  I am a bit late for the conversation in July 
(http://lists.freedesktop.org/archives/beignet/2014-July/003599.html) 
but I thought that I would add my voice.

I have a project to do scientific computation using the OpenCL platform, 
and double precision is vital for my work.  I understand that the 
beignet team has limited resources and that one must prioritize, but I 
feel that double-precision would be a great addition.

Best,

~Malcolm Roberts
http://malcolmiwroberts.com/

From daniel at ffwll.ch  Thu Mar  5 07:27:59 2015
From: daniel at ffwll.ch (Daniel Vetter)
Date: Thu, 5 Mar 2015 16:27:59 +0100
Subject: [Beignet] [Intel-gfx] Preventing zero GPU virtual address
	allocation
In-Reply-To: <20150305130121.GA18784@nuc-i3427.alporthouse.com>
References: <148B1B7A67D1C24B9EF0BE42EA4977062B7DCBFB@SHSMSX101.ccr.corp.intel.com>
 <20150305125251.GA18775@phenom.ffwll.local>
 <20150305130121.GA18784@nuc-i3427.alporthouse.com>
Message-ID: <20150305152758.GI18775@phenom.ffwll.local>

On Thu, Mar 05, 2015 at 01:01:21PM +0000, Chris Wilson wrote:
> On Thu, Mar 05, 2015 at 01:52:51PM +0100, Daniel Vetter wrote:
> > On Thu, Mar 05, 2015 at 02:56:52AM +0000, Song, Ruiling wrote:
> > > Hi Daniel,
> > > 
> > > OpenCL language support NULL pointer, using zero as the NULL pointer is
> > > the obvious way. That is zero will be treated as invalid address.  Then
> > > it requires drm won't allocate zero to drm buffer. And David in CC
> > > list has help us make a patch, please see attached. The logic is only
> > > for ppgtt, and he said zero offset is used under ggtt. My question is
> > > what is offset zero used under ggtt? Will it make sure zero is not
> > > allocatable to drm buffer object?
> > 
> > The code in i915_gem_execbuf.c already supports an optional bias to avoid
> > putting a buffer into the first few kb. See __EXEC_OBJECT_NEEDS_BIAS. I
> > suggest you expose this to userspace, which also address your issue that
> > you didn't add an abi revision flag.
> 
> A better API would be to allow userspace to request a buffer to place at
> a specific point in the VM and fail if that is not possible aka
> soft-pinning. Then OCL could assign a bo to offset 0 and detect writes
> to the NULL address if it so desired. With full-ppgtt, userspace can be
> sure of being able to evict any location in its VM and so also allows
> graceful detection of scenarios under which it cannot provide the NULL
> address safety feature (and opt not to run, or just bury its head in the
> sand).

I recommended exposing the PIN_BIAS since that will work without full
ppgtt too. And yeah for full ppgtt we could just use svm where userspace
controls the address, but since that's still a bit out we might need a
quick interim solution?
-Daniel
-- 
Daniel Vetter
Software Engineer, Intel Corporation
+41 (0) 79 365 57 48 - http://blog.ffwll.ch

From chris at chris-wilson.co.uk  Thu Mar  5 05:01:21 2015
From: chris at chris-wilson.co.uk (Chris Wilson)
Date: Thu, 5 Mar 2015 13:01:21 +0000
Subject: [Beignet] [Intel-gfx] Preventing zero GPU virtual address
	allocation
In-Reply-To: <20150305125251.GA18775@phenom.ffwll.local>
References: <148B1B7A67D1C24B9EF0BE42EA4977062B7DCBFB@SHSMSX101.ccr.corp.intel.com>
 <20150305125251.GA18775@phenom.ffwll.local>
Message-ID: <20150305130121.GA18784@nuc-i3427.alporthouse.com>

On Thu, Mar 05, 2015 at 01:52:51PM +0100, Daniel Vetter wrote:
> On Thu, Mar 05, 2015 at 02:56:52AM +0000, Song, Ruiling wrote:
> > Hi Daniel,
> > 
> > OpenCL language support NULL pointer, using zero as the NULL pointer is
> > the obvious way. That is zero will be treated as invalid address.  Then
> > it requires drm won't allocate zero to drm buffer. And David in CC
> > list has help us make a patch, please see attached. The logic is only
> > for ppgtt, and he said zero offset is used under ggtt. My question is
> > what is offset zero used under ggtt? Will it make sure zero is not
> > allocatable to drm buffer object?
> 
> The code in i915_gem_execbuf.c already supports an optional bias to avoid
> putting a buffer into the first few kb. See __EXEC_OBJECT_NEEDS_BIAS. I
> suggest you expose this to userspace, which also address your issue that
> you didn't add an abi revision flag.

A better API would be to allow userspace to request a buffer to place at
a specific point in the VM and fail if that is not possible aka
soft-pinning. Then OCL could assign a bo to offset 0 and detect writes
to the NULL address if it so desired. With full-ppgtt, userspace can be
sure of being able to evict any location in its VM and so also allows
graceful detection of scenarios under which it cannot provide the NULL
address safety feature (and opt not to run, or just bury its head in the
sand).
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre

From chris at chris-wilson.co.uk  Thu Mar  5 13:07:02 2015
From: chris at chris-wilson.co.uk (Chris Wilson)
Date: Thu, 5 Mar 2015 21:07:02 +0000
Subject: [Beignet] [Intel-gfx] Preventing zero GPU virtual address
	allocation
In-Reply-To: <20150305152758.GI18775@phenom.ffwll.local>
References: <148B1B7A67D1C24B9EF0BE42EA4977062B7DCBFB@SHSMSX101.ccr.corp.intel.com>
 <20150305125251.GA18775@phenom.ffwll.local>
 <20150305130121.GA18784@nuc-i3427.alporthouse.com>
 <20150305152758.GI18775@phenom.ffwll.local>
Message-ID: <20150305210702.GC18784@nuc-i3427.alporthouse.com>

On Thu, Mar 05, 2015 at 04:27:59PM +0100, Daniel Vetter wrote:
> I recommended exposing the PIN_BIAS since that will work without full
> ppgtt too. And yeah for full ppgtt we could just use svm where userspace
> controls the address, but since that's still a bit out we might need a
> quick interim solution?

Letting userspace control the address of bo used in a batch is about 2
patches each of ~100 lines. And it could be used with full-ppgtt before
svm if mesa wants to take complete control of its layout. I think it is
one of those useful tools that is likely to find uses far beyond the
initial justification.
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre

From nanhai.zou at intel.com  Thu Mar  5 18:11:18 2015
From: nanhai.zou at intel.com (Zou, Nanhai)
Date: Fri, 6 Mar 2015 02:11:18 +0000
Subject: [Beignet] [Intel-gfx] Preventing zero GPU virtual
	address	allocation
In-Reply-To: <20150305152758.GI18775@phenom.ffwll.local>
References: <148B1B7A67D1C24B9EF0BE42EA4977062B7DCBFB@SHSMSX101.ccr.corp.intel.com>
 <20150305125251.GA18775@phenom.ffwll.local>
 <20150305130121.GA18784@nuc-i3427.alporthouse.com>
 <20150305152758.GI18775@phenom.ffwll.local>
Message-ID: <DF876E69000F0E4DB19B760E3EBA5C7501C5F9C1@SHSMSX103.ccr.corp.intel.com>

I don't understand why we need a complex solution when there is already a simple solution with a patch.
What is the drawback of reserving page 0?
Before we go for that complex solution, could we just reserve page zero?
It is simple and straightforward.

Thanks
Zou Nanhai

> -----Original Message-----
> From: Beignet [mailto:beignet-bounces at lists.freedesktop.org] On Behalf Of
> Daniel Vetter
> Sent: Thursday, March 05, 2015 11:28 PM
> To: Chris Wilson; Daniel Vetter; Song, Ruiling; Vetter, Daniel;
> intel-gfx at lists.freedesktop.org; Yang, Rong R; beignet at lists.freedesktop.org;
> Weinehall, David
> Subject: Re: [Beignet] [Intel-gfx] Preventing zero GPU virtual address allocation
> 
> On Thu, Mar 05, 2015 at 01:01:21PM +0000, Chris Wilson wrote:
> > On Thu, Mar 05, 2015 at 01:52:51PM +0100, Daniel Vetter wrote:
> > > On Thu, Mar 05, 2015 at 02:56:52AM +0000, Song, Ruiling wrote:
> > > > Hi Daniel,
> > > >
> > > > OpenCL language support NULL pointer, using zero as the NULL
> > > > pointer is the obvious way. That is zero will be treated as
> > > > invalid address.  Then it requires drm won't allocate zero to drm
> > > > buffer. And David in CC list has help us make a patch, please see
> > > > attached. The logic is only for ppgtt, and he said zero offset is
> > > > used under ggtt. My question is what is offset zero used under
> > > > ggtt? Will it make sure zero is not allocatable to drm buffer object?
> > >
> > > The code in i915_gem_execbuf.c already supports an optional bias to
> > > avoid putting a buffer into the first few kb. See
> > > __EXEC_OBJECT_NEEDS_BIAS. I suggest you expose this to userspace,
> > > which also address your issue that you didn't add an abi revision flag.
> >
> > A better API would be to allow userspace to request a buffer to place
> > at a specific point in the VM and fail if that is not possible aka
> > soft-pinning. Then OCL could assign a bo to offset 0 and detect writes
> > to the NULL address if it so desired. With full-ppgtt, userspace can
> > be sure of being able to evict any location in its VM and so also
> > allows graceful detection of scenarios under which it cannot provide
> > the NULL address safety feature (and opt not to run, or just bury its
> > head in the sand).
> 
> I recommended exposing the PIN_BIAS since that will work without full ppgtt
> too. And yeah for full ppgtt we could just use svm where userspace controls the
> address, but since that's still a bit out we might need a quick interim solution?
> -Daniel
> --
> Daniel Vetter
> Software Engineer, Intel Corporation
> +41 (0) 79 365 57 48 - http://blog.ffwll.ch
> _______________________________________________
> Beignet mailing list
> Beignet at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/beignet

From junyan.he at inbox.com  Thu Mar  5 23:22:38 2015
From: junyan.he at inbox.com (junyan.he at inbox.com)
Date: Fri,  6 Mar 2015 15:22:38 +0800
Subject: [Beignet] [PATCH 1/7] Backend: Add the indirect fields and
	functions for gen register.
Message-ID: <1425626558-25652-1-git-send-email-junyan.he@inbox.com>

From: Junyan He <junyan.he at linux.intel.com>

Add a0_subnr and addr_imm to GenRegister in order to
represent an indirect register, which may be at some
immediate offset from an a0.x subregister's base address.
Also add the to_indirect1xN helper function to convert a
register to an indirect 1xN register.

V2:
   1. Add Gen8 encoder setting.
   2. Reorder the patches.

Signed-off-by: Junyan He <junyan.he at linux.intel.com>
---
 backend/src/backend/gen_register.hpp |   30 +++++++++++++++++++++++++++++-
 1 file changed, 29 insertions(+), 1 deletion(-)

diff --git a/backend/src/backend/gen_register.hpp b/backend/src/backend/gen_register.hpp
index 08c7277..3b40b67 100644
--- a/backend/src/backend/gen_register.hpp
+++ b/backend/src/backend/gen_register.hpp
@@ -205,6 +205,8 @@ namespace gbe
       this->quarter = 0;
       this->nr = this->subnr = 0;
       this->address_mode = GEN_ADDRESS_DIRECT;
+      this->a0_subnr = 0;
+      this->addr_imm = 0;
     }
 
     /*! For specific physical registers only */
@@ -229,6 +231,8 @@ namespace gbe
       this->hstride = hstride;
       this->quarter = 0;
       this->address_mode = GEN_ADDRESS_DIRECT;
+      this->a0_subnr = 0;
+      this->addr_imm = 0;
     }
 
     /*! Return the IR virtual register */
@@ -258,6 +262,8 @@ namespace gbe
     uint32_t hstride:2;      //!< Horizontal stride
     uint32_t quarter:1;      //!< To choose which part we want (Q1 / Q2)
     uint32_t address_mode:1; //!< direct or indirect
+    uint32_t a0_subnr:4;     //!< In indirect mode, use a0.nr as the base.
+    int32_t addr_imm:10;     //!< In indirect mode, the imm as address offset from a0.
 
     static INLINE GenRegister offset(GenRegister reg, int nr, int subnr = 0) {
       GenRegister r = reg;
@@ -835,6 +841,28 @@ namespace gbe
       return reg;
     }
 
+    /*! convert one register to indirectly mode */
+    static INLINE GenRegister to_indirect1xN(GenRegister reg, uint32_t base_addr,
+                                          int32_t imm_off = 4096, int a0_subnr = 0) {
+      GenRegister r = reg;
+      int32_t offset;
+      if (imm_off > 4095) {
+        offset = (r.nr*32 + r.subnr) - base_addr;
+      } else {
+        offset = imm_off;
+      }
+
+      GBE_ASSERT(offset <= 511 && offset>=-512);
+      r.a0_subnr = a0_subnr;
+      r.addr_imm = offset;
+      r.address_mode = GEN_ADDRESS_REGISTER_INDIRECT_REGISTER;
+
+      r.width = GEN_WIDTH_1;
+      r.vstride = GEN_VERTICAL_STRIDE_ONE_DIMENSIONAL;
+      r.hstride = GEN_HORIZONTAL_STRIDE_0;
+      return r;
+    }
+
     static INLINE GenRegister vec16(uint32_t file, uint32_t nr, uint32_t subnr) {
       return GenRegister(file,
                          nr,
@@ -953,7 +981,7 @@ namespace gbe
     }
 
     static INLINE GenRegister uw1(uint32_t file, uint32_t nr, uint32_t subnr) {
-      return suboffset(retype(vec1(file, nr, 0), GEN_TYPE_UW), subnr);
+      return offset(retype(vec1(file, nr, 0), GEN_TYPE_UW), 0, typeSize(GEN_TYPE_UW)*subnr);
     }
 
     static INLINE GenRegister ub16(uint32_t file, uint32_t nr, uint32_t subnr) {
-- 
1.7.9.5


From junyan.he at inbox.com  Thu Mar  5 23:23:35 2015
From: junyan.he at inbox.com (junyan.he at inbox.com)
Date: Fri,  6 Mar 2015 15:23:35 +0800
Subject: [Beignet] [V2 PATCH 1/7] Backend: Add the indirect fields and
	functions for gen register.
Message-ID: <1425626615-25704-1-git-send-email-junyan.he@inbox.com>

From: Junyan He <junyan.he at linux.intel.com>

Add a0_subnr and addr_imm to GenRegister in order to
represent an indirect register, which may be at some
immediate offset from an a0.x subregister's base address.
Also add the to_indirect1xN helper function to convert a
register to an indirect 1xN register.

V2:
   1. Add Gen8 encoder setting.
   2. Reorder the patches.

Signed-off-by: Junyan He <junyan.he at linux.intel.com>
---
 backend/src/backend/gen_register.hpp |   30 +++++++++++++++++++++++++++++-
 1 file changed, 29 insertions(+), 1 deletion(-)

diff --git a/backend/src/backend/gen_register.hpp b/backend/src/backend/gen_register.hpp
index 08c7277..3b40b67 100644
--- a/backend/src/backend/gen_register.hpp
+++ b/backend/src/backend/gen_register.hpp
@@ -205,6 +205,8 @@ namespace gbe
       this->quarter = 0;
       this->nr = this->subnr = 0;
       this->address_mode = GEN_ADDRESS_DIRECT;
+      this->a0_subnr = 0;
+      this->addr_imm = 0;
     }
 
     /*! For specific physical registers only */
@@ -229,6 +231,8 @@ namespace gbe
       this->hstride = hstride;
       this->quarter = 0;
       this->address_mode = GEN_ADDRESS_DIRECT;
+      this->a0_subnr = 0;
+      this->addr_imm = 0;
     }
 
     /*! Return the IR virtual register */
@@ -258,6 +262,8 @@ namespace gbe
     uint32_t hstride:2;      //!< Horizontal stride
     uint32_t quarter:1;      //!< To choose which part we want (Q1 / Q2)
     uint32_t address_mode:1; //!< direct or indirect
+    uint32_t a0_subnr:4;     //!< In indirect mode, use a0.nr as the base.
+    int32_t addr_imm:10;     //!< In indirect mode, the imm as address offset from a0.
 
     static INLINE GenRegister offset(GenRegister reg, int nr, int subnr = 0) {
       GenRegister r = reg;
@@ -835,6 +841,28 @@ namespace gbe
       return reg;
     }
 
+    /*! convert one register to indirectly mode */
+    static INLINE GenRegister to_indirect1xN(GenRegister reg, uint32_t base_addr,
+                                          int32_t imm_off = 4096, int a0_subnr = 0) {
+      GenRegister r = reg;
+      int32_t offset;
+      if (imm_off > 4095) {
+        offset = (r.nr*32 + r.subnr) - base_addr;
+      } else {
+        offset = imm_off;
+      }
+
+      GBE_ASSERT(offset <= 511 && offset>=-512);
+      r.a0_subnr = a0_subnr;
+      r.addr_imm = offset;
+      r.address_mode = GEN_ADDRESS_REGISTER_INDIRECT_REGISTER;
+
+      r.width = GEN_WIDTH_1;
+      r.vstride = GEN_VERTICAL_STRIDE_ONE_DIMENSIONAL;
+      r.hstride = GEN_HORIZONTAL_STRIDE_0;
+      return r;
+    }
+
     static INLINE GenRegister vec16(uint32_t file, uint32_t nr, uint32_t subnr) {
       return GenRegister(file,
                          nr,
@@ -953,7 +981,7 @@ namespace gbe
     }
 
     static INLINE GenRegister uw1(uint32_t file, uint32_t nr, uint32_t subnr) {
-      return suboffset(retype(vec1(file, nr, 0), GEN_TYPE_UW), subnr);
+      return offset(retype(vec1(file, nr, 0), GEN_TYPE_UW), 0, typeSize(GEN_TYPE_UW)*subnr);
     }
 
     static INLINE GenRegister ub16(uint32_t file, uint32_t nr, uint32_t subnr) {
-- 
1.7.9.5


From junyan.he at inbox.com  Thu Mar  5 23:23:41 2015
From: junyan.he at inbox.com (junyan.he at inbox.com)
Date: Fri,  6 Mar 2015 15:23:41 +0800
Subject: [Beignet] [V2 PATCH 2/7] Backend: Add functions to set a0 register.
Message-ID: <1425626621-25738-1-git-send-email-junyan.he@inbox.com>

From: Junyan He <junyan.he at linux.intel.com>

a0, as an address register, plays a very important role in
indirect mode access. We add auxiliary functions to set
its content correctly and effectively.

Signed-off-by: Junyan He <junyan.he at linux.intel.com>
---
 backend/src/backend/gen_context.cpp |   42 +++++++++++++++++++++++++++++++++++
 backend/src/backend/gen_context.hpp |    3 +++
 2 files changed, 45 insertions(+)

diff --git a/backend/src/backend/gen_context.cpp b/backend/src/backend/gen_context.cpp
index f8748ad..6856510 100644
--- a/backend/src/backend/gen_context.cpp
+++ b/backend/src/backend/gen_context.cpp
@@ -1801,6 +1801,48 @@ namespace gbe
     p->TYPED_WRITE(header, true, bti);
   }
 
+  void GenContext::setA0Content(uint16_t new_a0[16], uint16_t max_offset, int sz) {
+    int16_t diff = new_a0[0] - this->a0[0];
+    if (sz == 0)
+      sz = 8;
+    GBE_ASSERT(sz%2 == 0);
+    GBE_ASSERT(new_a0[0] >= 0 && new_a0[0] < 4096);
+    bool need_reset = false;
+    for (int i = 1; i < sz; i++) {
+      GBE_ASSERT(new_a0[i] >= 0 && new_a0[0] < 4096);
+      int16_t d = new_a0[i] - this->a0[i];
+      if (diff != d) {
+        need_reset = true;
+        break;
+      }
+    }
+
+    GBE_ASSERT(a0[0] + diff < 4096 && a0[0] + diff >= 0);
+    if (!need_reset && diff >= -512 && diff + max_offset <= 511) {
+      return;
+    } else if (!need_reset) {
+      p->push();
+      p->curr.execWidth = 8;
+      p->curr.predicate = GEN_PREDICATE_NONE;
+      p->curr.noMask = 1;
+      p->ADD(GenRegister::retype(GenRegister::addr8(0), GEN_TYPE_W),
+          GenRegister::retype(GenRegister::addr8(0), GEN_TYPE_W), GenRegister::immw(diff));
+      p->pop();
+    } else {
+      p->push();
+      p->curr.execWidth = 1;
+      p->curr.predicate = GEN_PREDICATE_NONE;
+      p->curr.noMask = 1;
+      for (int i = 0; i < sz/2; i++) {
+        p->MOV(GenRegister::retype(GenRegister::addr1(i*2), GEN_TYPE_UD),
+            GenRegister::immud(new_a0[i*2 + 1] << 16 | new_a0[i*2]));
+      }
+      p->pop();
+    }
+    memcpy(this->a0, new_a0, sizeof(char)*8);
+    memset(&this->a0[8], 0, sizeof(char)*8);
+  }
+
   BVAR(OCL_OUTPUT_REG_ALLOC, false);
   BVAR(OCL_OUTPUT_ASM, false);
 
diff --git a/backend/src/backend/gen_context.hpp b/backend/src/backend/gen_context.hpp
index f64b916..6ca88db 100644
--- a/backend/src/backend/gen_context.hpp
+++ b/backend/src/backend/gen_context.hpp
@@ -208,6 +208,9 @@ namespace gbe
     /*! allocate a new curbe register and insert to curbe pool. */
     void allocCurbeReg(ir::Register reg, gbe_curbe_type value, uint32_t subValue = 0);
 
+    uint16_t a0[16];
+    virtual void setA0Content(uint16_t new_a0[16], uint16_t max_offset = 0, int sz = 0);
+
   private:
     CompileErrorCode errCode;
     bool ifEndifFix;
-- 
1.7.9.5


From junyan.he at inbox.com  Thu Mar  5 23:23:47 2015
From: junyan.he at inbox.com (junyan.he at inbox.com)
Date: Fri,  6 Mar 2015 15:23:47 +0800
Subject: [Beignet] [V2 PATCH 3/7] Backend: Correct indirect mode encoder
	setting for Gen7.
Message-ID: <1425626627-25772-1-git-send-email-junyan.he@inbox.com>

From: Junyan He <junyan.he at linux.intel.com>

Signed-off-by: Junyan He <junyan.he at linux.intel.com>
---
 backend/src/backend/gen7_encoder.cpp |   14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/backend/src/backend/gen7_encoder.cpp b/backend/src/backend/gen7_encoder.cpp
index ecf5b39..a7d132c 100644
--- a/backend/src/backend/gen7_encoder.cpp
+++ b/backend/src/backend/gen7_encoder.cpp
@@ -118,14 +118,14 @@ namespace gbe
     } else {
       gen7_insn->bits1.ia1.src0_reg_file = GEN_GENERAL_REGISTER_FILE;
       gen7_insn->bits1.ia1.src0_reg_type = reg.type;
-      gen7_insn->bits2.ia1.src0_subreg_nr = 0;
-      gen7_insn->bits2.ia1.src0_indirect_offset = 0;
-      gen7_insn->bits2.ia1.src0_abs = 0;
-      gen7_insn->bits2.ia1.src0_negate = 0;
+      gen7_insn->bits2.ia1.src0_subreg_nr = reg.a0_subnr;
+      gen7_insn->bits2.ia1.src0_indirect_offset = reg.addr_imm;
+      gen7_insn->bits2.ia1.src0_abs = reg.absolute;
+      gen7_insn->bits2.ia1.src0_negate = reg.negation;
       gen7_insn->bits2.ia1.src0_address_mode = reg.address_mode;
-      gen7_insn->bits2.ia1.src0_horiz_stride = GEN_HORIZONTAL_STRIDE_0;
-      gen7_insn->bits2.ia1.src0_width = GEN_WIDTH_1;
-      gen7_insn->bits2.ia1.src0_vert_stride = GEN_VERTICAL_STRIDE_ONE_DIMENSIONAL;
+      gen7_insn->bits2.ia1.src0_horiz_stride = reg.hstride;
+      gen7_insn->bits2.ia1.src0_width = reg.width;
+      gen7_insn->bits2.ia1.src0_vert_stride = reg.vstride;
     }
   }
 
-- 
1.7.9.5


From junyan.he at inbox.com  Thu Mar  5 23:23:53 2015
From: junyan.he at inbox.com (junyan.he at inbox.com)
Date: Fri,  6 Mar 2015 15:23:53 +0800
Subject: [Beignet] [V2 PATCH 4/7] Backend: Correct indirect mode encoder
	setting for Gen8.
Message-ID: <1425626633-25806-1-git-send-email-junyan.he@inbox.com>

From: Junyan He <junyan.he at linux.intel.com>

Signed-off-by: Junyan He <junyan.he at linux.intel.com>
---
 backend/src/backend/gen8_encoder.cpp |   15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/backend/src/backend/gen8_encoder.cpp b/backend/src/backend/gen8_encoder.cpp
index 92aad64..48419aa 100644
--- a/backend/src/backend/gen8_encoder.cpp
+++ b/backend/src/backend/gen8_encoder.cpp
@@ -369,14 +369,15 @@ namespace gbe
     } else {
       gen8_insn->bits1.ia1.src0_reg_file = GEN_GENERAL_REGISTER_FILE;
       gen8_insn->bits1.ia1.src0_reg_type = reg.type;
-      gen8_insn->bits2.ia1.src0_subreg_nr = 0;
-      gen8_insn->bits2.ia1.src0_indirect_offset = 0;
-      gen8_insn->bits2.ia1.src0_abs = 0;
-      gen8_insn->bits2.ia1.src0_negate = 0;
+      gen8_insn->bits2.ia1.src0_subreg_nr = reg.a0_subnr;
+      gen8_insn->bits2.ia1.src0_indirect_offset = (reg.addr_imm & 0x1ff);
+      gen8_insn->bits2.ia1.src0_abs = reg.absolute;
+      gen8_insn->bits2.ia1.src0_negate = reg.negation;
       gen8_insn->bits2.ia1.src0_address_mode = reg.address_mode;
-      gen8_insn->bits2.ia1.src0_horiz_stride = GEN_HORIZONTAL_STRIDE_0;
-      gen8_insn->bits2.ia1.src0_width = GEN_WIDTH_1;
-      gen8_insn->bits2.ia1.src0_vert_stride = GEN_VERTICAL_STRIDE_ONE_DIMENSIONAL;
+      gen8_insn->bits2.ia1.src0_horiz_stride = reg.hstride;
+      gen8_insn->bits2.ia1.src0_width = reg.width;
+      gen8_insn->bits2.ia1.src0_vert_stride = reg.vstride;
+      gen8_insn->bits2.ia1.src0_indirect_offset_9 = (reg.addr_imm & 0x02) >> 9;
     }
   }
 
-- 
1.7.9.5


From junyan.he at inbox.com  Thu Mar  5 23:24:00 2015
From: junyan.he at inbox.com (junyan.he at inbox.com)
Date: Fri,  6 Mar 2015 15:24:00 +0800
Subject: [Beignet] [V2 PATCH 5/7] Backend: Handle the bswap using indirect
	mode access.
Message-ID: <1425626640-25840-1-git-send-email-junyan.he@inbox.com>

From: Junyan He <junyan.he at linux.intel.com>

The byte swap for a short will look like:
mov(1)   a0<1>:UD        0xe600e61UD            { align1 WE_all };
mov(1)   a0.1<1>:UD      0xe620e63UD            { align1 WE_all };
mov(1)   a0.2<1>:UD      0xe640e65UD            { align1 WE_all };
mov(1)   a0.3<1>:UD      0xe660e67UD            { align1 WE_all };
mov(8)   g114<1>:UB      g[a0]<VxH,1,0>:UB      { align1 WE_all 1Q };
mov(8)   g114.8<1>:UB    g[a0 8]<VxH,1,0>:UB    { align1 WE_all 1Q };
mov(8)   g114.16<1>:UB   g[a0 16]<VxH,1,0>:UB   { align1 WE_all 1Q };
mov(8)   g114.24<1>:UB   g[a0 24]<VxH,1,0>:UB   { align1 WE_all 1Q };
mov(16)  g113<1>:UW      g114<8,8,1>:UW         { align1 WE_normal 1H };

Signed-off-by: Junyan He <junyan.he at linux.intel.com>
---
 backend/src/backend/gen_context.cpp        |  112 ++++++++++++++++++++++++++++
 backend/src/backend/gen_insn_selection.cpp |    9 +++
 backend/src/backend/gen_insn_selection.hxx |    1 +
 3 files changed, 122 insertions(+)

diff --git a/backend/src/backend/gen_context.cpp b/backend/src/backend/gen_context.cpp
index 6856510..46b4a06 100644
--- a/backend/src/backend/gen_context.cpp
+++ b/backend/src/backend/gen_context.cpp
@@ -297,6 +297,118 @@ namespace gbe
           p->MOV(dst.top_half(this->simdWidth), GenRegister::immud(0));
         break;
       }
+      case SEL_OP_BSWAP: {
+        uint32_t simd = p->curr.execWidth;
+        GBE_ASSERT(simd == 8 || simd == 16 || simd == 1);
+        uint16_t new_a0[16];
+        memset(new_a0, 0, sizeof(new_a0));
+
+        GBE_ASSERT(src.type == dst.type);
+        uint32_t start_addr = src.nr*32 + src.subnr;
+
+        if (simd == 1) {
+          GBE_ASSERT(src.hstride == GEN_HORIZONTAL_STRIDE_0
+              && dst.hstride == GEN_HORIZONTAL_STRIDE_0);
+          if (src.type == GEN_TYPE_UD || src.type == GEN_TYPE_D) {
+            GBE_ASSERT(start_addr >= 0);
+            new_a0[0] = start_addr + 3;
+            new_a0[1] = start_addr + 2;
+            new_a0[2] = start_addr + 1;
+            new_a0[3] = start_addr;
+            this->setA0Content(new_a0, 0, 4);
+
+            p->push();
+            p->curr.execWidth = 4;
+            p->curr.predicate = GEN_PREDICATE_NONE;
+            p->curr.noMask = 1;
+            GenRegister ind_src = GenRegister::to_indirect1xN(GenRegister::retype(src, GEN_TYPE_UB),
+                a0[0], new_a0[0] - a0[0]);
+            GenRegister dst_ = dst;
+            dst_.type = GEN_TYPE_UB;
+            dst_.hstride = GEN_HORIZONTAL_STRIDE_1;
+            dst_.width = GEN_WIDTH_4;
+            dst_.vstride = GEN_VERTICAL_STRIDE_4;
+            p->MOV(dst_, ind_src);
+            p->pop();
+          } else if (src.type == GEN_TYPE_UW || src.type == GEN_TYPE_W) {
+            p->MOV(GenRegister::retype(dst, GEN_TYPE_UB),
+                GenRegister::retype(GenRegister::offset(src, 0, 1), GEN_TYPE_UB));
+            p->MOV(GenRegister::retype(GenRegister::offset(dst, 0, 1), GEN_TYPE_UB),
+                GenRegister::retype(src, GEN_TYPE_UB));
+          } else {
+            GBE_ASSERT(0);
+          }
+        } else {
+          if (src.type == GEN_TYPE_UD || src.type == GEN_TYPE_D) {
+            GBE_ASSERT(src.subnr == 0);
+            GBE_ASSERT(dst.subnr == 0);
+            GBE_ASSERT(tmp.subnr == 0);
+            GBE_ASSERT(start_addr >= 0);
+            new_a0[0] = start_addr + 3;
+            new_a0[1] = start_addr + 2;
+            new_a0[2] = start_addr + 1;
+            new_a0[3] = start_addr;
+            new_a0[4] = start_addr + 7;
+            new_a0[5] = start_addr + 6;
+            new_a0[6] = start_addr + 5;
+            new_a0[7] = start_addr + 4;
+            this->setA0Content(new_a0, 56);
+
+            p->push();
+            p->curr.execWidth = 8;
+            p->curr.predicate = GEN_PREDICATE_NONE;
+            p->curr.noMask = 1;
+            GenRegister ind_src = GenRegister::to_indirect1xN(GenRegister::retype(src, GEN_TYPE_UB),
+                a0[0], new_a0[0] - a0[0]);
+            p->MOV(GenRegister::retype(tmp, GEN_TYPE_UB), ind_src);
+            for (int i = 1; i < 4; i++) {
+              ind_src.addr_imm += 8;
+              p->MOV(GenRegister::offset(GenRegister::retype(tmp, GEN_TYPE_UB), 0, 8*i), ind_src);
+            }
+            if (simd == 16) {
+              for (int i = 0; i < 4; i++) {
+                ind_src.addr_imm += 8;
+                p->MOV(GenRegister::offset(GenRegister::retype(tmp, GEN_TYPE_UB), 1, 8*i), ind_src);
+              }
+            }
+            p->pop();
+
+            p->MOV(dst, tmp);
+          } else if (src.type == GEN_TYPE_UW || src.type == GEN_TYPE_W) {
+            GBE_ASSERT(src.subnr == 0 || src.subnr == 16);
+            GBE_ASSERT(dst.subnr == 0 || dst.subnr == 16);
+            GBE_ASSERT(tmp.subnr == 0 || tmp.subnr == 16);
+            GBE_ASSERT(start_addr >= 0);
+            new_a0[0] = start_addr + 1;
+            new_a0[1] = start_addr;
+            new_a0[2] = start_addr + 3;
+            new_a0[3] = start_addr + 2;
+            new_a0[4] = start_addr + 5;
+            new_a0[5] = start_addr + 4;
+            new_a0[6] = start_addr + 7;
+            new_a0[7] = start_addr + 6;
+            this->setA0Content(new_a0, 56);
+
+            p->push();
+            p->curr.execWidth = 8;
+            p->curr.predicate = GEN_PREDICATE_NONE;
+            p->curr.noMask = 1;
+            GenRegister ind_src = GenRegister::to_indirect1xN(GenRegister::retype(src, GEN_TYPE_UB),
+                a0[0], new_a0[0] - a0[0]);
+            p->MOV(GenRegister::retype(tmp, GEN_TYPE_UB), ind_src);
+            for (int i = 1; i < (simd == 8 ? 2 : 4); i++) {
+              ind_src.addr_imm += 8;
+              p->MOV(GenRegister::offset(GenRegister::retype(tmp, GEN_TYPE_UB), 0, 8*i), ind_src);
+            }
+            p->pop();
+
+            p->MOV(dst, tmp);
+          } else {
+            GBE_ASSERT(0);
+          }
+        }
+      }
+      break;
       default:
         NOT_IMPLEMENTED;
     }
diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp
index d100f80..2b166b1 100644
--- a/backend/src/backend/gen_insn_selection.cpp
+++ b/backend/src/backend/gen_insn_selection.cpp
@@ -498,6 +498,7 @@ namespace gbe
     ALU1(RNDE)
     ALU1(F16TO32)
     ALU1(F32TO16)
+    ALU1WithTemp(BSWAP)
     ALU2(SEL)
     ALU2(SEL_INT64)
     ALU1(NOT)
@@ -2121,6 +2122,14 @@ namespace gbe
           case ir::OP_SQR: sel.MATH(dst, GEN_MATH_FUNCTION_SQRT, src); break;
           case ir::OP_RSQ: sel.MATH(dst, GEN_MATH_FUNCTION_RSQ, src); break;
           case ir::OP_RCP: sel.MATH(dst, GEN_MATH_FUNCTION_INV, src); break;
+          case ir::OP_BSWAP:
+            {
+              ir::Register tmp = sel.reg(getFamily(insnType));
+              const GenRegister src_ = GenRegister::retype(src, getGenType(insnType));
+              const GenRegister dst_ = GenRegister::retype(dst, getGenType(insnType));
+              sel.BSWAP(dst_, src_, sel.selReg(tmp, insnType));
+              break;
+            }
           case ir::OP_SIMD_ANY:
             {
               const GenRegister constZero = GenRegister::immuw(0);;
diff --git a/backend/src/backend/gen_insn_selection.hxx b/backend/src/backend/gen_insn_selection.hxx
index be1f7ec..09f5aaf 100644
--- a/backend/src/backend/gen_insn_selection.hxx
+++ b/backend/src/backend/gen_insn_selection.hxx
@@ -1,5 +1,6 @@
 DECL_SELECTION_IR(LABEL, LabelInstruction)
 DECL_SELECTION_IR(MOV, UnaryInstruction)
+DECL_SELECTION_IR(BSWAP, UnaryWithTempInstruction)
 DECL_SELECTION_IR(MOV_DF, UnaryWithTempInstruction)
 DECL_SELECTION_IR(LOAD_DF_IMM, UnaryWithTempInstruction)
 DECL_SELECTION_IR(LOAD_INT64_IMM, UnaryInstruction)
-- 
1.7.9.5


From junyan.he at inbox.com  Thu Mar  5 23:24:07 2015
From: junyan.he at inbox.com (junyan.he at inbox.com)
Date: Fri,  6 Mar 2015 15:24:07 +0800
Subject: [Beignet] [V2 PATCH 6/7] Backend: Delete bswap logic in the
	llvm_to_gen stage.
Message-ID: <1425626647-25874-1-git-send-email-junyan.he@inbox.com>

From: Junyan He <junyan.he at linux.intel.com>

Delete the bswap lowering logic from the llvm_to_gen stage
and add a BSWAP IR instruction instead. The bswap is now
handled in the backend as a special instruction.

Signed-off-by: Junyan He <junyan.he at linux.intel.com>
---
 backend/src/ir/instruction.hpp        |    2 +
 backend/src/ir/instruction.hxx        |    1 +
 backend/src/llvm/llvm_gen_backend.cpp |   85 +--------------------------------
 3 files changed, 5 insertions(+), 83 deletions(-)

diff --git a/backend/src/ir/instruction.hpp b/backend/src/ir/instruction.hpp
index 6963111..24d27aa 100644
--- a/backend/src/ir/instruction.hpp
+++ b/backend/src/ir/instruction.hpp
@@ -620,6 +620,8 @@ namespace ir {
   Instruction RNDU(Type type, Register dst, Register src);
   /*! rndz.type dst src */
   Instruction RNDZ(Type type, Register dst, Register src);
+  /*! bswap.type dst src */
+  Instruction BSWAP(Type type, Register dst, Register src);
   /*! pow.type dst src0 src1 */
   Instruction POW(Type type, Register dst, Register src0, Register src1);
   /*! mul.type dst src0 src1 */
diff --git a/backend/src/ir/instruction.hxx b/backend/src/ir/instruction.hxx
index b52673e..de4abfb 100644
--- a/backend/src/ir/instruction.hxx
+++ b/backend/src/ir/instruction.hxx
@@ -40,6 +40,7 @@ DECL_INSN(RNDU, UnaryInstruction)
 DECL_INSN(RNDZ, UnaryInstruction)
 DECL_INSN(SIMD_ANY, UnaryInstruction)
 DECL_INSN(SIMD_ALL, UnaryInstruction)
+DECL_INSN(BSWAP, UnaryInstruction)
 DECL_INSN(POW, BinaryInstruction)
 DECL_INSN(MUL, BinaryInstruction)
 DECL_INSN(ADD, BinaryInstruction)
diff --git a/backend/src/llvm/llvm_gen_backend.cpp b/backend/src/llvm/llvm_gen_backend.cpp
index aad638f..74c80ee 100644
--- a/backend/src/llvm/llvm_gen_backend.cpp
+++ b/backend/src/llvm/llvm_gen_backend.cpp
@@ -2943,89 +2943,6 @@ namespace gbe
           case Intrinsic::umul_with_overflow:
           NOT_IMPLEMENTED;
           break;
-          case Intrinsic::bswap:
-          {
-            // FIXME, this is an unoptimized version, could be optimized by
-            // leveraging GEN's register region/indirect address feature.
-            Type *llvmDstType = I.getType();
-            uint32_t elementSize = getTypeByteSize(unit, llvmDstType);
-
-            const ir::Register dst0  = this->getRegister(&I);
-            const ir::Register src0 = this->getRegister(I.getOperand(0));
-            switch(elementSize)
-            {
-              case 2:
-                {
-                  ir::Type srcType = getUnsignedType(ctx, llvmDstType);
-                  ir::Register tmp1 = ctx.reg(getFamily(srcType));
-                  ir::Register tmp2 = ctx.reg(getFamily(srcType));
-
-                  ir::Register regWMask = ctx.reg( ir::FAMILY_WORD );
-                  const ir::ImmediateIndex wMask = ctx.newIntegerImmediate(0x00FF, ir::TYPE_S16);
-                  ir::Register regShift = ctx.reg( ir::FAMILY_WORD );
-                  const ir::ImmediateIndex shift = ctx.newIntegerImmediate(8, ir::TYPE_S16);
-
-                  ctx.LOADI(ir::TYPE_S16, regWMask, wMask);
-                  ctx.AND(srcType, tmp1, src0, regWMask);
-
-                  ctx.LOADI(ir::TYPE_S16, regShift, shift);
-                  ctx.SHL(srcType, tmp2, tmp1, regShift);
-
-                  ir::Register tmp3 = ctx.reg( getFamily(srcType) );
-                  ctx.SHR(srcType, tmp3, src0, regShift);
-
-                  ctx.OR(srcType, dst0, tmp2, tmp3);
-                }
-                break;
-              case 4:
-                {
-                  ir::Type srcType = getType(ctx, llvmDstType);
-                  ir::Register tmp1 = ctx.reg(getFamily(srcType));
-                  ir::Register tmp2 = ctx.reg(getFamily(srcType));
-                  ir::Register tmp3 = ctx.reg(getFamily(srcType));
-                  ir::Register tmp4 = ctx.reg(getFamily(srcType));
-                  ir::Register tmp5 = ctx.reg(getFamily(srcType));
-                  ir::Register tmp6 = ctx.reg(getFamily(srcType));
-
-                  ir::Register regDWMask = ctx.reg( ir::FAMILY_DWORD );
-                  ir::Register regShift_8 = ctx.reg( ir::FAMILY_DWORD );
-                  ir::Register regShift_24 = ctx.reg( ir::FAMILY_DWORD );
-                  ir::ImmediateIndex wMask_L = ctx.newIntegerImmediate(0x0000FF00, ir::TYPE_S32);
-                  ir::ImmediateIndex wMask_H = ctx.newIntegerImmediate(0x00FF0000, ir::TYPE_S32);
-                  ir::ImmediateIndex shift_8 = ctx.newIntegerImmediate(8, ir::TYPE_S32);
-                  ir::ImmediateIndex shift_24 = ctx.newIntegerImmediate(24, ir::TYPE_S32);
-
-                  ctx.LOADI(ir::TYPE_S32, regShift_24, shift_24);
-                  ctx.SHL(srcType, tmp1, src0, regShift_24);
-
-                  ctx.LOADI(ir::TYPE_S32, regDWMask, wMask_L);
-                  ctx.AND(srcType, tmp2, src0, regDWMask);
-                  ctx.LOADI(ir::TYPE_S32, regShift_8, shift_8);
-                  ctx.SHL(srcType, tmp3, tmp2, regShift_8);
-
-                  ctx.LOADI(ir::TYPE_S32, regDWMask, wMask_H);
-                  ctx.AND(srcType, tmp4, src0, regDWMask);
-                  ctx.LOADI(ir::TYPE_S32, regShift_8, shift_8);
-                  ctx.SHR(makeTypeUnsigned(srcType), tmp5, tmp4, regShift_8);
-
-                  ctx.LOADI(ir::TYPE_S32, regShift_24, shift_24);
-                  ctx.SHR(makeTypeUnsigned(srcType), tmp6, src0, regShift_24);
-
-                  ir::Register tmp7 = ctx.reg(getFamily(srcType));
-                  ir::Register tmp8 = ctx.reg(getFamily(srcType));
-                  ctx.OR(srcType, tmp7, tmp1, tmp3);
-                  ctx.OR(srcType, tmp8, tmp5, tmp6);
-                  ctx.OR(srcType, dst0, tmp7, tmp8);
-                }
-                break;
-              case 8:
-                NOT_IMPLEMENTED;
-                break;
-              default:
-                GBE_ASSERT(0);
-            }
-          }
-          break;
           case Intrinsic::ctlz:
           {
             Type *llvmDstType = I.getType();
@@ -3085,6 +3002,8 @@ namespace gbe
           case Intrinsic::cos: this->emitUnaryCallInst(I,CS,ir::OP_COS); break;
           case Intrinsic::log2: this->emitUnaryCallInst(I,CS,ir::OP_LOG); break;
           case Intrinsic::exp2: this->emitUnaryCallInst(I,CS,ir::OP_EXP); break;
+          case Intrinsic::bswap:
+            this->emitUnaryCallInst(I,CS,ir::OP_BSWAP, getUnsignedType(ctx, I.getType())); break;
           default: NOT_IMPLEMENTED;
         }
       } else {
-- 
1.7.9.5


From junyan.he at inbox.com  Thu Mar  5 23:24:13 2015
From: junyan.he at inbox.com (junyan.he at inbox.com)
Date: Fri,  6 Mar 2015 15:24:13 +0800
Subject: [Beignet] [V2 PATCH 7/7] utest: Update the test case for bswap.
Message-ID: <1425626653-25908-1-git-send-email-junyan.he@inbox.com>

From: Junyan He <junyan.he at linux.intel.com>

Signed-off-by: Junyan He <junyan.he at linux.intel.com>
---
 kernels/compiler_bswap.cl |   24 +++---
 utests/compiler_bswap.cpp |  203 +++++++++++++++++++++++++++++++--------------
 2 files changed, 156 insertions(+), 71 deletions(-)

diff --git a/kernels/compiler_bswap.cl b/kernels/compiler_bswap.cl
index 97313b1..3a0a373 100644
--- a/kernels/compiler_bswap.cl
+++ b/kernels/compiler_bswap.cl
@@ -1,13 +1,17 @@
-#define TEST_TYPE(TYPE, LENGTH)                                       \
-kernel void compiler_bswap_##TYPE(global TYPE * src, global TYPE * dst){ \
-   dst[get_global_id(0)]= __builtin_bswap##LENGTH(src[get_global_id(0)]); \
-   dst[get_global_id(0)]= __builtin_bswap##LENGTH(dst[get_global_id(0)] -1 ); \
-}
+kernel void compiler_bswap(global uint * src0, global uint * dst0, global ushort * src1, global ushort * dst1,
+    int src2, global int * dst2,  short src3, global short * dst3) {
+  if (get_global_id(0) % 2 == 0) {
+    dst0[get_global_id(0)] = __builtin_bswap32(src0[get_global_id(0)]);
+  } else {
+    dst0[get_global_id(0)] = src0[get_global_id(0)];
+  }
 
+  dst1[get_global_id(0)] = __builtin_bswap16(src1[get_global_id(0)]);
+  if (get_global_id(0) % 2 == 1) {
+    dst1[get_global_id(0)] = __builtin_bswap16(dst1[get_global_id(0)] + 1);
+  }
 
-TEST_TYPE(short, 16)
-TEST_TYPE(ushort, 16)
-TEST_TYPE(int, 32)
-TEST_TYPE(uint, 32)
+  dst2[get_global_id(0)] = __builtin_bswap32(src2);
+  dst3[get_global_id(0)] = __builtin_bswap16(src3);
+}
 
-#undef TEST_TYPE
diff --git a/utests/compiler_bswap.cpp b/utests/compiler_bswap.cpp
index 9475b99..3af9ef5 100644
--- a/utests/compiler_bswap.cpp
+++ b/utests/compiler_bswap.cpp
@@ -1,7 +1,6 @@
 #include "utest_helper.hpp"
 #include "string.h"
 
-namespace {
 #define cpu_htons(A)     ((((uint16_t)(A) & 0xff00) >> 8) | \
     (((uint16_t)(A) & 0x00ff) << 8))
 #define cpu_htonl(A)     ((((uint32_t)(A) & 0xff000000) >> 24) | \
@@ -9,108 +8,190 @@ namespace {
     (((uint32_t)(A) & 0x0000ff00) << 8) | \
     (((uint32_t)(A) & 0x000000ff) << 24))
 
+
+template <typename T> static void gen_rand_val(T & val)
+{
+  val = static_cast<T>(rand());//(0xAABBCCDD);//
+}
+
 template <typename T> static void cpu(int global_id, T *src, T *dst)
 {
-    T f = src[global_id];
-    T g = 0;
-    if(sizeof(T) == sizeof(int16_t))
-      g = cpu_htons(f);
-    else if(sizeof(T) == sizeof(int32_t))
-      g = cpu_htonl(f);
-    dst[global_id] = g;
+  T f = src[global_id];
+  T g = 0;
+  if (sizeof(T) == sizeof(int16_t))
+    g = cpu_htons(f);
+  else if (sizeof(T) == sizeof(int32_t))
+    g = cpu_htonl(f);
+  dst[global_id] = g;
 }
 
-template <typename T> static void gen_rand_val (T & val)
+template <typename T> static void cpu(int global_id, T src, T *dst)
 {
-    val = static_cast<T>(rand() );
+  T f = src;
+  T g = 0;
+  if (sizeof(T) == sizeof(int16_t))
+    g = cpu_htons(f);
+  else if (sizeof(T) == sizeof(int32_t))
+    g = cpu_htonl(f);
+  dst[global_id] = g;
 }
 
-template <typename T>
-inline static void print_data (T& val)
+template <typename T> inline static void print_data(T& val)
 {
-    if(sizeof(T) == sizeof(uint16_t))
-        printf(" %hx", val);
-    else
-        printf(" %x", val);
+  if(sizeof(T) == sizeof(uint16_t))
+    printf(" 0x%hx", val);
+  else
+    printf(" 0x%x", val);
 }
 
-template <typename T> static void dump_data (T* src, T* dst, int n)
+template <typename T> static void dump_data(T* raw, T* cpu, T* gpu, int n)
 {
-    printf("\nRaw: \n");
-    for (int32_t i = 0; i < (int32_t) n; ++i) {
-        print_data(((T *)buf_data[0])[i]);
-    }
+  printf("\nRaw: \n");
+  for (int32_t i = 0; i < (int32_t) n; ++i) {
+    print_data(raw[i]);
+  }
 
-    printf("\nCPU: \n");
-    for (int32_t i = 0; i < (int32_t) n; ++i) {
-        print_data(dst[i]);
-    }
-    printf("\nGPU: \n");
-    for (int32_t i = 0; i < (int32_t) n; ++i) {
-        print_data(((T *)buf_data[1])[i]);
-    }
+  printf("\nCPU: \n");
+  for (int32_t i = 0; i < (int32_t) n; ++i) {
+    print_data(cpu[i]);
+  }
+  printf("\nGPU: \n");
+  for (int32_t i = 0; i < (int32_t) n; ++i) {
+    print_data(gpu[i]);
+  }
 }
 
-template<typename T>
-void test(const char *kernel_name)
+template <typename T> static void dump_data(T raw, T* cpu, T* gpu, int n)
 {
-  const size_t n = 64;
-  T cpu_dst[n];
-  T cpu_src[n];
+  printf("\nRaw: \n");
+  print_data(raw);
+
+  printf("\nCPU: \n");
+  for (int32_t i = 0; i < (int32_t) n; ++i) {
+    print_data(cpu[i]);
+  }
+  printf("\nGPU: \n");
+  for (int32_t i = 0; i < (int32_t) n; ++i) {
+    print_data(gpu[i]);
+  }
+}
+
+void compiler_bswap(void)
+{
+  const size_t n = 32;
+  uint32_t src0[n];
+  uint16_t src1[n];
+  uint32_t dst0[n];
+  uint16_t dst1[n];
+  int32_t src2 = static_cast<int32_t>(rand());
+  int32_t dst2[n];
+  int16_t src3 = static_cast<int16_t>(rand());
+  int16_t dst3[n];
 
   // Setup kernel and buffers
-  OCL_CREATE_KERNEL_FROM_FILE("compiler_bswap", kernel_name);
-  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(T), NULL);
-  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(T), NULL);
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_bswap", "compiler_bswap");
+  OCL_CREATE_BUFFER(buf[0], 0, sizeof(src0), NULL);
   OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_CREATE_BUFFER(buf[1], 0, sizeof(dst0), NULL);
   OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
 
+  OCL_CREATE_BUFFER(buf[2], 0, sizeof(src1), NULL);
+  OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+  OCL_CREATE_BUFFER(buf[3], 0, sizeof(dst1), NULL);
+  OCL_SET_ARG(3, sizeof(cl_mem), &buf[3]);
+
+  OCL_SET_ARG(4, sizeof(int32_t), &src2);
+  OCL_CREATE_BUFFER(buf[4], 0, sizeof(dst2), NULL);
+  OCL_SET_ARG(5, sizeof(cl_mem), &buf[4]);
+
+  OCL_SET_ARG(6, sizeof(int16_t), &src3);
+  OCL_CREATE_BUFFER(buf[5], 0, sizeof(dst3), NULL);
+  OCL_SET_ARG(7, sizeof(cl_mem), &buf[5]);
+
   OCL_MAP_BUFFER(0);
   for (int32_t i = 0; i < (int32_t) n; ++i) {
-    gen_rand_val(cpu_src[i]);
+    gen_rand_val(src0[i]);
   }
-
-  memcpy(buf_data[0], cpu_src, sizeof(T) * n);
+  memcpy(buf_data[0], src0, sizeof(src0));
+  OCL_UNMAP_BUFFER(0);
 
   /* Clear the dst buffer to avoid random data. */
   OCL_MAP_BUFFER(1);
-  memset(buf_data[1], 0, sizeof(T) * n);
+  memset(buf_data[1], 0, sizeof(dst0));
   OCL_UNMAP_BUFFER(1);
 
+  OCL_MAP_BUFFER(2);
+  for (int32_t i = 0; i < (int32_t) n; ++i) {
+    gen_rand_val(src1[i]);
+  }
+  memcpy(buf_data[2], src1, sizeof(src1));
+  OCL_UNMAP_BUFFER(2);
+
+  /* Clear the dst buffer to avoid random data. */
+  OCL_MAP_BUFFER(3);
+  memset(buf_data[3], 0, sizeof(dst1));
+  OCL_UNMAP_BUFFER(3);
+
+  /* Clear the dst buffer to avoid random data. */
+  OCL_MAP_BUFFER(4);
+  memset(buf_data[4], 0, sizeof(dst2));
+  OCL_UNMAP_BUFFER(4);
+
+  /* Clear the dst buffer to avoid random data. */
+  OCL_MAP_BUFFER(5);
+  memset(buf_data[5], 0, sizeof(dst3));
+  OCL_UNMAP_BUFFER(5);
+
   globals[0] = n;
   locals[0] = 16;
   OCL_NDRANGE(1);
 
   // Run on CPU
-  for (int32_t i = 0; i < (int32_t) n; ++i)
-    cpu(i, cpu_src, cpu_dst);
+  for (int32_t i = 0; i < (int32_t) n; ++i) {
+    if (i%2) {
+      dst0[i] = src0[i];
+      continue;
+    }
+    cpu(i, src0, dst0);
+  }
+
+  // Run on CPU
+  for (int32_t i = 0; i < (int32_t) n; ++i) {
+    cpu(i, src1, dst1);
+
+    if (i%2) {
+      dst1[i] = dst1[i] + 1;
+      cpu(i, dst1, dst1);
+    }
+  }
 
+  // Run on CPU
   for (int32_t i = 0; i < (int32_t) n; ++i)
-    cpu_dst[i] = cpu_dst[i] -1;
+    cpu(i, src2, dst2);
 
   // Run on CPU
   for (int32_t i = 0; i < (int32_t) n; ++i)
-    cpu(i, cpu_dst, cpu_dst);
+    cpu(i, src3, dst3);
 
   OCL_MAP_BUFFER(1);
- // dump_data(cpu_src, cpu_dst, n);
+  //dump_data(src0, dst0, (uint32_t *)buf_data[1], n);
+  OCL_ASSERT(!memcmp(buf_data[1], dst0, sizeof(dst0)));
+  OCL_UNMAP_BUFFER(1);
 
-  OCL_ASSERT(!memcmp(buf_data[1], cpu_dst, sizeof(T) * n));
+  OCL_MAP_BUFFER(3);
+  //dump_data(src1, dst1, (uint16_t *)buf_data[3], n);
+  OCL_ASSERT(!memcmp(buf_data[3], dst1, sizeof(dst1)));
+  OCL_UNMAP_BUFFER(3);
 
-  OCL_UNMAP_BUFFER(1);
-  OCL_UNMAP_BUFFER(0);
-}
+  OCL_MAP_BUFFER(4);
+  //dump_data(src2, dst2, (int32_t *)buf_data[4], n);
+  OCL_ASSERT(!memcmp(buf_data[4], dst2, sizeof(dst2)));
+  OCL_UNMAP_BUFFER(4);
 
+  OCL_MAP_BUFFER(5);
+  //dump_data(src3, dst3, (int16_t *)buf_data[5], n);
+  OCL_ASSERT(!memcmp(buf_data[5], dst3, sizeof(dst3)));
+  OCL_UNMAP_BUFFER(5);
 }
 
-#define compiler_bswap(type, kernel) \
-static void compiler_bswap_ ##type(void)\
-{\
-  test<type>(# kernel);\
-}\
-MAKE_UTEST_FROM_FUNCTION(compiler_bswap_ ## type);
-
-compiler_bswap(int16_t, compiler_bswap_short)
-compiler_bswap(uint16_t, compiler_bswap_ushort)
-compiler_bswap(int32_t, compiler_bswap_int)
-compiler_bswap(uint32_t, compiler_bswap_uint)
+MAKE_UTEST_FROM_FUNCTION(compiler_bswap);
-- 
1.7.9.5


From chris at chris-wilson.co.uk  Fri Mar  6 00:39:19 2015
From: chris at chris-wilson.co.uk (Chris Wilson)
Date: Fri, 6 Mar 2015 08:39:19 +0000
Subject: [Beignet] [Intel-gfx] Preventing zero GPU virtual address
 allocation
In-Reply-To: <DF876E69000F0E4DB19B760E3EBA5C7501C5F9C1@SHSMSX103.ccr.corp.intel.com>
References: <148B1B7A67D1C24B9EF0BE42EA4977062B7DCBFB@SHSMSX101.ccr.corp.intel.com>
 <20150305125251.GA18775@phenom.ffwll.local>
 <20150305130121.GA18784@nuc-i3427.alporthouse.com>
 <20150305152758.GI18775@phenom.ffwll.local>
 <DF876E69000F0E4DB19B760E3EBA5C7501C5F9C1@SHSMSX103.ccr.corp.intel.com>
Message-ID: <20150306083919.GH18784@nuc-i3427.alporthouse.com>

On Fri, Mar 06, 2015 at 02:11:18AM +0000, Zou, Nanhai wrote:
> I don't understand why we need a complex solution when there is already a simple solution with patch.
> What is the drawback of reserving page 0?
> Before we going to that complex solution, could we just reserve page zero?
> It is simple and straight forward.

Because it is a nonsense ABI constraint. If you want the equivalent
of MAP_FIXED, we should give you MAP_FIXED.
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre

From jeff.mcgee at intel.com  Fri Mar  6 10:44:18 2015
From: jeff.mcgee at intel.com (Jeff McGee)
Date: Fri, 6 Mar 2015 10:44:18 -0800
Subject: [Beignet] [PATCH] drm/i915: Export total subslice and EU counts
In-Reply-To: <20150305043555.GA20578@ivb-gt2-rev4>
References: <1425339452-18875-1-git-send-email-jeff.mcgee@intel.com>
 <20150305043555.GA20578@ivb-gt2-rev4>
Message-ID: <20150306184418.GC3263@jeffdesk>

On Thu, Mar 05, 2015 at 12:35:55PM +0800, Zhigang Gong wrote:
> There is one minor conflict when apply the KMD patch to latest
> drm-intel-nightly branch. It should be easy to fix.
> 
> Another issue is that IMO, we should bump libdrm's version number
> when increase these new APIs. Then in Beignet, we can check the
> libdrm version at build time and determine whether we will use
> these new interfaces. Thus, we can avoid breaking beignet on
> those systems which have previous libdrm/kernel installed.
> 
Right. I can append a libdrm patch to bump the version. And then I
suppose I will follow the process to make a new release. Not sure
right now how that works. First time going through it.

Also, how should we test for the libdrm version and conditionally
use the API? Is there a previous example of this in Beignet that I
could follow?

Jeff

> The other parts of the whole patchset,
> including patches for KMD/libdrm/Intel gpu tools and Beignet,
> all look good to me.
> 
> And I just tested it on BDW and SKL platforms, it works fine.
> 
> Thanks,
> Zhigang Gong.
> 
> On Mon, Mar 02, 2015 at 03:37:32PM -0800, jeff.mcgee at intel.com wrote:
> > From: Jeff McGee <jeff.mcgee at intel.com>
> > 
> > Setup new I915_GETPARAM ioctl entries for subslice total and
> > EU total. Userspace drivers need these values when constructing
> > GPGPU commands. This kernel query method is intended to replace
> > the PCI ID-based tables that userspace drivers currently maintain.
> > The kernel driver can employ fuse register reads as needed to
> > ensure the most accurate determination of GT config attributes.
> > This first became important with Cherryview in which the config
> > could differ between devices with the same PCI ID.
> > 
> > The kernel detection of these values is device-specific and not
> > included in this patch. Because zero is not a valid value for any of
> > these parameters, a value of zero is interpreted as unknown for the
> > device. Userspace drivers should continue to maintain ID-based tables
> > for older devices not supported by the new query method.
> > 
> > For: VIZ-4636
> > Signed-off-by: Jeff McGee <jeff.mcgee at intel.com>
> > ---
> >  drivers/gpu/drm/i915/i915_dma.c | 10 ++++++++++
> >  include/uapi/drm/i915_drm.h     |  2 ++
> >  2 files changed, 12 insertions(+)
> > 
> > diff --git a/drivers/gpu/drm/i915/i915_dma.c b/drivers/gpu/drm/i915/i915_dma.c
> > index 053e178..9350ea2 100644
> > --- a/drivers/gpu/drm/i915/i915_dma.c
> > +++ b/drivers/gpu/drm/i915/i915_dma.c
> > @@ -150,6 +150,16 @@ static int i915_getparam(struct drm_device *dev, void *data,
> >  	case I915_PARAM_MMAP_VERSION:
> >  		value = 1;
> >  		break;
> > +	case I915_PARAM_SUBSLICE_TOTAL:
> > +		value = INTEL_INFO(dev)->subslice_total;
> > +		if (!value)
> > +			return -ENODEV;
> > +		break;
> > +	case I915_PARAM_EU_TOTAL:
> > +		value = INTEL_INFO(dev)->eu_total;
> > +		if (!value)
> > +			return -ENODEV;
> > +		break;
> >  	default:
> >  		DRM_DEBUG("Unknown parameter %d\n", param->param);
> >  		return -EINVAL;
> > diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
> > index 6eed16b..8672efc 100644
> > --- a/include/uapi/drm/i915_drm.h
> > +++ b/include/uapi/drm/i915_drm.h
> > @@ -347,6 +347,8 @@ typedef struct drm_i915_irq_wait {
> >  #define I915_PARAM_HAS_COHERENT_PHYS_GTT 29
> >  #define I915_PARAM_MMAP_VERSION          30
> >  #define I915_PARAM_HAS_BSD2		 31
> > +#define I915_PARAM_SUBSLICE_TOTAL	 32
> > +#define I915_PARAM_EU_TOTAL		 33
> >  
> >  typedef struct drm_i915_getparam {
> >  	int param;
> > -- 
> > 2.3.0
> > 
> > _______________________________________________
> > Beignet mailing list
> > Beignet at lists.freedesktop.org
> > http://lists.freedesktop.org/mailman/listinfo/beignet
> _______________________________________________
> Beignet mailing list
> Beignet at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/beignet

From jeff.mcgee at intel.com  Fri Mar  6 11:23:45 2015
From: jeff.mcgee at intel.com (Jeff McGee)
Date: Fri, 6 Mar 2015 11:23:45 -0800
Subject: [Beignet] [PATCH] drm/i915: Export total subslice and EU counts
In-Reply-To: <20150305043555.GA20578@ivb-gt2-rev4>
References: <1425339452-18875-1-git-send-email-jeff.mcgee@intel.com>
 <20150305043555.GA20578@ivb-gt2-rev4>
Message-ID: <20150306192344.GD3263@jeffdesk>

On Thu, Mar 05, 2015 at 12:35:55PM +0800, Zhigang Gong wrote:
> There is one minor conflict when apply the KMD patch to latest
> drm-intel-nightly branch. It should be easy to fix.
> 
> Another issue is that IMO, we should bump libdrm's version number
> when increase these new APIs. Then in Beignet, we can check the
> libdrm version at build time and determine whether we will use
> these new interfaces. Thus, we can avoid breaking beignet on
> those systems which have previous libdrm/kernel installed.
> 
> The other parts of the whole patchset,
> including patches for KMD/libdrm/Intel gpu tools and Beignet,
> all look good to me.
> 
> And I just tested it on BDW and SKL platforms, it works fine.
> 

Can you add your Reviewed-by tag to at least the Beignet patches?
I think Daniel wants to see that before moving forward with the
rest. Thanks

Jeff

> Thanks,
> Zhigang Gong.
> 
> On Mon, Mar 02, 2015 at 03:37:32PM -0800, jeff.mcgee at intel.com wrote:
> > From: Jeff McGee <jeff.mcgee at intel.com>
> > 
> > Setup new I915_GETPARAM ioctl entries for subslice total and
> > EU total. Userspace drivers need these values when constructing
> > GPGPU commands. This kernel query method is intended to replace
> > the PCI ID-based tables that userspace drivers currently maintain.
> > The kernel driver can employ fuse register reads as needed to
> > ensure the most accurate determination of GT config attributes.
> > This first became important with Cherryview in which the config
> > could differ between devices with the same PCI ID.
> > 
> > The kernel detection of these values is device-specific and not
> > included in this patch. Because zero is not a valid value for any of
> > these parameters, a value of zero is interpreted as unknown for the
> > device. Userspace drivers should continue to maintain ID-based tables
> > for older devices not supported by the new query method.
> > 
> > For: VIZ-4636
> > Signed-off-by: Jeff McGee <jeff.mcgee at intel.com>
> > ---
> >  drivers/gpu/drm/i915/i915_dma.c | 10 ++++++++++
> >  include/uapi/drm/i915_drm.h     |  2 ++
> >  2 files changed, 12 insertions(+)
> > 
> > diff --git a/drivers/gpu/drm/i915/i915_dma.c b/drivers/gpu/drm/i915/i915_dma.c
> > index 053e178..9350ea2 100644
> > --- a/drivers/gpu/drm/i915/i915_dma.c
> > +++ b/drivers/gpu/drm/i915/i915_dma.c
> > @@ -150,6 +150,16 @@ static int i915_getparam(struct drm_device *dev, void *data,
> >  	case I915_PARAM_MMAP_VERSION:
> >  		value = 1;
> >  		break;
> > +	case I915_PARAM_SUBSLICE_TOTAL:
> > +		value = INTEL_INFO(dev)->subslice_total;
> > +		if (!value)
> > +			return -ENODEV;
> > +		break;
> > +	case I915_PARAM_EU_TOTAL:
> > +		value = INTEL_INFO(dev)->eu_total;
> > +		if (!value)
> > +			return -ENODEV;
> > +		break;
> >  	default:
> >  		DRM_DEBUG("Unknown parameter %d\n", param->param);
> >  		return -EINVAL;
> > diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
> > index 6eed16b..8672efc 100644
> > --- a/include/uapi/drm/i915_drm.h
> > +++ b/include/uapi/drm/i915_drm.h
> > @@ -347,6 +347,8 @@ typedef struct drm_i915_irq_wait {
> >  #define I915_PARAM_HAS_COHERENT_PHYS_GTT 29
> >  #define I915_PARAM_MMAP_VERSION          30
> >  #define I915_PARAM_HAS_BSD2		 31
> > +#define I915_PARAM_SUBSLICE_TOTAL	 32
> > +#define I915_PARAM_EU_TOTAL		 33
> >  
> >  typedef struct drm_i915_getparam {
> >  	int param;
> > -- 
> > 2.3.0
> > 
> > _______________________________________________
> > Beignet mailing list
> > Beignet at lists.freedesktop.org
> > http://lists.freedesktop.org/mailman/listinfo/beignet
> _______________________________________________
> Beignet mailing list
> Beignet at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/beignet

From zhigang.gong at linux.intel.com  Sun Mar  8 17:10:06 2015
From: zhigang.gong at linux.intel.com (Zhigang Gong)
Date: Mon, 9 Mar 2015 08:10:06 +0800
Subject: [Beignet] [PATCH] drm/i915: Export total subslice and EU counts
In-Reply-To: <20150306184418.GC3263@jeffdesk>
References: <1425339452-18875-1-git-send-email-jeff.mcgee@intel.com>
 <20150305043555.GA20578@ivb-gt2-rev4> <20150306184418.GC3263@jeffdesk>
Message-ID: <016001d059fd$66ef9680$34cec380$@linux.intel.com>

> -----Original Message-----
> From: Beignet [mailto:beignet-bounces at lists.freedesktop.org] On Behalf Of
> Jeff McGee
> Sent: Saturday, March 7, 2015 2:44 AM
> To: Zhigang Gong
> Cc: daniel at ffwll.ch; intel-gfx at lists.freedesktop.org;
> beignet at lists.freedesktop.org; dri-devel at lists.freedesktop.org
> Subject: Re: [Beignet] [PATCH] drm/i915: Export total subslice and EU counts
> 
> On Thu, Mar 05, 2015 at 12:35:55PM +0800, Zhigang Gong wrote:
> > There is one minor conflict when apply the KMD patch to latest
> > drm-intel-nightly branch. It should be easy to fix.
> >
> > Another issue is that IMO, we should bump libdrm's version number when
> > increase these new APIs. Then in Beignet, we can check the libdrm
> > version at build time and determine whether we will use these new
> > interfaces. Thus, we can avoid breaking beignet on those systems which
> > have previous libdrm/kernel installed.
> >
> Right. I can append a libdrm patch to bump the version. And then I suppose I
> will follow the process to make a new release. Not sure right now how that
> works. First time going through it.
> 
> Also, how should we test for the libdrm version and conditionally use the API?
We can check the libdrm version at configuration time and define a macro to
indicate whether we can use these new APIs in beignet.
> Is there a previous example of this in Beignet that I could follow?
Yes, one example is userptr. You can check the usage of DRM_INTEL_USERPTR and HAS_USERPTR
in Beignet.

Thanks,
Zhigang Gong.

> 
> Jeff
> 
> > The other parts of the whole patchset, including patches for
> > KMD/libdrm/Intel gpu tools and Beignet, all look good to me.
> >
> > And I just tested it on BDW and SKL platforms, it works fine.
> >
> > Thanks,
> > Zhigang Gong.
> >
> > On Mon, Mar 02, 2015 at 03:37:32PM -0800, jeff.mcgee at intel.com wrote:
> > > From: Jeff McGee <jeff.mcgee at intel.com>
> > >
> > > Setup new I915_GETPARAM ioctl entries for subslice total and EU
> > > total. Userspace drivers need these values when constructing GPGPU
> > > commands. This kernel query method is intended to replace the PCI
> > > ID-based tables that userspace drivers currently maintain.
> > > The kernel driver can employ fuse register reads as needed to ensure
> > > the most accurate determination of GT config attributes.
> > > This first became important with Cherryview in which the config
> > > could differ between devices with the same PCI ID.
> > >
> > > The kernel detection of these values is device-specific and not
> > > included in this patch. Because zero is not a valid value for any of
> > > these parameters, a value of zero is interpreted as unknown for the
> > > device. Userspace drivers should continue to maintain ID-based
> > > tables for older devices not supported by the new query method.
> > >
> > > For: VIZ-4636
> > > Signed-off-by: Jeff McGee <jeff.mcgee at intel.com>
> > > ---
> > >  drivers/gpu/drm/i915/i915_dma.c | 10 ++++++++++
> > >  include/uapi/drm/i915_drm.h     |  2 ++
> > >  2 files changed, 12 insertions(+)
> > >
> > > diff --git a/drivers/gpu/drm/i915/i915_dma.c
> > > b/drivers/gpu/drm/i915/i915_dma.c index 053e178..9350ea2 100644
> > > --- a/drivers/gpu/drm/i915/i915_dma.c
> > > +++ b/drivers/gpu/drm/i915/i915_dma.c
> > > @@ -150,6 +150,16 @@ static int i915_getparam(struct drm_device *dev,
> void *data,
> > >  	case I915_PARAM_MMAP_VERSION:
> > >  		value = 1;
> > >  		break;
> > > +	case I915_PARAM_SUBSLICE_TOTAL:
> > > +		value = INTEL_INFO(dev)->subslice_total;
> > > +		if (!value)
> > > +			return -ENODEV;
> > > +		break;
> > > +	case I915_PARAM_EU_TOTAL:
> > > +		value = INTEL_INFO(dev)->eu_total;
> > > +		if (!value)
> > > +			return -ENODEV;
> > > +		break;
> > >  	default:
> > >  		DRM_DEBUG("Unknown parameter %d\n", param->param);
> > >  		return -EINVAL;
> > > diff --git a/include/uapi/drm/i915_drm.h
> > > b/include/uapi/drm/i915_drm.h index 6eed16b..8672efc 100644
> > > --- a/include/uapi/drm/i915_drm.h
> > > +++ b/include/uapi/drm/i915_drm.h
> > > @@ -347,6 +347,8 @@ typedef struct drm_i915_irq_wait {  #define
> > > I915_PARAM_HAS_COHERENT_PHYS_GTT 29
> > >  #define I915_PARAM_MMAP_VERSION          30
> > >  #define I915_PARAM_HAS_BSD2		 31
> > > +#define I915_PARAM_SUBSLICE_TOTAL	 32
> > > +#define I915_PARAM_EU_TOTAL		 33
> > >
> > >  typedef struct drm_i915_getparam {
> > >  	int param;
> > > --
> > > 2.3.0
> > >
> > > _______________________________________________
> > > Beignet mailing list
> > > Beignet at lists.freedesktop.org
> > > http://lists.freedesktop.org/mailman/listinfo/beignet
> > _______________________________________________
> > Beignet mailing list
> > Beignet at lists.freedesktop.org
> > http://lists.freedesktop.org/mailman/listinfo/beignet
> _______________________________________________
> Beignet mailing list
> Beignet at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/beignet


From zhigang.gong at linux.intel.com  Sun Mar  8 17:21:18 2015
From: zhigang.gong at linux.intel.com (Zhigang Gong)
Date: Mon, 9 Mar 2015 08:21:18 +0800
Subject: [Beignet] [PATCH 1/2] Add driver callback for updating device
	info
In-Reply-To: <1425339759-19027-1-git-send-email-jeff.mcgee@intel.com>
References: <1425339759-19027-1-git-send-email-jeff.mcgee@intel.com>
Message-ID: <016101d059fe$f78aac10$e6a00430$@linux.intel.com>

This patchset is a must for Beignet to support CHV. One comment is that we should
put the usage of these new libdrm APIs in a conditional block so that we don't break the
build on old systems.

For the other parts of the patchset:

Reviewed-by: Zhigang Gong <zhigang.gong at linux.intel.com>

Thanks,
Zhigang Gong.

> -----Original Message-----
> From: Beignet [mailto:beignet-bounces at lists.freedesktop.org] On Behalf Of
> jeff.mcgee at intel.com
> Sent: Tuesday, March 3, 2015 7:43 AM
> To: beignet at lists.freedesktop.org
> Cc: intel-gfx at lists.freedesktop.org; dri-devel at lists.freedesktop.org
> Subject: [Beignet] [PATCH 1/2] Add driver callback for updating device info
> 
> From: Jeff McGee <jeff.mcgee at intel.com>
> 
> We need to update some fields of the device's cl_device_id struct at runtime
> using driver-specific methods. It is best to group all such updates into a single
> driver callback to avoid opening/initing and deiniting/closing the device multiple
> times.
> 
> Signed-off-by: Jeff McGee <jeff.mcgee at intel.com>
> ---
>  src/cl_device_id.c       | 20 ++------------------
>  src/cl_driver.h          |  4 ++++
>  src/cl_driver_defs.c     |  1 +
>  src/intel/intel_driver.c | 36 ++++++++++++++++++++++++++++++++++++
>  4 files changed, 43 insertions(+), 18 deletions(-)
> 
> diff --git a/src/cl_device_id.c b/src/cl_device_id.c index 4e01c9f..fefcef3 100644
> --- a/src/cl_device_id.c
> +++ b/src/cl_device_id.c
> @@ -506,24 +506,8 @@ skl_gt4_break:
>      ret->profile_sz = strlen(ret->profile) + 1;
>    }
> 
> -#ifdef HAS_USERPTR
> -  cl_driver dummy = cl_driver_new(NULL);
> -  cl_buffer_mgr bufmgr = cl_driver_get_bufmgr(dummy);
> -
> -  const size_t sz = 4096;
> -  void* host_ptr = cl_aligned_malloc(sz, 4096);;
> -  if (host_ptr != NULL) {
> -    cl_buffer bo = cl_buffer_alloc_userptr(bufmgr, "CL memory object",
> host_ptr, sz, 0);
> -    if (bo == NULL)
> -      ret->host_unified_memory = CL_FALSE;
> -    else
> -      cl_buffer_unreference(bo);
> -    cl_free(host_ptr);
> -  }
> -  else
> -    ret->host_unified_memory = CL_FALSE;
> -  cl_driver_delete(dummy);
> -#endif
> +  /* Apply any driver-dependent updates to the device info */
> + cl_driver_update_device_info(ret);
> 
>    struct sysinfo info;
>    if (sysinfo(&info) == 0) {
> diff --git a/src/cl_driver.h b/src/cl_driver.h index 16f8bba..3f54a27 100644
> --- a/src/cl_driver.h
> +++ b/src/cl_driver.h
> @@ -376,6 +376,10 @@ extern cl_buffer_get_tiling_align_cb
> *cl_buffer_get_tiling_align;  typedef int (cl_driver_get_device_id_cb)(void);
>  extern cl_driver_get_device_id_cb *cl_driver_get_device_id;
> 
> +/* Update the device info */
> +typedef void (cl_driver_update_device_info_cb)(cl_device_id device);
> +extern cl_driver_update_device_info_cb *cl_driver_update_device_info;
> +
> 
> /***************************************************************
> ***********
>   * cl_khr_gl_sharing.
> 
> ****************************************************************
> **********/
> diff --git a/src/cl_driver_defs.c b/src/cl_driver_defs.c index 2b68539..9a47210
> 100644
> --- a/src/cl_driver_defs.c
> +++ b/src/cl_driver_defs.c
> @@ -26,6 +26,7 @@ LOCAL cl_driver_delete_cb *cl_driver_delete = NULL;
> LOCAL cl_driver_get_bufmgr_cb *cl_driver_get_bufmgr = NULL;  LOCAL
> cl_driver_get_ver_cb *cl_driver_get_ver = NULL;  LOCAL
> cl_driver_get_device_id_cb *cl_driver_get_device_id = NULL;
> +LOCAL cl_driver_update_device_info_cb *cl_driver_update_device_info =
> +NULL;
> 
>  /* Buffer */
>  LOCAL cl_buffer_alloc_cb *cl_buffer_alloc = NULL; diff --git
> a/src/intel/intel_driver.c b/src/intel/intel_driver.c index ff0cf27..d61988c
> 100644
> --- a/src/intel/intel_driver.c
> +++ b/src/intel/intel_driver.c
> @@ -754,6 +754,41 @@ static int intel_buffer_set_tiling(cl_buffer bo,
>    return ret;
>  }
> 
> +static void
> +intel_update_device_info(cl_device_id device) { #ifdef HAS_USERPTR
> +  intel_driver_t *driver;
> +  const size_t sz = 4096;
> +  void *host_ptr;
> +
> +  driver = intel_driver_new();
> +  assert(driver != NULL);
> +  if (intel_driver_open(driver, NULL) != CL_SUCCESS) {
> +    intel_driver_delete(driver);
> +    return;
> +  }
> +
> +  host_ptr = cl_aligned_malloc(sz, 4096);  if (host_ptr != NULL) {
> +    cl_buffer bo = intel_buffer_alloc_userptr((cl_buffer_mgr)driver->bufmgr,
> +      "CL memory object", host_ptr, sz, 0);
> +    if (bo == NULL)
> +      device->host_unified_memory = CL_FALSE;
> +    else
> +      drm_intel_bo_unreference((drm_intel_bo*)bo);
> +    cl_free(host_ptr);
> +  }
> +  else
> +    device->host_unified_memory = CL_FALSE;
> +
> +  intel_driver_context_destroy(driver);
> +  intel_driver_close(driver);
> +  intel_driver_terminate(driver);
> +  intel_driver_delete(driver);
> +#endif
> +}
> +
>  LOCAL void
>  intel_setup_callbacks(void)
>  {
> @@ -762,6 +797,7 @@ intel_setup_callbacks(void)
>    cl_driver_get_ver = (cl_driver_get_ver_cb *) intel_driver_get_ver;
>    cl_driver_get_bufmgr = (cl_driver_get_bufmgr_cb *)
> intel_driver_get_bufmgr;
>    cl_driver_get_device_id = (cl_driver_get_device_id_cb *)
> intel_get_device_id;
> +  cl_driver_update_device_info = (cl_driver_update_device_info_cb *)
> + intel_update_device_info;
>    cl_buffer_alloc = (cl_buffer_alloc_cb *) drm_intel_bo_alloc;
>    cl_buffer_alloc_userptr = (cl_buffer_alloc_userptr_cb*)
> intel_buffer_alloc_userptr;
>    cl_buffer_set_tiling = (cl_buffer_set_tiling_cb *) intel_buffer_set_tiling;
> --
> 2.3.0
> 
> _______________________________________________
> Beignet mailing list
> Beignet at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/beignet


From zhigang.gong at intel.com  Sun Mar  8 17:58:21 2015
From: zhigang.gong at intel.com (Zhigang Gong)
Date: Mon,  9 Mar 2015 08:58:21 +0800
Subject: [Beignet] [PATCH] GBE: add a new incompatible compile option
	-cl-finite-math-only.
Message-ID: <1425862701-23792-1-git-send-email-zhigang.gong@intel.com>

Signed-off-by: Zhigang Gong <zhigang.gong at intel.com>
---
 backend/src/backend/program.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/backend/src/backend/program.cpp b/backend/src/backend/program.cpp
index 06810bd..eee7c3c 100644
--- a/backend/src/backend/program.cpp
+++ b/backend/src/backend/program.cpp
@@ -676,7 +676,7 @@ namespace gbe {
       const std::string unsupportedOptions("-cl-denorms-are-zero, -cl-strict-aliasing, -cl-opt-disable,"
                        "-cl-no-signed-zeros, -cl-fp32-correctly-rounded-divide-sqrt");
 
-      const std::string uncompatiblePCHOptions = ("-cl-single-precision-constant, -cl-fast-relaxed-math, -cl-std=CL1.1");
+      const std::string uncompatiblePCHOptions = ("-cl-single-precision-constant, -cl-fast-relaxed-math, -cl-std=CL1.1, -cl-finite-math-only");
       const std::string fastMathOption = ("-cl-fast-relaxed-math");
       while (end != std::string::npos) {
         end = optionStr.find(' ', start);
-- 
1.9.1


From zhigang.gong at linux.intel.com  Sun Mar  8 18:11:38 2015
From: zhigang.gong at linux.intel.com (Zhigang Gong)
Date: Mon, 9 Mar 2015 09:11:38 +0800
Subject: [Beignet] [V2 PATCH 5/7] Backend: Handle the bswap using
 indirect mode access.
In-Reply-To: <1425626640-25840-1-git-send-email-junyan.he@inbox.com>
References: <1425626640-25840-1-git-send-email-junyan.he@inbox.com>
Message-ID: <20150309011138.GB20578@ivb-gt2-rev4>

On Fri, Mar 06, 2015 at 03:24:00PM +0800, junyan.he at inbox.com wrote:
> From: Junyan He <junyan.he at linux.intel.com>
> 
> The swap for short will be like:
> mov(1)   a0<1>:UD        0xe600e61UD            { align1 WE_all };
> mov(1)   a0.1<1>:UD      0xe620e63UD            { align1 WE_all };
> mov(1)   a0.2<1>:UD      0xe640e65UD            { align1 WE_all };
> mov(1)   a0.3<1>:UD      0xe660e67UD            { align1 WE_all };
> mov(8)   g114<1>:UB      g[a0]<VxH,1,0>:UB      { align1 WE_all 1Q };
> mov(8)   g114.8<1>:UB    g[a0 8]<VxH,1,0>:UB    { align1 WE_all 1Q };
> mov(8)   g114.16<1>:UB   g[a0 16]<VxH,1,0>:UB   { align1 WE_all 1Q };
> mov(8)   g114.24<1>:UB   g[a0 24]<VxH,1,0>:UB   { align1 WE_all 1Q };
> mov(16)  g113<1>:UW      g114<8,8,1>:UW         { align1 WE_normal 1H };
> 
> Signed-off-by: Junyan He <junyan.he at linux.intel.com>
> ---
>  backend/src/backend/gen_context.cpp        |  112 ++++++++++++++++++++++++++++
>  backend/src/backend/gen_insn_selection.cpp |    9 +++
>  backend/src/backend/gen_insn_selection.hxx |    1 +
>  3 files changed, 122 insertions(+)
> 
> diff --git a/backend/src/backend/gen_context.cpp b/backend/src/backend/gen_context.cpp
> index 6856510..46b4a06 100644
> --- a/backend/src/backend/gen_context.cpp
> +++ b/backend/src/backend/gen_context.cpp
> @@ -297,6 +297,118 @@ namespace gbe
>            p->MOV(dst.top_half(this->simdWidth), GenRegister::immud(0));
>          break;
>        }
> +      case SEL_OP_BSWAP: {
> +        uint32_t simd = p->curr.execWidth;
> +        GBE_ASSERT(simd == 8 || simd == 16 || simd == 1);
> +        uint16_t new_a0[16];
> +        memset(new_a0, 0, sizeof(new_a0));
> +
> +        GBE_ASSERT(src.type == dst.type);
> +        uint32_t start_addr = src.nr*32 + src.subnr;
> +
> +        if (simd == 1) {
> +          GBE_ASSERT(src.hstride == GEN_HORIZONTAL_STRIDE_0
> +              && dst.hstride == GEN_HORIZONTAL_STRIDE_0);
> +          if (src.type == GEN_TYPE_UD || src.type == GEN_TYPE_D) {
> +            GBE_ASSERT(start_addr >= 0);
> +            new_a0[0] = start_addr + 3;
> +            new_a0[1] = start_addr + 2;
> +            new_a0[2] = start_addr + 1;
> +            new_a0[3] = start_addr;
> +            this->setA0Content(new_a0, 0, 4);
> +
> +            p->push();
> +            p->curr.execWidth = 4;
> +            p->curr.predicate = GEN_PREDICATE_NONE;
> +            p->curr.noMask = 1;
> +            GenRegister ind_src = GenRegister::to_indirect1xN(GenRegister::retype(src, GEN_TYPE_UB),
> +                a0[0], new_a0[0] - a0[0]);
> +            GenRegister dst_ = dst;
> +            dst_.type = GEN_TYPE_UB;
> +            dst_.hstride = GEN_HORIZONTAL_STRIDE_1;
> +            dst_.width = GEN_WIDTH_4;
> +            dst_.vstride = GEN_VERTICAL_STRIDE_4;
> +            p->MOV(dst_, ind_src);
> +            p->pop();
> +          } else if (src.type == GEN_TYPE_UW || src.type == GEN_TYPE_W) {
> +            p->MOV(GenRegister::retype(dst, GEN_TYPE_UB),
> +                GenRegister::retype(GenRegister::offset(src, 0, 1), GEN_TYPE_UB));
> +            p->MOV(GenRegister::retype(GenRegister::offset(dst, 0, 1), GEN_TYPE_UB),
> +                GenRegister::retype(src, GEN_TYPE_UB));
> +          } else {
> +            GBE_ASSERT(0);
> +          }
> +        } else {
> +          if (src.type == GEN_TYPE_UD || src.type == GEN_TYPE_D) {
> +            GBE_ASSERT(src.subnr == 0);
The above assertion is not correct, because a valid simd8 or simd16 BSWAP instruction may have a
uniform source register. We can't assume that the source register is never a uniform value.

> +            GBE_ASSERT(dst.subnr == 0);
> +            GBE_ASSERT(tmp.subnr == 0);
> +            GBE_ASSERT(start_addr >= 0);
> +            new_a0[0] = start_addr + 3;
> +            new_a0[1] = start_addr + 2;
> +            new_a0[2] = start_addr + 1;
> +            new_a0[3] = start_addr;
> +            new_a0[4] = start_addr + 7;
> +            new_a0[5] = start_addr + 6;
> +            new_a0[6] = start_addr + 5;
> +            new_a0[7] = start_addr + 4;
> +            this->setA0Content(new_a0, 56);
> +
> +            p->push();
> +            p->curr.execWidth = 8;
> +            p->curr.predicate = GEN_PREDICATE_NONE;
> +            p->curr.noMask = 1;
> +            GenRegister ind_src = GenRegister::to_indirect1xN(GenRegister::retype(src, GEN_TYPE_UB),
> +                a0[0], new_a0[0] - a0[0]);
> +            p->MOV(GenRegister::retype(tmp, GEN_TYPE_UB), ind_src);
> +            for (int i = 1; i < 4; i++) {
> +              ind_src.addr_imm += 8;
> +              p->MOV(GenRegister::offset(GenRegister::retype(tmp, GEN_TYPE_UB), 0, 8*i), ind_src);
> +            }
> +            if (simd == 16) {
> +              for (int i = 0; i < 4; i++) {
> +                ind_src.addr_imm += 8;
> +                p->MOV(GenRegister::offset(GenRegister::retype(tmp, GEN_TYPE_UB), 1, 8*i), ind_src);
> +              }
> +            }
> +            p->pop();
> +
> +            p->MOV(dst, tmp);
> +          } else if (src.type == GEN_TYPE_UW || src.type == GEN_TYPE_W) {
> +            GBE_ASSERT(src.subnr == 0 || src.subnr == 16);
> +            GBE_ASSERT(dst.subnr == 0 || dst.subnr == 16);
> +            GBE_ASSERT(tmp.subnr == 0 || tmp.subnr == 16);
> +            GBE_ASSERT(start_addr >= 0);
> +            new_a0[0] = start_addr + 1;
> +            new_a0[1] = start_addr;
> +            new_a0[2] = start_addr + 3;
> +            new_a0[3] = start_addr + 2;
> +            new_a0[4] = start_addr + 5;
> +            new_a0[5] = start_addr + 4;
> +            new_a0[6] = start_addr + 7;
> +            new_a0[7] = start_addr + 6;
> +            this->setA0Content(new_a0, 56);
> +
> +            p->push();
> +            p->curr.execWidth = 8;
> +            p->curr.predicate = GEN_PREDICATE_NONE;
> +            p->curr.noMask = 1;
> +            GenRegister ind_src = GenRegister::to_indirect1xN(GenRegister::retype(src, GEN_TYPE_UB),
> +                a0[0], new_a0[0] - a0[0]);
> +            p->MOV(GenRegister::retype(tmp, GEN_TYPE_UB), ind_src);
> +            for (int i = 1; i < (simd == 8 ? 2 : 4); i++) {
> +              ind_src.addr_imm += 8;
> +              p->MOV(GenRegister::offset(GenRegister::retype(tmp, GEN_TYPE_UB), 0, 8*i), ind_src);
> +            }
> +            p->pop();
> +
> +            p->MOV(dst, tmp);
> +          } else {
> +            GBE_ASSERT(0);
> +          }
> +        }
> +      }
> +      break;
>        default:
>          NOT_IMPLEMENTED;
>      }
> diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp
> index d100f80..2b166b1 100644
> --- a/backend/src/backend/gen_insn_selection.cpp
> +++ b/backend/src/backend/gen_insn_selection.cpp
> @@ -498,6 +498,7 @@ namespace gbe
>      ALU1(RNDE)
>      ALU1(F16TO32)
>      ALU1(F32TO16)
> +    ALU1WithTemp(BSWAP)
>      ALU2(SEL)
>      ALU2(SEL_INT64)
>      ALU1(NOT)
> @@ -2121,6 +2122,14 @@ namespace gbe
>            case ir::OP_SQR: sel.MATH(dst, GEN_MATH_FUNCTION_SQRT, src); break;
>            case ir::OP_RSQ: sel.MATH(dst, GEN_MATH_FUNCTION_RSQ, src); break;
>            case ir::OP_RCP: sel.MATH(dst, GEN_MATH_FUNCTION_INV, src); break;
> +          case ir::OP_BSWAP:
> +            {
> +              ir::Register tmp = sel.reg(getFamily(insnType));
> +              const GenRegister src_ = GenRegister::retype(src, getGenType(insnType));
> +              const GenRegister dst_ = GenRegister::retype(dst, getGenType(insnType));
> +              sel.BSWAP(dst_, src_, sel.selReg(tmp, insnType));
> +              break;
> +            }
>            case ir::OP_SIMD_ANY:
>              {
>                const GenRegister constZero = GenRegister::immuw(0);;
> diff --git a/backend/src/backend/gen_insn_selection.hxx b/backend/src/backend/gen_insn_selection.hxx
> index be1f7ec..09f5aaf 100644
> --- a/backend/src/backend/gen_insn_selection.hxx
> +++ b/backend/src/backend/gen_insn_selection.hxx
> @@ -1,5 +1,6 @@
>  DECL_SELECTION_IR(LABEL, LabelInstruction)
>  DECL_SELECTION_IR(MOV, UnaryInstruction)
> +DECL_SELECTION_IR(BSWAP, UnaryWithTempInstruction)
>  DECL_SELECTION_IR(MOV_DF, UnaryWithTempInstruction)
>  DECL_SELECTION_IR(LOAD_DF_IMM, UnaryWithTempInstruction)
>  DECL_SELECTION_IR(LOAD_INT64_IMM, UnaryInstruction)
> -- 
> 1.7.9.5
> 
> _______________________________________________
> Beignet mailing list
> Beignet at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/beignet

From nanhai.zou at intel.com  Sun Mar  8 19:34:46 2015
From: nanhai.zou at intel.com (Zou, Nanhai)
Date: Mon, 9 Mar 2015 02:34:46 +0000
Subject: [Beignet] [Intel-gfx] Preventing zero GPU virtual address
 allocation
In-Reply-To: <20150306083919.GH18784@nuc-i3427.alporthouse.com>
References: <148B1B7A67D1C24B9EF0BE42EA4977062B7DCBFB@SHSMSX101.ccr.corp.intel.com>
 <20150305125251.GA18775@phenom.ffwll.local>
 <20150305130121.GA18784@nuc-i3427.alporthouse.com>
 <20150305152758.GI18775@phenom.ffwll.local>
 <DF876E69000F0E4DB19B760E3EBA5C7501C5F9C1@SHSMSX103.ccr.corp.intel.com>
 <20150306083919.GH18784@nuc-i3427.alporthouse.com>
Message-ID: <DF876E69000F0E4DB19B760E3EBA5C7501C607BC@SHSMSX103.ccr.corp.intel.com>

We don't need MAP_FIXED; we just want to prevent address 0 from being allocated.

Though I think using MAP_FIXED is overkill and will bring much unnecessary complexity on both the kernel and Beignet sides,
I don't mind if people can provide stable MAP_FIXED patches to resolve this problem a few months or years later.

At that time, the kernel driver can revert the reserve-page-0 patch.
Until then, reserving page 0 can benefit all Beignet users without breaking anything.

As far as I know, on the CPU side, many architectures with a flexible address space, such as IA64, have reserved virtual page 0 to address this problem,
so I don't see why this is nonsense.

Thanks
Zou Nanhai


> -----Original Message-----
> From: Chris Wilson [mailto:chris at chris-wilson.co.uk]
> Sent: Friday, March 06, 2015 4:39 PM
> To: Zou, Nanhai
> Cc: Daniel Vetter; Song, Ruiling; Vetter, Daniel; intel-gfx at lists.freedesktop.org;
> Yang, Rong R; beignet at lists.freedesktop.org; Weinehall, David
> Subject: Re: [Beignet] [Intel-gfx] Preventing zero GPU virtual address allocation
> 
> On Fri, Mar 06, 2015 at 02:11:18AM +0000, Zou, Nanhai wrote:
> > I don't understand why we need a complex solution when there is already a
> simple solution with patch.
> > What is the drawback of reserving page 0?
> > Before we going to that complex solution, could we just reserve page zero?
> > It is simple and straight forward.
> 
> Because it is a nonsense ABI constraint. If you want the equivalent of
> MAP_FIXED, we should give you MAP_FIXED.
> -Chris
> 
> --
> Chris Wilson, Intel Open Source Technology Centre

From xionghu.luo at intel.com  Sun Mar  8 20:24:25 2015
From: xionghu.luo at intel.com (xionghu.luo at intel.com)
Date: Mon,  9 Mar 2015 11:24:25 +0800
Subject: [Beignet] [PATCH 1/4] enable cl_khr_spir extension to build and run
	from SPIR binary.
Message-ID: <1425871468-11096-1-git-send-email-xionghu.luo@intel.com>

From: Luo Xionghu <xionghu.luo at intel.com>

SPIR binaries are built by clang, which generates a standard LLVM Module file;
Beignet needs to insert one byte before the module to represent the binary type,
then parse the module to link.
Enable the cl_khr_spir extension output string;
enable the SPIR calling convention CallingConv::SPIR_KERNEL;
get_global_id should be OVERLOADABLE; fix some bugs in the printf parser
and backend.

v2: move OVERLOADABLE change to another patch to keep clean;
rename FROM_INTERMEDIATE to FROM_LLVM_SPIR.

Signed-off-by: Luo Xionghu <xionghu.luo at intel.com>
---
 backend/src/backend/gen_program.cpp        |  5 +++++
 backend/src/libocl/tmpl/ocl_defines.tmpl.h |  1 +
 backend/src/llvm/llvm_gen_backend.cpp      |  5 ++++-
 backend/src/llvm/llvm_printf_parser.cpp    |  3 ++-
 backend/src/llvm/llvm_scalarize.cpp        |  1 +
 src/cl_api.c                               |  1 +
 src/cl_extensions.c                        |  4 ++++
 src/cl_program.c                           | 21 +++++++++++++++++++--
 src/cl_program.h                           |  3 ++-
 9 files changed, 39 insertions(+), 5 deletions(-)

diff --git a/backend/src/backend/gen_program.cpp b/backend/src/backend/gen_program.cpp
index 755c60e..f4c74f8 100644
--- a/backend/src/backend/gen_program.cpp
+++ b/backend/src/backend/gen_program.cpp
@@ -261,6 +261,11 @@ namespace gbe {
     acquireLLVMContextLock();
     llvm::Module* module = llvm::ParseIR(memory_buffer, Err, c);
 #endif
+    // if load 32 bit spir binary, the triple should be spir-unknown-unknown.
+    llvm::Triple triple(module->getTargetTriple());
+    if(triple.getArchName() == "spir" && triple.getVendorName() == "unknown" && triple.getOSName() == "unknown"){
+      module->setTargetTriple("spir");
+    }
     releaseLLVMContextLock();
     if(module == NULL){
       GBE_ASSERT(0);
diff --git a/backend/src/libocl/tmpl/ocl_defines.tmpl.h b/backend/src/libocl/tmpl/ocl_defines.tmpl.h
index 4e210be..fe999b2 100644
--- a/backend/src/libocl/tmpl/ocl_defines.tmpl.h
+++ b/backend/src/libocl/tmpl/ocl_defines.tmpl.h
@@ -34,5 +34,6 @@
 #define cl_khr_byte_addressable_store
 #define cl_khr_icd
 #define cl_khr_gl_sharing
+#define cl_khr_spir
 
 #endif /* end of __OCL_COMMON_DEF_H__ */
diff --git a/backend/src/llvm/llvm_gen_backend.cpp b/backend/src/llvm/llvm_gen_backend.cpp
index 6390551..2e03120 100644
--- a/backend/src/llvm/llvm_gen_backend.cpp
+++ b/backend/src/llvm/llvm_gen_backend.cpp
@@ -1375,7 +1375,9 @@ namespace gbe
         llvmInfo.typeName = (cast<MDString>(typeNameNode->getOperand(1 + argID)))->getString();
         llvmInfo.accessQual = (cast<MDString>(accessQualNode->getOperand(1 + argID)))->getString();
         llvmInfo.typeQual = (cast<MDString>(typeQualNode->getOperand(1 + argID)))->getString();
-        llvmInfo.argName = (cast<MDString>(argNameNode->getOperand(1 + argID)))->getString();
+        if(argNameNode){
+          llvmInfo.argName = (cast<MDString>(argNameNode->getOperand(1 + argID)))->getString();
+        }
 
         // function arguments are uniform values.
         this->newRegister(I, NULL, true);
@@ -2022,6 +2024,7 @@ namespace gbe
 #else
       case CallingConv::C:
       case CallingConv::Fast:
+      case CallingConv::SPIR_KERNEL:
 #endif
         break;
       default:
diff --git a/backend/src/llvm/llvm_printf_parser.cpp b/backend/src/llvm/llvm_printf_parser.cpp
index 8e662b3..9632011 100644
--- a/backend/src/llvm/llvm_printf_parser.cpp
+++ b/backend/src/llvm/llvm_printf_parser.cpp
@@ -565,6 +565,7 @@ error:
 #else
       case CallingConv::C:
       case CallingConv::Fast:
+      case CallingConv::SPIR_KERNEL:
 #endif
         break;
       default:
@@ -595,7 +596,7 @@ error:
           continue;
         }
 
-        if (call->getCalledFunction()->getIntrinsicID() != 0)
+        if (call->getCalledFunction() && call->getCalledFunction()->getIntrinsicID() != 0)
           continue;
 
         Value *Callee = call->getCalledValue();
diff --git a/backend/src/llvm/llvm_scalarize.cpp b/backend/src/llvm/llvm_scalarize.cpp
index 97a7615..15309de 100644
--- a/backend/src/llvm/llvm_scalarize.cpp
+++ b/backend/src/llvm/llvm_scalarize.cpp
@@ -789,6 +789,7 @@ namespace gbe {
 #else
     case CallingConv::C:
     case CallingConv::Fast:
+    case CallingConv::SPIR_KERNEL:
 #endif
       break;
     default:
diff --git a/src/cl_api.c b/src/cl_api.c
index 972c687..3e72deb 100644
--- a/src/cl_api.c
+++ b/src/cl_api.c
@@ -941,6 +941,7 @@ clBuildProgram(cl_program            program,
   /* TODO support create program from binary */
   assert(program->source_type == FROM_LLVM ||
          program->source_type == FROM_SOURCE ||
+         program->source_type == FROM_LLVM_SPIR ||
          program->source_type == FROM_BINARY);
   if((err = cl_program_build(program, options)) != CL_SUCCESS) {
     goto error;
diff --git a/src/cl_extensions.c b/src/cl_extensions.c
index d07a525..cea2dd8 100644
--- a/src/cl_extensions.c
+++ b/src/cl_extensions.c
@@ -34,8 +34,12 @@ void check_opt1_extension(cl_extensions_t *extensions)
 {
   int id;
   for(id = OPT1_EXT_START_ID; id <= OPT1_EXT_END_ID; id++)
+  {
     if (id == EXT_ID(khr_icd))
       extensions->extensions[id].base.ext_enabled = 1;
+    if (id == EXT_ID(khr_spir))
+      extensions->extensions[id].base.ext_enabled = 1;
+  }
 }
 
 void
diff --git a/src/cl_program.c b/src/cl_program.c
index c30f85e..db53757 100644
--- a/src/cl_program.c
+++ b/src/cl_program.c
@@ -231,7 +231,21 @@ cl_program_create_from_binary(cl_context             ctx,
   program->binary_sz = lengths[0];
   program->source_type = FROM_BINARY;
 
-  if(isBitcode((unsigned char*)program->binary+1, (unsigned char*)program->binary+program->binary_sz)) {
+  if(isBitcode((unsigned char*)program->binary, (unsigned char*)program->binary+program->binary_sz)) {
+
+    char* typed_binary;
+    TRY_ALLOC(typed_binary, cl_calloc(lengths[0]+1, sizeof(char)));
+    memcpy(typed_binary+1, binaries[0], lengths[0]);
+    *typed_binary = 1;
+    program->opaque = compiler_program_new_from_llvm_binary(program->ctx->device->vendor_id, typed_binary, program->binary_sz+1);
+    cl_free(typed_binary);
+    if (UNLIKELY(program->opaque == NULL)) {
+      err = CL_INVALID_PROGRAM;
+      goto error;
+    }
+
+    program->source_type = FROM_LLVM_SPIR;
+  }else if(isBitcode((unsigned char*)program->binary+1, (unsigned char*)program->binary+program->binary_sz)) {
     if(*program->binary == 1){
       program->binary_type = CL_PROGRAM_BINARY_TYPE_COMPILED_OBJECT;
     }else if(*program->binary == 2){
@@ -499,6 +513,9 @@ cl_program_build(cl_program p, const char *options)
       memcpy(p->build_opts, options, strlen(options));
 
       p->source_type = p->source ? FROM_SOURCE : p->binary ? FROM_BINARY : FROM_LLVM;
+      if (strstr(options, "-x spir")) {
+        p->source_type = FROM_LLVM_SPIR;
+      }
     }
   }
 
@@ -526,7 +543,7 @@ cl_program_build(cl_program p, const char *options)
 
     /* Create all the kernels */
     TRY (cl_program_load_gen_program, p);
-  } else if (p->source_type == FROM_LLVM) {
+  } else if (p->source_type == FROM_LLVM || p->source_type == FROM_LLVM_SPIR) {
     if (!CompilerSupported()) {
       err = CL_COMPILER_NOT_AVAILABLE;
       goto error;
diff --git a/src/cl_program.h b/src/cl_program.h
index 3ab7acd..7af0206 100644
--- a/src/cl_program.h
+++ b/src/cl_program.h
@@ -33,7 +33,8 @@ struct _gbe_program;
 enum {
   FROM_SOURCE = 0,
   FROM_LLVM = 1,
-  FROM_BINARY = 2
+  FROM_BINARY = 2,
+  FROM_LLVM_SPIR = 3
 };
 
 /* This maps an OCL file containing some kernels */
-- 
1.9.1


From xionghu.luo at intel.com  Sun Mar  8 20:24:26 2015
From: xionghu.luo at intel.com (xionghu.luo at intel.com)
Date: Mon,  9 Mar 2015 11:24:26 +0800
Subject: [Beignet] [PATCH 2/4] change the workitem related api to
	OVERLOABABLE.
In-Reply-To: <1425871468-11096-1-git-send-email-xionghu.luo@intel.com>
References: <1425871468-11096-1-git-send-email-xionghu.luo@intel.com>
Message-ID: <1425871468-11096-2-git-send-email-xionghu.luo@intel.com>

From: Luo Xionghu <xionghu.luo at intel.com>

The SPIR header file requires these functions to be overloadable.
(https://github.com/KhronosGroup/SPIR-Tools/blob/master/headers/opencl_spir.h)

Signed-off-by: Luo Xionghu <xionghu.luo at intel.com>
---
 backend/src/libocl/include/ocl_async.h    |  2 +-
 backend/src/libocl/include/ocl_sync.h     |  2 +-
 backend/src/libocl/include/ocl_types.h    |  2 --
 backend/src/libocl/include/ocl_workitem.h | 16 ++++++++--------
 backend/src/libocl/src/ocl_async.cl       |  2 +-
 backend/src/libocl/src/ocl_barrier.ll     |  2 +-
 backend/src/libocl/src/ocl_workitem.cl    |  6 +++---
 kernels/compiler_async_copy.cl            |  4 ++--
 8 files changed, 17 insertions(+), 19 deletions(-)

diff --git a/backend/src/libocl/include/ocl_async.h b/backend/src/libocl/include/ocl_async.h
index dd89942..9d5cc06 100644
--- a/backend/src/libocl/include/ocl_async.h
+++ b/backend/src/libocl/include/ocl_async.h
@@ -45,7 +45,7 @@ DEF(double)
 #undef DEFN
 #undef DEF
 
-void wait_group_events (int num_events, event_t *event_list);
+OVERLOADABLE void wait_group_events (int num_events, event_t *event_list);
 
 #define DEFN(TYPE) \
 OVERLOADABLE void prefetch(const global TYPE *p, size_t num);
diff --git a/backend/src/libocl/include/ocl_sync.h b/backend/src/libocl/include/ocl_sync.h
index ed7c6e4..18090d5 100644
--- a/backend/src/libocl/include/ocl_sync.h
+++ b/backend/src/libocl/include/ocl_sync.h
@@ -27,7 +27,7 @@
 #define CLK_GLOBAL_MEM_FENCE (1 << 1)
 
 typedef uint cl_mem_fence_flags;
-void barrier(cl_mem_fence_flags flags);
+OVERLOADABLE void barrier(cl_mem_fence_flags flags);
 void mem_fence(cl_mem_fence_flags flags);
 void read_mem_fence(cl_mem_fence_flags flags);
 void write_mem_fence(cl_mem_fence_flags flags);
diff --git a/backend/src/libocl/include/ocl_types.h b/backend/src/libocl/include/ocl_types.h
index 487fe68..ae0236b 100644
--- a/backend/src/libocl/include/ocl_types.h
+++ b/backend/src/libocl/include/ocl_types.h
@@ -87,7 +87,5 @@ DEF(double);
 // FIXME:
 // This is a transitional hack to bypass the LLVM 3.3 built-in types.
 // See the Khronos SPIR specification for handling of these types.
-typedef size_t __event_t;
-#define event_t __event_t
 
 #endif /* __OCL_TYPES_H__ */
diff --git a/backend/src/libocl/include/ocl_workitem.h b/backend/src/libocl/include/ocl_workitem.h
index 7534ee8..84bb1fb 100644
--- a/backend/src/libocl/include/ocl_workitem.h
+++ b/backend/src/libocl/include/ocl_workitem.h
@@ -20,13 +20,13 @@
 
 #include "ocl_types.h"
 
-uint get_work_dim(void);
-uint get_global_size(uint dimindx);
-uint get_global_id(uint dimindx);
-uint get_local_size(uint dimindx);
-uint get_local_id(uint dimindx);
-uint get_num_groups(uint dimindx);
-uint get_group_id(uint dimindx);
-uint get_global_offset(uint dimindx);
+OVERLOADABLE uint get_work_dim(void);
+OVERLOADABLE uint get_global_size(uint dimindx);
+OVERLOADABLE uint get_global_id(uint dimindx);
+OVERLOADABLE uint get_local_size(uint dimindx);
+OVERLOADABLE uint get_local_id(uint dimindx);
+OVERLOADABLE uint get_num_groups(uint dimindx);
+OVERLOADABLE uint get_group_id(uint dimindx);
+OVERLOADABLE uint get_global_offset(uint dimindx);
 
 #endif  /* __OCL_WORKITEM_H__ */
diff --git a/backend/src/libocl/src/ocl_async.cl b/backend/src/libocl/src/ocl_async.cl
index 041aaf2..10d0aa4 100644
--- a/backend/src/libocl/src/ocl_async.cl
+++ b/backend/src/libocl/src/ocl_async.cl
@@ -66,7 +66,7 @@ DEF(double)
 #undef DEFN
 #undef DEF
 
-void wait_group_events (int num_events, event_t *event_list) {
+OVERLOADABLE void wait_group_events (int num_events, event_t *event_list) {
   barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);
 }
 
diff --git a/backend/src/libocl/src/ocl_barrier.ll b/backend/src/libocl/src/ocl_barrier.ll
index 4e55fcb..dc3579c 100644
--- a/backend/src/libocl/src/ocl_barrier.ll
+++ b/backend/src/libocl/src/ocl_barrier.ll
@@ -10,7 +10,7 @@ declare void @__gen_ocl_barrier_local() nounwind alwaysinline noduplicate
 declare void @__gen_ocl_barrier_global() nounwind alwaysinline noduplicate
 declare void @__gen_ocl_barrier_local_and_global() nounwind alwaysinline noduplicate
 
-define void @barrier(i32 %flags) nounwind noduplicate alwaysinline {
+define void @_Z7barrierj(i32 %flags) nounwind noduplicate alwaysinline {
   %1 = icmp eq i32 %flags, 3
   br i1 %1, label %barrier_local_global, label %barrier_local_check
 
diff --git a/backend/src/libocl/src/ocl_workitem.cl b/backend/src/libocl/src/ocl_workitem.cl
index f4629f8..6ddc406 100644
--- a/backend/src/libocl/src/ocl_workitem.cl
+++ b/backend/src/libocl/src/ocl_workitem.cl
@@ -18,7 +18,7 @@
 #include "ocl_workitem.h"
 
 PURE CONST uint __gen_ocl_get_work_dim(void);
-uint get_work_dim(void)
+OVERLOADABLE uint get_work_dim(void)
 {
   return __gen_ocl_get_work_dim();
 }
@@ -37,7 +37,7 @@ DECL_INTERNAL_WORK_ITEM_FN(get_num_groups)
 #undef DECL_INTERNAL_WORK_ITEM_FN
 
 #define DECL_PUBLIC_WORK_ITEM_FN(NAME, OTHER_RET)    \
-unsigned NAME(unsigned int dim) {             \
+OVERLOADABLE unsigned NAME(unsigned int dim) {             \
   if (dim == 0) return __gen_ocl_##NAME##0();        \
   else if (dim == 1) return __gen_ocl_##NAME##1();   \
   else if (dim == 2) return __gen_ocl_##NAME##2();   \
@@ -52,6 +52,6 @@ DECL_PUBLIC_WORK_ITEM_FN(get_global_offset, 0)
 DECL_PUBLIC_WORK_ITEM_FN(get_num_groups, 1)
 #undef DECL_PUBLIC_WORK_ITEM_FN
 
-uint get_global_id(uint dim) {
+OVERLOADABLE uint get_global_id(uint dim) {
   return get_local_id(dim) + get_local_size(dim) * get_group_id(dim) + get_global_offset(dim);
 }
diff --git a/kernels/compiler_async_copy.cl b/kernels/compiler_async_copy.cl
index dddde44..4beb436 100644
--- a/kernels/compiler_async_copy.cl
+++ b/kernels/compiler_async_copy.cl
@@ -5,10 +5,10 @@ compiler_async_copy_##TYPE(__global TYPE *dst, __global TYPE *src, __local TYPE
   event_t event; \
   int copiesPerWorkgroup = copiesPerWorkItem * get_local_size(0); \
   int i; \
-  event = async_work_group_copy((__local TYPE*)localBuffer, (__global const TYPE*)(src+copiesPerWorkgroup*get_group_id(0)), (size_t)copiesPerWorkgroup, (event_t)0 ); \
+  event = async_work_group_copy((__local TYPE*)localBuffer, (__global const TYPE*)(src+copiesPerWorkgroup*get_group_id(0)), (size_t)copiesPerWorkgroup, 0 ); \
   wait_group_events( 1, &event ); \
 \
-  event = async_work_group_copy((__global TYPE*)(dst+copiesPerWorkgroup*get_group_id(0)), (__local const TYPE*)localBuffer, (size_t)copiesPerWorkgroup, (event_t)0 ); \
+  event = async_work_group_copy((__global TYPE*)(dst+copiesPerWorkgroup*get_group_id(0)), (__local const TYPE*)localBuffer, (size_t)copiesPerWorkgroup, 0 ); \
   wait_group_events( 1, &event ); \
 }
 
-- 
1.9.1


From xionghu.luo at intel.com  Sun Mar  8 20:24:27 2015
From: xionghu.luo at intel.com (xionghu.luo at intel.com)
Date: Mon,  9 Mar 2015 11:24:27 +0800
Subject: [Beignet] [PATCH 3/4] SPIR binary support for printf function.
In-Reply-To: <1425871468-11096-1-git-send-email-xionghu.luo@intel.com>
References: <1425871468-11096-1-git-send-email-xionghu.luo@intel.com>
Message-ID: <1425871468-11096-3-git-send-email-xionghu.luo@intel.com>

From: Luo Xionghu <xionghu.luo at intel.com>

rename "printf" to "__gen_ocl_printf_stub" and "puts" to
"__gen_ocl_puts_stub" in PrintfParser after link.

Signed-off-by: Luo Xionghu <xionghu.luo at intel.com>
---
 backend/src/libocl/include/ocl_printf.h |  3 +++
 backend/src/llvm/llvm_printf_parser.cpp | 12 ++++++++++--
 2 files changed, 13 insertions(+), 2 deletions(-)

diff --git a/backend/src/libocl/include/ocl_printf.h b/backend/src/libocl/include/ocl_printf.h
index ffeefb9..27cef27 100644
--- a/backend/src/libocl/include/ocl_printf.h
+++ b/backend/src/libocl/include/ocl_printf.h
@@ -24,9 +24,12 @@
 /* From LLVM 3.4, c string are all in constant address space */
 #if 100*__clang_major__ + __clang_minor__ < 304
 int __gen_ocl_printf_stub(const char * format, ...);
+int __gen_ocl_puts_stub(const char * format);
 #else
 int __gen_ocl_printf_stub(constant char * format, ...);
+int __gen_ocl_puts_stub(constant char * format);
 #endif
 #define printf __gen_ocl_printf_stub
+#define puts __gen_ocl_puts_stub
 
 #endif
diff --git a/backend/src/llvm/llvm_printf_parser.cpp b/backend/src/llvm/llvm_printf_parser.cpp
index 9632011..7800c01 100644
--- a/backend/src/llvm/llvm_printf_parser.cpp
+++ b/backend/src/llvm/llvm_printf_parser.cpp
@@ -552,7 +552,6 @@ error:
     return true;
   }
 
-
   bool PrintfParser::runOnFunction(llvm::Function &F)
   {
     bool changed = false;
@@ -583,6 +582,15 @@ error:
 
     builder = new IRBuilder<>(module->getContext());
 
+    llvm::GlobalValue* gFun = module->getNamedValue("printf");
+    if(gFun) {
+      gFun->setName("__gen_ocl_printf_stub");
+    }
+    llvm::GlobalValue* gFun2 = module->getNamedValue("puts");
+    if(gFun2 ) {
+      gFun2->setName("__gen_ocl_puts_stub");
+    }
+
     /* First find printfs and caculate all slots size of one loop. */
     for (llvm::Function::iterator B = F.begin(), BE = F.end(); B != BE; B++) {
       for (BasicBlock::iterator instI = B->begin(),
@@ -602,7 +610,7 @@ error:
         Value *Callee = call->getCalledValue();
         const std::string fnName = Callee->getName();
 
-        if (fnName != "__gen_ocl_printf_stub")
+        if (fnName != "__gen_ocl_printf_stub" && fnName != "__gen_ocl_puts_stub")
           continue;
 
         if (!parseOnePrintfInstruction(call, pInfo, sizeof_size)) {
-- 
1.9.1


From xionghu.luo at intel.com  Sun Mar  8 20:24:28 2015
From: xionghu.luo at intel.com (xionghu.luo at intel.com)
Date: Mon,  9 Mar 2015 11:24:28 +0800
Subject: [Beignet] [PATCH 4/4] add utest for load spir binary.
In-Reply-To: <1425871468-11096-1-git-send-email-xionghu.luo@intel.com>
References: <1425871468-11096-1-git-send-email-xionghu.luo@intel.com>
Message-ID: <1425871468-11096-4-git-send-email-xionghu.luo@intel.com>

From: Luo Xionghu <xionghu.luo at intel.com>

To generate SPIR binary, please refer to the page
https://github.com/KhronosGroup/SPIR.
For llvm3.2, the command is "clang -cc1 -emit-llvm-bc -triple
spir-unknown-unknown -cl-std=CL1.2 -include opencl_spir.h
compiler_ceil.cl -o compiler_ceil32.spir"
For llvm3.5, the option -cl-kernel-arg-info is required,
and option -fno-builtin is required to avoid warning.

v2: add missing load_program_from_spir.cpp file.

Signed-off-by: Luo Xionghu <xionghu.luo at intel.com>
---
 kernels/compiler_ceil32.spir      | Bin 0 -> 1732 bytes
 utests/CMakeLists.txt             |   1 +
 utests/load_program_from_spir.cpp |  90 ++++++++++++++++++++++++++++++++++++++
 3 files changed, 91 insertions(+)
 create mode 100644 kernels/compiler_ceil32.spir
 create mode 100644 utests/load_program_from_spir.cpp

diff --git a/kernels/compiler_ceil32.spir b/kernels/compiler_ceil32.spir
new file mode 100644
index 0000000000000000000000000000000000000000..ee6483476568e76f3765622cf8fd8a84a81e5d2a
GIT binary patch
literal 1732
zcmY*Ze at qis9Dmo=dzF at VhjdcL^}GWl{DJdCN2oy7a_Dd|OF{o|Kav$Q4cW$a6f6{z
z?Mkc199ya(3rm)zm*_$kvoJ#zvY6NcGX#k?E-{W6(*~$N=Aa6iZP~Byq0{V=d*65O
zec$Ik?|na?*C;1$igtv)r4X|93&?~J+5j&Om!0%ZTlUawwfhtmwQpW$E#@ts98}p|
zrp8mY#wc6usI at xx<W)D7H|NE~vZ+?pUMvR6 at wBzz;7EMLb1?^ysKtrU5X6!}0d0)W
zd90?<_oPoV|G<MirPp;X+s4l!enIChimt5A%_X?B?wSa5sLP6j!&(4YdwR7R35Tp&
zuu`j0wVr0&6W0PUt)>zOo3$XC6}jte{0t at B$hhVN*Nr5X)$?a5el89;e2S7Z#jk}b
zv8rf2?dBdU?in@!d)p&HEAI2)1UC^jeyt{5H}!mA0-ski+zmZH8|Ek4xNDT~q@(Aj
zz5LBEKhwtD&bSuz{3)HEO>#3{e$LNd^=_J>5 at vP8z*i(K&~ENhwVHNKiR12PB<})M
zK+d>k!#r&Edd496bh9;a`o3{CW`S}f$)d!pr0AT$EK1~##4M)hEMk6T$pRXdYey&R
zhs>Vw$-1$yds?m?lig!f{dsxkxVd2{Qa|Q(kIFm8%zlzdkQE<68M2(B at 1#jqVmkaJ
zm!bh at RVAuBYOb4=-6&djUM?AzYsV++0Wcw#jLEer^G=jy$vyZrvM=RG2QY!p7`c at u
z9TKz3lKD}Bwms=|Am%}itoq2jhglYw4uL%MldO+CNYN`X@&|C>QBwopM<ebbxn$T3
zB<rH)hWIm*Eeucv<Yhluh0P|&GLU?_+0kU>-*7e`p>ox<2XtiyT@@bpI4dku(Kz%!
z77$_}R+n1W3vVB at 6qcTjLNg!LV~!1b2L^4?=w4PTL)%fr>h;mA{zVyzl<q(IoY=1j
z{?CvD))tP0RIMAt1!X*dL3&4(|Ml~K0D_3=lW-u8y9C at 3BS8=D^W*Lm{fNcEIPUc0
z4nGrsLCAr at r<s1HTm3IHww5X2-nCUB0rv^`$m4}RV=4eGO!;spOs%zm*@>_L%VI^q
z-O_0hAuBM3=lBlHt~nQ>M67}`d5l4XrcD&0Ay)Il_TqxQvBC8fFJH9C-)xAxC}TbZ
zXZS8e*=h2b4*BexTZYBsr4D&p)yOrSTS&TQ!mbZRZXv@#I~|5jQI;|sG1E4nv#;n5
z+u0+PIHYI+zxI|DhiakKI-OczSPT7jtarrl-8#@!q;6=EmEDde at 5cIHKo4oMm>?He
z@<1UMJk0NiSsYDBIM{1A<wfVg$`j`*FPYwPfvs-nV348-FMl~xbX(k%)`1Cs*YG<f
zP5GYxvE{3ES9XHh$7T2E<ko1U{>#a&pkeSrl{4$*FSaE;WW~eCmd&px==lW6q{%G=
z$4VYZ%$*!rLUdP}K9Z&tM7MjGj6zm&^f_PNSzn$m+0Nu_T|TVj=xmHE8M3Ek`=Qpa
zj{(D>)-6;06Hm;0+<D^6o3`)LAuXtA-Ecv{?WEv76zTM6N=y^``?m}n7F}}`57%A7
z at 3`rAT=VBA)QwG|qp>ZzLUKN`B$0b*`sv}UNSo2_%6kd&C`I0nkp)QbF#AkUZlh*J
zP_yk;m7B_NhaFVIfLt4$9Bfo^s26t)ciJD{5#a6Z_C|gp>AI|QhoyCEX_1mybHlXU
zFlcU=?H%k^@KGpRv!(!D#cUKnz<}K;Ae0BOLj8053+#030d at 7t!w&YR52{{8RWDVS
z(<v)i5o`s2nZi|&hS~&#To4N_-YOQNA+G&p2$j)OESKyx75Jpx_5rcE|Gp*S9JEHM
zgd*4==W`XcSp3t<Ye`{Of3ZEX<L5>ZU25Ir@$T3cEtXA91*$lCsKf+=CTiPbzrR6?
PvG47>gf#&JhVlFhihNjp

literal 0
HcmV?d00001

diff --git a/utests/CMakeLists.txt b/utests/CMakeLists.txt
index eaba27d..06baa68 100644
--- a/utests/CMakeLists.txt
+++ b/utests/CMakeLists.txt
@@ -192,6 +192,7 @@ set (utests_sources
   compiler_time_stamp.cpp
   compiler_double_precision.cpp
   load_program_from_gen_bin.cpp
+  load_program_from_spir.cpp
   get_arg_info.cpp
   profiling_exec.cpp
   enqueue_copy_buf.cpp
diff --git a/utests/load_program_from_spir.cpp b/utests/load_program_from_spir.cpp
new file mode 100644
index 0000000..3e4534c
--- /dev/null
+++ b/utests/load_program_from_spir.cpp
@@ -0,0 +1,90 @@
+#include "utest_helper.hpp"
+#include "utest_file_map.hpp"
+#include <cstring>
+#include <cmath>
+#include <algorithm>
+
+using namespace std;
+
+static void cpu(int global_id, float *src, float *dst) {
+    dst[global_id] = ceilf(src[global_id]);
+}
+
+static void test_load_program_from_spir(void)
+{
+    size_t param_value_size;
+    std::string extensionStr;
+    OCL_CALL (clGetPlatformInfo, platform, CL_PLATFORM_EXTENSIONS, 0, 0, &param_value_size);
+    std::vector<char> param_value(param_value_size);
+    OCL_CALL (clGetDeviceInfo, device, CL_DEVICE_EXTENSIONS, param_value_size, param_value.empty() ? NULL : &param_value.front(), &param_value_size);
+    if (!param_value.empty())
+      extensionStr = std::string(&param_value.front(), param_value_size-1);
+
+    if (!std::strstr(extensionStr.c_str(), "cl_khr_spir")) {
+      OCL_ASSERT(0);
+    }
+
+    const size_t n = 16;
+    float cpu_dst[16], cpu_src[16];
+    cl_int status;
+    cl_int binary_status;
+    char *ker_path = NULL;
+
+    cl_file_map_t *fm = cl_file_map_new();
+    ker_path = cl_do_kiss_path("compiler_ceil32.spir", device);
+    OCL_ASSERT (cl_file_map_open(fm, ker_path) == CL_FILE_MAP_SUCCESS);
+
+    const unsigned char *src = (const unsigned char *)cl_file_map_begin(fm);
+    const size_t sz = cl_file_map_size(fm);
+
+    program = clCreateProgramWithBinary(ctx, 1,
+              &device, &sz, &src, &binary_status, &status);
+
+    OCL_ASSERT(program && status == CL_SUCCESS);
+
+    /* OCL requires to build the program even if it is created from a binary */
+    OCL_ASSERT(clBuildProgram(program, 1, &device, "-x spir", NULL, NULL) == CL_SUCCESS);
+
+    kernel = clCreateKernel(program, "compiler_ceil", &status);
+    OCL_ASSERT(status == CL_SUCCESS);
+
+    OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(float), NULL);
+    OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(float), NULL);
+    OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+    OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+    globals[0] = 16;
+    locals[0] = 16;
+
+    // Run random tests
+    for (uint32_t pass = 0; pass < 8; ++pass) {
+        OCL_MAP_BUFFER(0);
+        for (int32_t i = 0; i < (int32_t) n; ++i)
+            cpu_src[i] = ((float*)buf_data[0])[i] = .1f * (rand() & 15) - .75f;
+        OCL_UNMAP_BUFFER(0);
+
+        // Run the kernel on GPU
+        OCL_NDRANGE(1);
+
+        // Run on CPU
+        for (int32_t i = 0; i < (int32_t) n; ++i) cpu(i, cpu_src, cpu_dst);
+
+        // Compare
+        OCL_MAP_BUFFER(1);
+
+#if 0
+        printf("#### GPU:\n");
+        for (int32_t i = 0; i < (int32_t) n; ++i)
+            printf(" %f", ((float *)buf_data[1])[i]);
+        printf("\n#### CPU:\n");
+        for (int32_t i = 0; i < (int32_t) n; ++i)
+            printf(" %f", cpu_dst[i]);
+        printf("\n");
+#endif
+
+        for (int32_t i = 0; i < (int32_t) n; ++i)
+            OCL_ASSERT(((float *)buf_data[1])[i] == cpu_dst[i]);
+        OCL_UNMAP_BUFFER(1);
+    }
+}
+
+MAKE_UTEST_FROM_FUNCTION(test_load_program_from_spir);
-- 
1.9.1


From Junyan.he at inbox.com  Sun Mar  8 22:55:06 2015
From: Junyan.he at inbox.com (He Junyan)
Date: Mon, 09 Mar 2015 13:55:06 +0800
Subject: [Beignet] [V2 PATCH 5/7] Backend: Handle the bswap using
 indirect mode access.
In-Reply-To: <20150309011138.GB20578@ivb-gt2-rev4>
References: <1425626640-25840-1-git-send-email-junyan.he@inbox.com>
 <20150309011138.GB20578@ivb-gt2-rev4>
Message-ID: <54FD35BA.3070108@inbox.com>


On 2015年03月09日 09:11, Zhigang Gong wrote:
> On Fri, Mar 06, 2015 at 03:24:00PM +0800, junyan.he at inbox.com wrote:
>> From: Junyan He <junyan.he at linux.intel.com>
>>
>> The swap for short will be like:
>> mov(1)   a0<1>:UD        0xe600e61UD            { align1 WE_all };
>> mov(1)   a0.1<1>:UD      0xe620e63UD            { align1 WE_all };
>> mov(1)   a0.2<1>:UD      0xe640e65UD            { align1 WE_all };
>> mov(1)   a0.3<1>:UD      0xe660e67UD            { align1 WE_all };
>> mov(8)   g114<1>:UB      g[a0]<VxH,1,0>:UB      { align1 WE_all 1Q };
>> mov(8)   g114.8<1>:UB    g[a0 8]<VxH,1,0>:UB    { align1 WE_all 1Q };
>> mov(8)   g114.16<1>:UB   g[a0 16]<VxH,1,0>:UB   { align1 WE_all 1Q };
>> mov(8)   g114.24<1>:UB   g[a0 24]<VxH,1,0>:UB   { align1 WE_all 1Q };
>> mov(16)  g113<1>:UW      g114<8,8,1>:UW         { align1 WE_normal 1H };
>>
>> Signed-off-by: Junyan He <junyan.he at linux.intel.com>
>> ---
>>   backend/src/backend/gen_context.cpp        |  112 ++++++++++++++++++++++++++++
>>   backend/src/backend/gen_insn_selection.cpp |    9 +++
>>   backend/src/backend/gen_insn_selection.hxx |    1 +
>>   3 files changed, 122 insertions(+)
>>
>> diff --git a/backend/src/backend/gen_context.cpp b/backend/src/backend/gen_context.cpp
>> index 6856510..46b4a06 100644
>> --- a/backend/src/backend/gen_context.cpp
>> +++ b/backend/src/backend/gen_context.cpp
>> @@ -297,6 +297,118 @@ namespace gbe
>>             p->MOV(dst.top_half(this->simdWidth), GenRegister::immud(0));
>>           break;
>>         }
>> +      case SEL_OP_BSWAP: {
>> +        uint32_t simd = p->curr.execWidth;
>> +        GBE_ASSERT(simd == 8 || simd == 16 || simd == 1);
>> +        uint16_t new_a0[16];
>> +        memset(new_a0, 0, sizeof(new_a0));
>> +
>> +        GBE_ASSERT(src.type == dst.type);
>> +        uint32_t start_addr = src.nr*32 + src.subnr;
>> +
>> +        if (simd == 1) {
>> +          GBE_ASSERT(src.hstride == GEN_HORIZONTAL_STRIDE_0
>> +              && dst.hstride == GEN_HORIZONTAL_STRIDE_0);
>> +          if (src.type == GEN_TYPE_UD || src.type == GEN_TYPE_D) {
>> +            GBE_ASSERT(start_addr >= 0);
>> +            new_a0[0] = start_addr + 3;
>> +            new_a0[1] = start_addr + 2;
>> +            new_a0[2] = start_addr + 1;
>> +            new_a0[3] = start_addr;
>> +            this->setA0Content(new_a0, 0, 4);
>> +
>> +            p->push();
>> +            p->curr.execWidth = 4;
>> +            p->curr.predicate = GEN_PREDICATE_NONE;
>> +            p->curr.noMask = 1;
>> +            GenRegister ind_src = GenRegister::to_indirect1xN(GenRegister::retype(src, GEN_TYPE_UB),
>> +                a0[0], new_a0[0] - a0[0]);
>> +            GenRegister dst_ = dst;
>> +            dst_.type = GEN_TYPE_UB;
>> +            dst_.hstride = GEN_HORIZONTAL_STRIDE_1;
>> +            dst_.width = GEN_WIDTH_4;
>> +            dst_.vstride = GEN_VERTICAL_STRIDE_4;
>> +            p->MOV(dst_, ind_src);
>> +            p->pop();
>> +          } else if (src.type == GEN_TYPE_UW || src.type == GEN_TYPE_W) {
>> +            p->MOV(GenRegister::retype(dst, GEN_TYPE_UB),
>> +                GenRegister::retype(GenRegister::offset(src, 0, 1), GEN_TYPE_UB));
>> +            p->MOV(GenRegister::retype(GenRegister::offset(dst, 0, 1), GEN_TYPE_UB),
>> +                GenRegister::retype(src, GEN_TYPE_UB));
>> +          } else {
>> +            GBE_ASSERT(0);
>> +          }
>> +        } else {
>> +          if (src.type == GEN_TYPE_UD || src.type == GEN_TYPE_D) {
>> +            GBE_ASSERT(src.subnr == 0);
> The above assertion is not correct. Because a valid simd8 or simd16 BSWAP instruction may have a
> uniform source register. We can't assume the source register must not be uniform value.
I think the uniform case will be handled in the `if (simd == 1)` branch just
above.
I find that if src is uniform, the dst always seems to be uniform as well, and
the simd will be 1 here.
>> +            GBE_ASSERT(dst.subnr == 0);
>> +            GBE_ASSERT(tmp.subnr == 0);
>> +            GBE_ASSERT(start_addr >= 0);
>> +            new_a0[0] = start_addr + 3;
>> +            new_a0[1] = start_addr + 2;
>> +            new_a0[2] = start_addr + 1;
>> +            new_a0[3] = start_addr;
>> +            new_a0[4] = start_addr + 7;
>> +            new_a0[5] = start_addr + 6;
>> +            new_a0[6] = start_addr + 5;
>> +            new_a0[7] = start_addr + 4;
>> +            this->setA0Content(new_a0, 56);
>> +
>> +            p->push();
>> +            p->curr.execWidth = 8;
>> +            p->curr.predicate = GEN_PREDICATE_NONE;
>> +            p->curr.noMask = 1;
>> +            GenRegister ind_src = GenRegister::to_indirect1xN(GenRegister::retype(src, GEN_TYPE_UB),
>> +                a0[0], new_a0[0] - a0[0]);
>> +            p->MOV(GenRegister::retype(tmp, GEN_TYPE_UB), ind_src);
>> +            for (int i = 1; i < 4; i++) {
>> +              ind_src.addr_imm += 8;
>> +              p->MOV(GenRegister::offset(GenRegister::retype(tmp, GEN_TYPE_UB), 0, 8*i), ind_src);
>> +            }
>> +            if (simd == 16) {
>> +              for (int i = 0; i < 4; i++) {
>> +                ind_src.addr_imm += 8;
>> +                p->MOV(GenRegister::offset(GenRegister::retype(tmp, GEN_TYPE_UB), 1, 8*i), ind_src);
>> +              }
>> +            }
>> +            p->pop();
>> +
>> +            p->MOV(dst, tmp);
>> +          } else if (src.type == GEN_TYPE_UW || src.type == GEN_TYPE_W) {
>> +            GBE_ASSERT(src.subnr == 0 || src.subnr == 16);
>> +            GBE_ASSERT(dst.subnr == 0 || dst.subnr == 16);
>> +            GBE_ASSERT(tmp.subnr == 0 || tmp.subnr == 16);
>> +            GBE_ASSERT(start_addr >= 0);
>> +            new_a0[0] = start_addr + 1;
>> +            new_a0[1] = start_addr;
>> +            new_a0[2] = start_addr + 3;
>> +            new_a0[3] = start_addr + 2;
>> +            new_a0[4] = start_addr + 5;
>> +            new_a0[5] = start_addr + 4;
>> +            new_a0[6] = start_addr + 7;
>> +            new_a0[7] = start_addr + 6;
>> +            this->setA0Content(new_a0, 56);
>> +
>> +            p->push();
>> +            p->curr.execWidth = 8;
>> +            p->curr.predicate = GEN_PREDICATE_NONE;
>> +            p->curr.noMask = 1;
>> +            GenRegister ind_src = GenRegister::to_indirect1xN(GenRegister::retype(src, GEN_TYPE_UB),
>> +                a0[0], new_a0[0] - a0[0]);
>> +            p->MOV(GenRegister::retype(tmp, GEN_TYPE_UB), ind_src);
>> +            for (int i = 1; i < (simd == 8 ? 2 : 4); i++) {
>> +              ind_src.addr_imm += 8;
>> +              p->MOV(GenRegister::offset(GenRegister::retype(tmp, GEN_TYPE_UB), 0, 8*i), ind_src);
>> +            }
>> +            p->pop();
>> +
>> +            p->MOV(dst, tmp);
>> +          } else {
>> +            GBE_ASSERT(0);
>> +          }
>> +        }
>> +      }
>> +      break;
>>         default:
>>           NOT_IMPLEMENTED;
>>       }
>> diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp
>> index d100f80..2b166b1 100644
>> --- a/backend/src/backend/gen_insn_selection.cpp
>> +++ b/backend/src/backend/gen_insn_selection.cpp
>> @@ -498,6 +498,7 @@ namespace gbe
>>       ALU1(RNDE)
>>       ALU1(F16TO32)
>>       ALU1(F32TO16)
>> +    ALU1WithTemp(BSWAP)
>>       ALU2(SEL)
>>       ALU2(SEL_INT64)
>>       ALU1(NOT)
>> @@ -2121,6 +2122,14 @@ namespace gbe
>>             case ir::OP_SQR: sel.MATH(dst, GEN_MATH_FUNCTION_SQRT, src); break;
>>             case ir::OP_RSQ: sel.MATH(dst, GEN_MATH_FUNCTION_RSQ, src); break;
>>             case ir::OP_RCP: sel.MATH(dst, GEN_MATH_FUNCTION_INV, src); break;
>> +          case ir::OP_BSWAP:
>> +            {
>> +              ir::Register tmp = sel.reg(getFamily(insnType));
>> +              const GenRegister src_ = GenRegister::retype(src, getGenType(insnType));
>> +              const GenRegister dst_ = GenRegister::retype(dst, getGenType(insnType));
>> +              sel.BSWAP(dst_, src_, sel.selReg(tmp, insnType));
>> +              break;
>> +            }
>>             case ir::OP_SIMD_ANY:
>>               {
>>                 const GenRegister constZero = GenRegister::immuw(0);;
>> diff --git a/backend/src/backend/gen_insn_selection.hxx b/backend/src/backend/gen_insn_selection.hxx
>> index be1f7ec..09f5aaf 100644
>> --- a/backend/src/backend/gen_insn_selection.hxx
>> +++ b/backend/src/backend/gen_insn_selection.hxx
>> @@ -1,5 +1,6 @@
>>   DECL_SELECTION_IR(LABEL, LabelInstruction)
>>   DECL_SELECTION_IR(MOV, UnaryInstruction)
>> +DECL_SELECTION_IR(BSWAP, UnaryWithTempInstruction)
>>   DECL_SELECTION_IR(MOV_DF, UnaryWithTempInstruction)
>>   DECL_SELECTION_IR(LOAD_DF_IMM, UnaryWithTempInstruction)
>>   DECL_SELECTION_IR(LOAD_INT64_IMM, UnaryInstruction)
>> -- 
>> 1.7.9.5
>>
>> _______________________________________________
>> Beignet mailing list
>> Beignet at lists.freedesktop.org
>> http://lists.freedesktop.org/mailman/listinfo/beignet
> _______________________________________________
> Beignet mailing list
> Beignet at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/beignet


From zhigang.gong at linux.intel.com  Sun Mar  8 21:57:41 2015
From: zhigang.gong at linux.intel.com (Zhigang Gong)
Date: Mon, 9 Mar 2015 12:57:41 +0800
Subject: [Beignet] [V2 PATCH 5/7] Backend: Handle the bswap using
 indirect mode access.
In-Reply-To: <54FD35BA.3070108@inbox.com>
References: <1425626640-25840-1-git-send-email-junyan.he@inbox.com>
 <20150309011138.GB20578@ivb-gt2-rev4> <54FD35BA.3070108@inbox.com>
Message-ID: <20150309045740.GC20578@ivb-gt2-rev4>

On Mon, Mar 09, 2015 at 01:55:06PM +0800, He Junyan wrote:
> 
> On 2015年03月09日 09:11, Zhigang Gong wrote:
> >On Fri, Mar 06, 2015 at 03:24:00PM +0800, junyan.he at inbox.com wrote:
> >>From: Junyan He <junyan.he at linux.intel.com>
> >>
> >>The swap for short will be like:
> >>mov(1)   a0<1>:UD        0xe600e61UD            { align1 WE_all };
> >>mov(1)   a0.1<1>:UD      0xe620e63UD            { align1 WE_all };
> >>mov(1)   a0.2<1>:UD      0xe640e65UD            { align1 WE_all };
> >>mov(1)   a0.3<1>:UD      0xe660e67UD            { align1 WE_all };
> >>mov(8)   g114<1>:UB      g[a0]<VxH,1,0>:UB      { align1 WE_all 1Q };
> >>mov(8)   g114.8<1>:UB    g[a0 8]<VxH,1,0>:UB    { align1 WE_all 1Q };
> >>mov(8)   g114.16<1>:UB   g[a0 16]<VxH,1,0>:UB   { align1 WE_all 1Q };
> >>mov(8)   g114.24<1>:UB   g[a0 24]<VxH,1,0>:UB   { align1 WE_all 1Q };
> >>mov(16)  g113<1>:UW      g114<8,8,1>:UW         { align1 WE_normal 1H };
> >>
> >>Signed-off-by: Junyan He <junyan.he at linux.intel.com>
> >>---
> >>  backend/src/backend/gen_context.cpp        |  112 ++++++++++++++++++++++++++++
> >>  backend/src/backend/gen_insn_selection.cpp |    9 +++
> >>  backend/src/backend/gen_insn_selection.hxx |    1 +
> >>  3 files changed, 122 insertions(+)
> >>
> >>diff --git a/backend/src/backend/gen_context.cpp b/backend/src/backend/gen_context.cpp
> >>index 6856510..46b4a06 100644
> >>--- a/backend/src/backend/gen_context.cpp
> >>+++ b/backend/src/backend/gen_context.cpp
> >>@@ -297,6 +297,118 @@ namespace gbe
> >>            p->MOV(dst.top_half(this->simdWidth), GenRegister::immud(0));
> >>          break;
> >>        }
> >>+      case SEL_OP_BSWAP: {
> >>+        uint32_t simd = p->curr.execWidth;
> >>+        GBE_ASSERT(simd == 8 || simd == 16 || simd == 1);
> >>+        uint16_t new_a0[16];
> >>+        memset(new_a0, 0, sizeof(new_a0));
> >>+
> >>+        GBE_ASSERT(src.type == dst.type);
> >>+        uint32_t start_addr = src.nr*32 + src.subnr;
> >>+
> >>+        if (simd == 1) {
> >>+          GBE_ASSERT(src.hstride == GEN_HORIZONTAL_STRIDE_0
> >>+              && dst.hstride == GEN_HORIZONTAL_STRIDE_0);
> >>+          if (src.type == GEN_TYPE_UD || src.type == GEN_TYPE_D) {
> >>+            GBE_ASSERT(start_addr >= 0);
> >>+            new_a0[0] = start_addr + 3;
> >>+            new_a0[1] = start_addr + 2;
> >>+            new_a0[2] = start_addr + 1;
> >>+            new_a0[3] = start_addr;
> >>+            this->setA0Content(new_a0, 0, 4);
> >>+
> >>+            p->push();
> >>+            p->curr.execWidth = 4;
> >>+            p->curr.predicate = GEN_PREDICATE_NONE;
> >>+            p->curr.noMask = 1;
> >>+            GenRegister ind_src = GenRegister::to_indirect1xN(GenRegister::retype(src, GEN_TYPE_UB),
> >>+                a0[0], new_a0[0] - a0[0]);
> >>+            GenRegister dst_ = dst;
> >>+            dst_.type = GEN_TYPE_UB;
> >>+            dst_.hstride = GEN_HORIZONTAL_STRIDE_1;
> >>+            dst_.width = GEN_WIDTH_4;
> >>+            dst_.vstride = GEN_VERTICAL_STRIDE_4;
> >>+            p->MOV(dst_, ind_src);
> >>+            p->pop();
> >>+          } else if (src.type == GEN_TYPE_UW || src.type == GEN_TYPE_W) {
> >>+            p->MOV(GenRegister::retype(dst, GEN_TYPE_UB),
> >>+                GenRegister::retype(GenRegister::offset(src, 0, 1), GEN_TYPE_UB));
> >>+            p->MOV(GenRegister::retype(GenRegister::offset(dst, 0, 1), GEN_TYPE_UB),
> >>+                GenRegister::retype(src, GEN_TYPE_UB));
> >>+          } else {
> >>+            GBE_ASSERT(0);
> >>+          }
> >>+        } else {
> >>+          if (src.type == GEN_TYPE_UD || src.type == GEN_TYPE_D) {
> >>+            GBE_ASSERT(src.subnr == 0);
> >The above assertion is not correct. Because a valid simd8 or simd16 BSWAP instruction may have a
> >uniform source register. We can't assume the source register must not be uniform value.
> I think the uniform case will be handled in  if (simd == 1)  case
> just above.
> I find if src is uniform, the dst seems always to be uniform and the
> simd will be 1 here.
This is not true. If the src is uniform but the dst is defined in multiple places due
to a phi instruction, the dst will not be identified as a uniform value.

> >>+            GBE_ASSERT(dst.subnr == 0);
> >>+            GBE_ASSERT(tmp.subnr == 0);
> >>+            GBE_ASSERT(start_addr >= 0);
> >>+            new_a0[0] = start_addr + 3;
> >>+            new_a0[1] = start_addr + 2;
> >>+            new_a0[2] = start_addr + 1;
> >>+            new_a0[3] = start_addr;
> >>+            new_a0[4] = start_addr + 7;
> >>+            new_a0[5] = start_addr + 6;
> >>+            new_a0[6] = start_addr + 5;
> >>+            new_a0[7] = start_addr + 4;
> >>+            this->setA0Content(new_a0, 56);
> >>+
> >>+            p->push();
> >>+            p->curr.execWidth = 8;
> >>+            p->curr.predicate = GEN_PREDICATE_NONE;
> >>+            p->curr.noMask = 1;
> >>+            GenRegister ind_src = GenRegister::to_indirect1xN(GenRegister::retype(src, GEN_TYPE_UB),
> >>+                a0[0], new_a0[0] - a0[0]);
> >>+            p->MOV(GenRegister::retype(tmp, GEN_TYPE_UB), ind_src);
> >>+            for (int i = 1; i < 4; i++) {
> >>+              ind_src.addr_imm += 8;
> >>+              p->MOV(GenRegister::offset(GenRegister::retype(tmp, GEN_TYPE_UB), 0, 8*i), ind_src);
> >>+            }
> >>+            if (simd == 16) {
> >>+              for (int i = 0; i < 4; i++) {
> >>+                ind_src.addr_imm += 8;
> >>+                p->MOV(GenRegister::offset(GenRegister::retype(tmp, GEN_TYPE_UB), 1, 8*i), ind_src);
> >>+              }
> >>+            }
> >>+            p->pop();
> >>+
> >>+            p->MOV(dst, tmp);
> >>+          } else if (src.type == GEN_TYPE_UW || src.type == GEN_TYPE_W) {
> >>+            GBE_ASSERT(src.subnr == 0 || src.subnr == 16);
> >>+            GBE_ASSERT(dst.subnr == 0 || dst.subnr == 16);
> >>+            GBE_ASSERT(tmp.subnr == 0 || tmp.subnr == 16);
> >>+            GBE_ASSERT(start_addr >= 0);
> >>+            new_a0[0] = start_addr + 1;
> >>+            new_a0[1] = start_addr;
> >>+            new_a0[2] = start_addr + 3;
> >>+            new_a0[3] = start_addr + 2;
> >>+            new_a0[4] = start_addr + 5;
> >>+            new_a0[5] = start_addr + 4;
> >>+            new_a0[6] = start_addr + 7;
> >>+            new_a0[7] = start_addr + 6;
> >>+            this->setA0Content(new_a0, 56);
> >>+
> >>+            p->push();
> >>+            p->curr.execWidth = 8;
> >>+            p->curr.predicate = GEN_PREDICATE_NONE;
> >>+            p->curr.noMask = 1;
> >>+            GenRegister ind_src = GenRegister::to_indirect1xN(GenRegister::retype(src, GEN_TYPE_UB),
> >>+                a0[0], new_a0[0] - a0[0]);
> >>+            p->MOV(GenRegister::retype(tmp, GEN_TYPE_UB), ind_src);
> >>+            for (int i = 1; i < (simd == 8 ? 2 : 4); i++) {
> >>+              ind_src.addr_imm += 8;
> >>+              p->MOV(GenRegister::offset(GenRegister::retype(tmp, GEN_TYPE_UB), 0, 8*i), ind_src);
> >>+            }
> >>+            p->pop();
> >>+
> >>+            p->MOV(dst, tmp);
> >>+          } else {
> >>+            GBE_ASSERT(0);
> >>+          }
> >>+        }
> >>+      }
> >>+      break;
> >>        default:
> >>          NOT_IMPLEMENTED;
> >>      }
> >>diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp
> >>index d100f80..2b166b1 100644
> >>--- a/backend/src/backend/gen_insn_selection.cpp
> >>+++ b/backend/src/backend/gen_insn_selection.cpp
> >>@@ -498,6 +498,7 @@ namespace gbe
> >>      ALU1(RNDE)
> >>      ALU1(F16TO32)
> >>      ALU1(F32TO16)
> >>+    ALU1WithTemp(BSWAP)
> >>      ALU2(SEL)
> >>      ALU2(SEL_INT64)
> >>      ALU1(NOT)
> >>@@ -2121,6 +2122,14 @@ namespace gbe
> >>            case ir::OP_SQR: sel.MATH(dst, GEN_MATH_FUNCTION_SQRT, src); break;
> >>            case ir::OP_RSQ: sel.MATH(dst, GEN_MATH_FUNCTION_RSQ, src); break;
> >>            case ir::OP_RCP: sel.MATH(dst, GEN_MATH_FUNCTION_INV, src); break;
> >>+          case ir::OP_BSWAP:
> >>+            {
> >>+              ir::Register tmp = sel.reg(getFamily(insnType));
> >>+              const GenRegister src_ = GenRegister::retype(src, getGenType(insnType));
> >>+              const GenRegister dst_ = GenRegister::retype(dst, getGenType(insnType));
> >>+              sel.BSWAP(dst_, src_, sel.selReg(tmp, insnType));
> >>+              break;
> >>+            }
> >>            case ir::OP_SIMD_ANY:
> >>              {
> >>                const GenRegister constZero = GenRegister::immuw(0);;
> >>diff --git a/backend/src/backend/gen_insn_selection.hxx b/backend/src/backend/gen_insn_selection.hxx
> >>index be1f7ec..09f5aaf 100644
> >>--- a/backend/src/backend/gen_insn_selection.hxx
> >>+++ b/backend/src/backend/gen_insn_selection.hxx
> >>@@ -1,5 +1,6 @@
> >>  DECL_SELECTION_IR(LABEL, LabelInstruction)
> >>  DECL_SELECTION_IR(MOV, UnaryInstruction)
> >>+DECL_SELECTION_IR(BSWAP, UnaryWithTempInstruction)
> >>  DECL_SELECTION_IR(MOV_DF, UnaryWithTempInstruction)
> >>  DECL_SELECTION_IR(LOAD_DF_IMM, UnaryWithTempInstruction)
> >>  DECL_SELECTION_IR(LOAD_INT64_IMM, UnaryInstruction)
> >>-- 
> >>1.7.9.5
> >>
> >>_______________________________________________
> >>Beignet mailing list
> >>Beignet at lists.freedesktop.org
> >>http://lists.freedesktop.org/mailman/listinfo/beignet
> >_______________________________________________
> >Beignet mailing list
> >Beignet at lists.freedesktop.org
> >http://lists.freedesktop.org/mailman/listinfo/beignet
> 
> 
> 

From junyan.he at inbox.com  Mon Mar  9 01:10:44 2015
From: junyan.he at inbox.com (junyan.he at inbox.com)
Date: Mon,  9 Mar 2015 16:10:44 +0800
Subject: [Beignet] [PATCH 1/9 V3] Backend: Add the indirect fields and
	functions for gen register.
Message-ID: <1425888644-1066-1-git-send-email-junyan.he@inbox.com>

From: Junyan He <junyan.he at linux.intel.com>

Add a0_subnr and addr_imm to GenRegister in order to
represent an indirect register, which is addressed as an
immediate offset from the base address held in an a0.x subregister.
Also add the to_indirect1xN helper function to convert a register
to an indirect 1xN register.

V3:
   1. Add Gen8 encoder setting.
   2. Reorder the patches.
   3. Add logic for gen8 context, using 16 a0 sub-registers.
   4. Fix some bugs of uniform src.

Signed-off-by: Junyan He <junyan.he at linux.intel.com>
---
 backend/src/backend/gen_register.hpp |   30 +++++++++++++++++++++++++++++-
 1 file changed, 29 insertions(+), 1 deletion(-)

diff --git a/backend/src/backend/gen_register.hpp b/backend/src/backend/gen_register.hpp
index 08c7277..3b40b67 100644
--- a/backend/src/backend/gen_register.hpp
+++ b/backend/src/backend/gen_register.hpp
@@ -205,6 +205,8 @@ namespace gbe
       this->quarter = 0;
       this->nr = this->subnr = 0;
       this->address_mode = GEN_ADDRESS_DIRECT;
+      this->a0_subnr = 0;
+      this->addr_imm = 0;
     }
 
     /*! For specific physical registers only */
@@ -229,6 +231,8 @@ namespace gbe
       this->hstride = hstride;
       this->quarter = 0;
       this->address_mode = GEN_ADDRESS_DIRECT;
+      this->a0_subnr = 0;
+      this->addr_imm = 0;
     }
 
     /*! Return the IR virtual register */
@@ -258,6 +262,8 @@ namespace gbe
     uint32_t hstride:2;      //!< Horizontal stride
     uint32_t quarter:1;      //!< To choose which part we want (Q1 / Q2)
     uint32_t address_mode:1; //!< direct or indirect
+    uint32_t a0_subnr:4;     //!< In indirect mode, use a0.nr as the base.
+    int32_t addr_imm:10;     //!< In indirect mode, the imm as address offset from a0.
 
     static INLINE GenRegister offset(GenRegister reg, int nr, int subnr = 0) {
       GenRegister r = reg;
@@ -835,6 +841,28 @@ namespace gbe
       return reg;
     }
 
+    /*! convert one register to indirectly mode */
+    static INLINE GenRegister to_indirect1xN(GenRegister reg, uint32_t base_addr,
+                                          int32_t imm_off = 4096, int a0_subnr = 0) {
+      GenRegister r = reg;
+      int32_t offset;
+      if (imm_off > 4095) {
+        offset = (r.nr*32 + r.subnr) - base_addr;
+      } else {
+        offset = imm_off;
+      }
+
+      GBE_ASSERT(offset <= 511 && offset>=-512);
+      r.a0_subnr = a0_subnr;
+      r.addr_imm = offset;
+      r.address_mode = GEN_ADDRESS_REGISTER_INDIRECT_REGISTER;
+
+      r.width = GEN_WIDTH_1;
+      r.vstride = GEN_VERTICAL_STRIDE_ONE_DIMENSIONAL;
+      r.hstride = GEN_HORIZONTAL_STRIDE_0;
+      return r;
+    }
+
     static INLINE GenRegister vec16(uint32_t file, uint32_t nr, uint32_t subnr) {
       return GenRegister(file,
                          nr,
@@ -953,7 +981,7 @@ namespace gbe
     }
 
     static INLINE GenRegister uw1(uint32_t file, uint32_t nr, uint32_t subnr) {
-      return suboffset(retype(vec1(file, nr, 0), GEN_TYPE_UW), subnr);
+      return offset(retype(vec1(file, nr, 0), GEN_TYPE_UW), 0, typeSize(GEN_TYPE_UW)*subnr);
     }
 
     static INLINE GenRegister ub16(uint32_t file, uint32_t nr, uint32_t subnr) {
-- 
1.7.9.5


From junyan.he at inbox.com  Mon Mar  9 01:10:50 2015
From: junyan.he at inbox.com (junyan.he at inbox.com)
Date: Mon,  9 Mar 2015 16:10:50 +0800
Subject: [Beignet] [PATCH 2/9 V3] Backend: Add functions to set a0 register.
Message-ID: <1425888650-1101-1-git-send-email-junyan.he@inbox.com>

From: Junyan He <junyan.he at linux.intel.com>

a0, as an address register, plays a very important role in
indirect mode access. We add auxiliary functions to set
its content correctly and efficiently.

Signed-off-by: Junyan He <junyan.he at linux.intel.com>
---
 backend/src/backend/gen_context.cpp |   43 +++++++++++++++++++++++++++++++++++
 backend/src/backend/gen_context.hpp |    3 +++
 2 files changed, 46 insertions(+)

diff --git a/backend/src/backend/gen_context.cpp b/backend/src/backend/gen_context.cpp
index f8748ad..411cf3f 100644
--- a/backend/src/backend/gen_context.cpp
+++ b/backend/src/backend/gen_context.cpp
@@ -51,6 +51,7 @@ namespace gbe
     this->ra = NULL;
     this->ifEndifFix = false;
     this->regSpillTick = 0;
+    memset(a0, 0, sizeof(a0));
   }
 
   GenContext::~GenContext(void) {
@@ -1801,6 +1802,48 @@ namespace gbe
     p->TYPED_WRITE(header, true, bti);
   }
 
+  void GenContext::setA0Content(uint16_t new_a0[16], uint16_t max_offset, int sz) {
+    int16_t diff = new_a0[0] - this->a0[0];
+
+    if (sz == 0)
+      sz = 8;
+    GBE_ASSERT(sz%4 == 0);
+    GBE_ASSERT(new_a0[0] >= 0 && new_a0[0] < 4096);
+    bool need_reset = false;
+    for (int i = 1; i < sz; i++) {
+      GBE_ASSERT(new_a0[i] >= 0 && new_a0[0] < 4096);
+      int16_t d = new_a0[i] - this->a0[i];
+      if (diff != d) {
+        need_reset = true;
+        break;
+      }
+    }
+
+    GBE_ASSERT(a0[0] + diff < 4096 && a0[0] + diff >= 0);
+    if (!need_reset && diff >= -512 && diff + max_offset <= 511) {
+      return;
+    } else if (!need_reset && sz == 8) {
+      p->push();
+      p->curr.execWidth = 8;
+      p->curr.predicate = GEN_PREDICATE_NONE;
+      p->curr.noMask = 1;
+      p->ADD(GenRegister::retype(GenRegister::addr8(0), GEN_TYPE_W),
+          GenRegister::retype(GenRegister::addr8(0), GEN_TYPE_W), GenRegister::immw(diff));
+      p->pop();
+    } else {
+      p->push();
+      p->curr.execWidth = 1;
+      p->curr.predicate = GEN_PREDICATE_NONE;
+      p->curr.noMask = 1;
+      for (int i = 0; i < sz/2; i++) {
+        p->MOV(GenRegister::retype(GenRegister::addr1(i*2), GEN_TYPE_UD),
+            GenRegister::immud(new_a0[i*2 + 1] << 16 | new_a0[i*2]));
+      }
+      p->pop();
+    }
+    memcpy(this->a0, new_a0, sizeof(uint16_t)*sz);
+  }
+
   BVAR(OCL_OUTPUT_REG_ALLOC, false);
   BVAR(OCL_OUTPUT_ASM, false);
 
diff --git a/backend/src/backend/gen_context.hpp b/backend/src/backend/gen_context.hpp
index f64b916..6ca88db 100644
--- a/backend/src/backend/gen_context.hpp
+++ b/backend/src/backend/gen_context.hpp
@@ -208,6 +208,9 @@ namespace gbe
     /*! allocate a new curbe register and insert to curbe pool. */
     void allocCurbeReg(ir::Register reg, gbe_curbe_type value, uint32_t subValue = 0);
 
+    uint16_t a0[16];
+    virtual void setA0Content(uint16_t new_a0[16], uint16_t max_offset = 0, int sz = 0);
+
   private:
     CompileErrorCode errCode;
     bool ifEndifFix;
-- 
1.7.9.5


From junyan.he at inbox.com  Mon Mar  9 01:10:55 2015
From: junyan.he at inbox.com (junyan.he at inbox.com)
Date: Mon,  9 Mar 2015 16:10:55 +0800
Subject: [Beignet] [PATCH 3/9 V3] Backend: Correct indirect mode encoder
	setting for Gen7.
Message-ID: <1425888655-1141-1-git-send-email-junyan.he@inbox.com>

From: Junyan He <junyan.he at linux.intel.com>

Signed-off-by: Junyan He <junyan.he at linux.intel.com>
---
 backend/src/backend/gen7_encoder.cpp |   14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/backend/src/backend/gen7_encoder.cpp b/backend/src/backend/gen7_encoder.cpp
index ecf5b39..a7d132c 100644
--- a/backend/src/backend/gen7_encoder.cpp
+++ b/backend/src/backend/gen7_encoder.cpp
@@ -118,14 +118,14 @@ namespace gbe
     } else {
       gen7_insn->bits1.ia1.src0_reg_file = GEN_GENERAL_REGISTER_FILE;
       gen7_insn->bits1.ia1.src0_reg_type = reg.type;
-      gen7_insn->bits2.ia1.src0_subreg_nr = 0;
-      gen7_insn->bits2.ia1.src0_indirect_offset = 0;
-      gen7_insn->bits2.ia1.src0_abs = 0;
-      gen7_insn->bits2.ia1.src0_negate = 0;
+      gen7_insn->bits2.ia1.src0_subreg_nr = reg.a0_subnr;
+      gen7_insn->bits2.ia1.src0_indirect_offset = reg.addr_imm;
+      gen7_insn->bits2.ia1.src0_abs = reg.absolute;
+      gen7_insn->bits2.ia1.src0_negate = reg.negation;
       gen7_insn->bits2.ia1.src0_address_mode = reg.address_mode;
-      gen7_insn->bits2.ia1.src0_horiz_stride = GEN_HORIZONTAL_STRIDE_0;
-      gen7_insn->bits2.ia1.src0_width = GEN_WIDTH_1;
-      gen7_insn->bits2.ia1.src0_vert_stride = GEN_VERTICAL_STRIDE_ONE_DIMENSIONAL;
+      gen7_insn->bits2.ia1.src0_horiz_stride = reg.hstride;
+      gen7_insn->bits2.ia1.src0_width = reg.width;
+      gen7_insn->bits2.ia1.src0_vert_stride = reg.vstride;
     }
   }
 
-- 
1.7.9.5


From junyan.he at inbox.com  Mon Mar  9 01:11:02 2015
From: junyan.he at inbox.com (junyan.he at inbox.com)
Date: Mon,  9 Mar 2015 16:11:02 +0800
Subject: [Beignet] [PATCH 4/9 V3] Backend: Correct indirect mode encoder
	setting for Gen8.
Message-ID: <1425888662-1176-1-git-send-email-junyan.he@inbox.com>

From: Junyan He <junyan.he at linux.intel.com>

Signed-off-by: Junyan He <junyan.he at linux.intel.com>
---
 backend/src/backend/gen8_encoder.cpp |   15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/backend/src/backend/gen8_encoder.cpp b/backend/src/backend/gen8_encoder.cpp
index 92aad64..48419aa 100644
--- a/backend/src/backend/gen8_encoder.cpp
+++ b/backend/src/backend/gen8_encoder.cpp
@@ -369,14 +369,15 @@ namespace gbe
     } else {
       gen8_insn->bits1.ia1.src0_reg_file = GEN_GENERAL_REGISTER_FILE;
       gen8_insn->bits1.ia1.src0_reg_type = reg.type;
-      gen8_insn->bits2.ia1.src0_subreg_nr = 0;
-      gen8_insn->bits2.ia1.src0_indirect_offset = 0;
-      gen8_insn->bits2.ia1.src0_abs = 0;
-      gen8_insn->bits2.ia1.src0_negate = 0;
+      gen8_insn->bits2.ia1.src0_subreg_nr = reg.a0_subnr;
+      gen8_insn->bits2.ia1.src0_indirect_offset = (reg.addr_imm & 0x1ff);
+      gen8_insn->bits2.ia1.src0_abs = reg.absolute;
+      gen8_insn->bits2.ia1.src0_negate = reg.negation;
       gen8_insn->bits2.ia1.src0_address_mode = reg.address_mode;
-      gen8_insn->bits2.ia1.src0_horiz_stride = GEN_HORIZONTAL_STRIDE_0;
-      gen8_insn->bits2.ia1.src0_width = GEN_WIDTH_1;
-      gen8_insn->bits2.ia1.src0_vert_stride = GEN_VERTICAL_STRIDE_ONE_DIMENSIONAL;
+      gen8_insn->bits2.ia1.src0_horiz_stride = reg.hstride;
+      gen8_insn->bits2.ia1.src0_width = reg.width;
+      gen8_insn->bits2.ia1.src0_vert_stride = reg.vstride;
+      gen8_insn->bits2.ia1.src0_indirect_offset_9 = (reg.addr_imm & 0x02) >> 9;
     }
   }
 
-- 
1.7.9.5


From junyan.he at inbox.com  Mon Mar  9 01:11:08 2015
From: junyan.he at inbox.com (junyan.he at inbox.com)
Date: Mon,  9 Mar 2015 16:11:08 +0800
Subject: [Beignet] [PATCH 5/9 V3] Backend: Handle the bswap using indirect
	mode access.
Message-ID: <1425888668-1217-1-git-send-email-junyan.he@inbox.com>

From: Junyan He <junyan.he at linux.intel.com>

Signed-off-by: Junyan He <junyan.he at linux.intel.com>
---
 backend/src/backend/gen_context.cpp        |  130 ++++++++++++++++++++++++++++
 backend/src/backend/gen_insn_selection.cpp |    9 ++
 2 files changed, 139 insertions(+)

diff --git a/backend/src/backend/gen_context.cpp b/backend/src/backend/gen_context.cpp
index 411cf3f..cdf581c 100644
--- a/backend/src/backend/gen_context.cpp
+++ b/backend/src/backend/gen_context.cpp
@@ -298,6 +298,136 @@ namespace gbe
           p->MOV(dst.top_half(this->simdWidth), GenRegister::immud(0));
         break;
       }
+      case SEL_OP_BSWAP: {
+        uint32_t simd = p->curr.execWidth;
+        GBE_ASSERT(simd == 8 || simd == 16 || simd == 1);
+        uint16_t new_a0[16];
+        memset(new_a0, 0, sizeof(new_a0));
+
+        GBE_ASSERT(src.type == dst.type);
+        uint32_t start_addr = src.nr*32 + src.subnr;
+
+        if (simd == 1) {
+          GBE_ASSERT(src.hstride == GEN_HORIZONTAL_STRIDE_0
+              && dst.hstride == GEN_HORIZONTAL_STRIDE_0);
+          if (src.type == GEN_TYPE_UD || src.type == GEN_TYPE_D) {
+            GBE_ASSERT(start_addr >= 0);
+            new_a0[0] = start_addr + 3;
+            new_a0[1] = start_addr + 2;
+            new_a0[2] = start_addr + 1;
+            new_a0[3] = start_addr;
+            this->setA0Content(new_a0, 0, 4);
+
+            p->push();
+            p->curr.execWidth = 4;
+            p->curr.predicate = GEN_PREDICATE_NONE;
+            p->curr.noMask = 1;
+            GenRegister ind_src = GenRegister::to_indirect1xN(GenRegister::retype(src, GEN_TYPE_UB),
+                a0[0], new_a0[0] - a0[0]);
+            GenRegister dst_ = dst;
+            dst_.type = GEN_TYPE_UB;
+            dst_.hstride = GEN_HORIZONTAL_STRIDE_1;
+            dst_.width = GEN_WIDTH_4;
+            dst_.vstride = GEN_VERTICAL_STRIDE_4;
+            p->MOV(dst_, ind_src);
+            p->pop();
+          } else if (src.type == GEN_TYPE_UW || src.type == GEN_TYPE_W) {
+            p->MOV(GenRegister::retype(dst, GEN_TYPE_UB),
+                GenRegister::retype(GenRegister::offset(src, 0, 1), GEN_TYPE_UB));
+            p->MOV(GenRegister::retype(GenRegister::offset(dst, 0, 1), GEN_TYPE_UB),
+                GenRegister::retype(src, GEN_TYPE_UB));
+          } else {
+            GBE_ASSERT(0);
+          }
+        } else {
+          if (src.type == GEN_TYPE_UD || src.type == GEN_TYPE_D) {
+            bool uniform_src = (src.hstride == GEN_HORIZONTAL_STRIDE_0);
+            GBE_ASSERT(uniform_src || src.subnr == 0);
+            GBE_ASSERT(dst.subnr == 0);
+            GBE_ASSERT(tmp.subnr == 0);
+            GBE_ASSERT(start_addr >= 0);
+            new_a0[0] = start_addr + 3;
+            new_a0[1] = start_addr + 2;
+            new_a0[2] = start_addr + 1;
+            new_a0[3] = start_addr;
+            if (!uniform_src) {
+              new_a0[4] = start_addr + 7;
+              new_a0[5] = start_addr + 6;
+              new_a0[6] = start_addr + 5;
+              new_a0[7] = start_addr + 4;
+            } else {
+              new_a0[4] = start_addr + 3;
+              new_a0[5] = start_addr + 2;
+              new_a0[6] = start_addr + 1;
+              new_a0[7] = start_addr;
+            }
+            this->setA0Content(new_a0, 56);
+
+            p->push();
+            p->curr.execWidth = 8;
+            p->curr.predicate = GEN_PREDICATE_NONE;
+            p->curr.noMask = 1;
+            GenRegister ind_src = GenRegister::to_indirect1xN(GenRegister::retype(src, GEN_TYPE_UB),
+                a0[0], new_a0[0] - a0[0]);
+            p->MOV(GenRegister::retype(tmp, GEN_TYPE_UB), ind_src);
+            for (int i = 1; i < 4; i++) {
+              ind_src.addr_imm += 8;
+              p->MOV(GenRegister::offset(GenRegister::retype(tmp, GEN_TYPE_UB), 0, 8*i), ind_src);
+            }
+            if (simd == 16) {
+              for (int i = 0; i < 4; i++) {
+                ind_src.addr_imm += 8;
+                p->MOV(GenRegister::offset(GenRegister::retype(tmp, GEN_TYPE_UB), 1, 8*i), ind_src);
+              }
+            }
+            p->pop();
+
+            p->MOV(dst, tmp);
+          } else if (src.type == GEN_TYPE_UW || src.type == GEN_TYPE_W) {
+            bool uniform_src = (src.hstride == GEN_HORIZONTAL_STRIDE_0);
+            GBE_ASSERT(uniform_src || src.subnr == 0 || src.subnr == 16);
+            GBE_ASSERT(dst.subnr == 0 || dst.subnr == 16);
+            GBE_ASSERT(tmp.subnr == 0 || tmp.subnr == 16);
+            GBE_ASSERT(start_addr >= 0);
+            new_a0[0] = start_addr + 1;
+            new_a0[1] = start_addr;
+            if (!uniform_src) {
+              new_a0[2] = start_addr + 3;
+              new_a0[3] = start_addr + 2;
+              new_a0[4] = start_addr + 5;
+              new_a0[5] = start_addr + 4;
+              new_a0[6] = start_addr + 7;
+              new_a0[7] = start_addr + 6;
+            } else {
+              new_a0[2] = start_addr + 1;
+              new_a0[3] = start_addr;
+              new_a0[4] = start_addr + 1;
+              new_a0[5] = start_addr;
+              new_a0[6] = start_addr + 1;
+              new_a0[7] = start_addr;
+            }
+            this->setA0Content(new_a0, 56);
+
+            p->push();
+            p->curr.execWidth = 8;
+            p->curr.predicate = GEN_PREDICATE_NONE;
+            p->curr.noMask = 1;
+            GenRegister ind_src = GenRegister::to_indirect1xN(GenRegister::retype(src, GEN_TYPE_UB),
+                a0[0], new_a0[0] - a0[0]);
+            p->MOV(GenRegister::retype(tmp, GEN_TYPE_UB), ind_src);
+            for (int i = 1; i < (simd == 8 ? 2 : 4); i++) {
+              ind_src.addr_imm += 8;
+              p->MOV(GenRegister::offset(GenRegister::retype(tmp, GEN_TYPE_UB), 0, 8*i), ind_src);
+            }
+            p->pop();
+
+            p->MOV(dst, tmp);
+          } else {
+            GBE_ASSERT(0);
+          }
+        }
+      }
+      break;
       default:
         NOT_IMPLEMENTED;
     }
diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp
index d100f80..2b166b1 100644
--- a/backend/src/backend/gen_insn_selection.cpp
+++ b/backend/src/backend/gen_insn_selection.cpp
@@ -498,6 +498,7 @@ namespace gbe
     ALU1(RNDE)
     ALU1(F16TO32)
     ALU1(F32TO16)
+    ALU1WithTemp(BSWAP)
     ALU2(SEL)
     ALU2(SEL_INT64)
     ALU1(NOT)
@@ -2121,6 +2122,14 @@ namespace gbe
           case ir::OP_SQR: sel.MATH(dst, GEN_MATH_FUNCTION_SQRT, src); break;
           case ir::OP_RSQ: sel.MATH(dst, GEN_MATH_FUNCTION_RSQ, src); break;
           case ir::OP_RCP: sel.MATH(dst, GEN_MATH_FUNCTION_INV, src); break;
+          case ir::OP_BSWAP:
+            {
+              ir::Register tmp = sel.reg(getFamily(insnType));
+              const GenRegister src_ = GenRegister::retype(src, getGenType(insnType));
+              const GenRegister dst_ = GenRegister::retype(dst, getGenType(insnType));
+              sel.BSWAP(dst_, src_, sel.selReg(tmp, insnType));
+              break;
+            }
           case ir::OP_SIMD_ANY:
             {
               const GenRegister constZero = GenRegister::immuw(0);;
-- 
1.7.9.5


From junyan.he at inbox.com  Mon Mar  9 01:11:15 2015
From: junyan.he at inbox.com (junyan.he at inbox.com)
Date: Mon,  9 Mar 2015 16:11:15 +0800
Subject: [Beignet] [PATCH 6/9 V3] Backend: Handle the bswap using indirect
	mode access.
Message-ID: <1425888675-1257-1-git-send-email-junyan.he@inbox.com>

From: Junyan He <junyan.he at linux.intel.com>

Signed-off-by: Junyan He <junyan.he at linux.intel.com>
---
 backend/src/backend/gen_insn_selection.hxx |    1 +
 1 file changed, 1 insertion(+)

diff --git a/backend/src/backend/gen_insn_selection.hxx b/backend/src/backend/gen_insn_selection.hxx
index be1f7ec..09f5aaf 100644
--- a/backend/src/backend/gen_insn_selection.hxx
+++ b/backend/src/backend/gen_insn_selection.hxx
@@ -1,5 +1,6 @@
 DECL_SELECTION_IR(LABEL, LabelInstruction)
 DECL_SELECTION_IR(MOV, UnaryInstruction)
+DECL_SELECTION_IR(BSWAP, UnaryWithTempInstruction)
 DECL_SELECTION_IR(MOV_DF, UnaryWithTempInstruction)
 DECL_SELECTION_IR(LOAD_DF_IMM, UnaryWithTempInstruction)
 DECL_SELECTION_IR(LOAD_INT64_IMM, UnaryInstruction)
-- 
1.7.9.5


From junyan.he at inbox.com  Mon Mar  9 01:11:22 2015
From: junyan.he at inbox.com (junyan.he at inbox.com)
Date: Mon,  9 Mar 2015 16:11:22 +0800
Subject: [Beignet] [PATCH 7/9 V3] Add a0 setting and bswap logic for GEN8
Message-ID: <1425888682-1293-1-git-send-email-junyan.he@inbox.com>

From: Junyan He <junyan.he at linux.intel.com>

Because Gen8 has 16 sub-registers for A0, we can use
them to reduce the number of instructions.

Signed-off-by: Junyan He <junyan.he at linux.intel.com>
---
 backend/src/backend/gen8_context.cpp |  174 ++++++++++++++++++++++++++++++++++
 backend/src/backend/gen8_context.hpp |    1 +
 2 files changed, 175 insertions(+)

diff --git a/backend/src/backend/gen8_context.cpp b/backend/src/backend/gen8_context.cpp
index 4edfd81..0d4a40e 100644
--- a/backend/src/backend/gen8_context.cpp
+++ b/backend/src/backend/gen8_context.cpp
@@ -65,11 +65,141 @@ namespace gbe
 
   void Gen8Context::emitUnaryWithTempInstruction(const SelectionInstruction &insn)
   {
+    GenRegister dst = ra->genReg(insn.dst(0));
+    GenRegister src = ra->genReg(insn.src(0));
+    GenRegister tmp = ra->genReg(insn.dst(1));
     switch (insn.opcode) {
       case SEL_OP_CONVI_TO_I64:
         /* Should never come to here, just use the common OPCODE. */
         GBE_ASSERT(0);
         break;
+      case SEL_OP_BSWAP:
+        {
+          uint32_t simd = p->curr.execWidth;
+          GBE_ASSERT(simd == 8 || simd == 16 || simd == 1);
+          uint16_t new_a0[16];
+          memset(new_a0, 0, sizeof(new_a0));
+
+          GBE_ASSERT(src.type == dst.type);
+          uint32_t start_addr = src.nr*32 + src.subnr;
+
+          if (simd == 1) {
+            GBE_ASSERT(src.hstride == GEN_HORIZONTAL_STRIDE_0
+                && dst.hstride == GEN_HORIZONTAL_STRIDE_0);
+            if (src.type == GEN_TYPE_UD || src.type == GEN_TYPE_D) {
+              GBE_ASSERT(start_addr >= 0);
+              new_a0[0] = start_addr + 3;
+              new_a0[1] = start_addr + 2;
+              new_a0[2] = start_addr + 1;
+              new_a0[3] = start_addr;
+              this->setA0Content(new_a0, 0, 4);
+
+              p->push();
+              p->curr.execWidth = 4;
+              p->curr.predicate = GEN_PREDICATE_NONE;
+              p->curr.noMask = 1;
+              GenRegister ind_src = GenRegister::to_indirect1xN(GenRegister::retype(src, GEN_TYPE_UB),
+                  a0[0], new_a0[0] - a0[0]);
+              GenRegister dst_ = dst;
+              dst_.type = GEN_TYPE_UB;
+              dst_.hstride = GEN_HORIZONTAL_STRIDE_1;
+              dst_.width = GEN_WIDTH_4;
+              dst_.vstride = GEN_VERTICAL_STRIDE_4;
+              p->MOV(dst_, ind_src);
+              p->pop();
+            } else if (src.type == GEN_TYPE_UW || src.type == GEN_TYPE_W) {
+              p->MOV(GenRegister::retype(dst, GEN_TYPE_UB),
+                  GenRegister::retype(GenRegister::offset(src, 0, 1), GEN_TYPE_UB));
+              p->MOV(GenRegister::retype(GenRegister::offset(dst, 0, 1), GEN_TYPE_UB),
+                  GenRegister::retype(src, GEN_TYPE_UB));
+            } else {
+              GBE_ASSERT(0);
+            }
+          } else {
+            if (src.type == GEN_TYPE_UD || src.type == GEN_TYPE_D) {
+              GBE_ASSERT(src.subnr == 0);
+              GBE_ASSERT(dst.subnr == 0);
+              GBE_ASSERT(tmp.subnr == 0);
+              GBE_ASSERT(start_addr >= 0);
+              new_a0[0] = start_addr + 3;
+              new_a0[1] = start_addr + 2;
+              new_a0[2] = start_addr + 1;
+              new_a0[3] = start_addr;
+              new_a0[4] = start_addr + 7;
+              new_a0[5] = start_addr + 6;
+              new_a0[6] = start_addr + 5;
+              new_a0[7] = start_addr + 4;
+              new_a0[8] = start_addr + 11;
+              new_a0[9] = start_addr + 10;
+              new_a0[10] = start_addr + 9;
+              new_a0[11] = start_addr + 8;
+              new_a0[12] = start_addr + 15;
+              new_a0[13] = start_addr + 14;
+              new_a0[14] = start_addr + 13;
+              new_a0[15] = start_addr + 12;
+              this->setA0Content(new_a0, 48);
+
+              p->push();
+              p->curr.execWidth = 16;
+              p->curr.predicate = GEN_PREDICATE_NONE;
+              p->curr.noMask = 1;
+              GenRegister ind_src = GenRegister::to_indirect1xN(GenRegister::retype(src, GEN_TYPE_UB),
+                  a0[0], new_a0[0] - a0[0]);
+              p->MOV(GenRegister::retype(tmp, GEN_TYPE_UB), ind_src);
+              ind_src.addr_imm += 16;
+              p->MOV(GenRegister::offset(GenRegister::retype(tmp, GEN_TYPE_UB), 0, 16), ind_src);
+              if (simd == 16) {
+                for (int i = 0; i < 2; i++) {
+                  ind_src.addr_imm += 16;
+                  p->MOV(GenRegister::offset(GenRegister::retype(tmp, GEN_TYPE_UB), 1, 16*i), ind_src);
+                }
+              }
+              p->pop();
+
+              p->MOV(dst, tmp);
+            } else if (src.type == GEN_TYPE_UW || src.type == GEN_TYPE_W) {
+              GBE_ASSERT(src.subnr == 0 || src.subnr == 16);
+              GBE_ASSERT(dst.subnr == 0 || dst.subnr == 16);
+              GBE_ASSERT(tmp.subnr == 0 || tmp.subnr == 16);
+              GBE_ASSERT(start_addr >= 0);
+              new_a0[0] = start_addr + 1;
+              new_a0[1] = start_addr;
+              new_a0[2] = start_addr + 3;
+              new_a0[3] = start_addr + 2;
+              new_a0[4] = start_addr + 5;
+              new_a0[5] = start_addr + 4;
+              new_a0[6] = start_addr + 7;
+              new_a0[7] = start_addr + 6;
+              new_a0[8] = start_addr + 9;
+              new_a0[9] = start_addr + 8;
+              new_a0[10] = start_addr + 11;
+              new_a0[11] = start_addr + 10;
+              new_a0[12] = start_addr + 13;
+              new_a0[13] = start_addr + 12;
+              new_a0[14] = start_addr + 15;
+              new_a0[15] = start_addr + 14;
+              this->setA0Content(new_a0, 48);
+
+              p->push();
+              p->curr.execWidth = 16;
+              p->curr.predicate = GEN_PREDICATE_NONE;
+              p->curr.noMask = 1;
+              GenRegister ind_src = GenRegister::to_indirect1xN(GenRegister::retype(src, GEN_TYPE_UB),
+                  a0[0], new_a0[0] - a0[0]);
+              p->MOV(GenRegister::retype(tmp, GEN_TYPE_UB), ind_src);
+              if (simd == 16) {
+                ind_src.addr_imm += 16;
+                p->MOV(GenRegister::offset(GenRegister::retype(tmp, GEN_TYPE_UB), 0, 16), ind_src);
+              }
+              p->pop();
+
+              p->MOV(dst, tmp);
+            } else {
+              GBE_ASSERT(0);
+            }
+          }
+        }
+        break;
       default:
         GenContext::emitUnaryWithTempInstruction(insn);
     }
@@ -782,4 +912,48 @@ namespace gbe
     GBE_ASSERT(dst.hstride != GEN_HORIZONTAL_STRIDE_0 && src.hstride != GEN_HORIZONTAL_STRIDE_0);
     this->unpackLongVec(src, dst, p->curr.execWidth);
   }
+
+  /* Point a0 at new_a0[0..sz) (sz == 0 means all 16 entries), emitting as little code as possible.
+   * max_offset is added to the delta when checking that it still fits the [-512, 511] window. */
+  void Gen8Context::setA0Content(uint16_t new_a0[16], uint16_t max_offset, int sz) {
+    int16_t diff = new_a0[0] - this->a0[0];
+    if (sz == 0) sz = 16;
+    GBE_ASSERT(sz%4 == 0);
+    GBE_ASSERT(new_a0[0] >= 0 && new_a0[0] < 4096);
+    bool need_reset = false;  // set when the entries do not all move by the same delta
+    for (int i = 1; i < sz; i++) {
+      GBE_ASSERT(new_a0[i] >= 0 && new_a0[i] < 4096);  // each entry is range-checked, like entry 0 above
+      int16_t d = new_a0[i] - this->a0[i];
+      if (diff != d) {
+        need_reset = true;
+        break;
+      }
+    }
+    GBE_ASSERT(this->a0[0] + diff < 4096 && this->a0[0] + diff >= 0);
+    if (!need_reset && diff >= -512 && diff + max_offset <= 511) {
+      return;  // uniform delta inside the window: nothing is emitted and the cached a0 is left as is
+    } else if (!need_reset && sz == 16) {
+      p->push();  // uniform delta outside the window: a single 16-wide ADD of diff onto a0
+      p->curr.execWidth = 16;
+      p->curr.predicate = GEN_PREDICATE_NONE;
+      p->curr.noMask = 1;
+      p->ADD(GenRegister::retype(GenRegister::addr8(0), GEN_TYPE_W),
+          GenRegister::retype(GenRegister::addr8(0), GEN_TYPE_W), GenRegister::immw(diff));
+      p->pop();
+    } else {
+      p->push();  // otherwise reload a0: one 64-bit immediate MOV per group of four 16-bit entries
+      p->curr.execWidth = 1;
+      p->curr.predicate = GEN_PREDICATE_NONE;
+      p->curr.noMask = 1;
+      for (int i = 0; i < sz/4; i++) {
+        uint64_t addr = (new_a0[i*4 + 3] << 16) | (new_a0[i*4 + 2]);
+        addr = addr << 32;
+        addr = addr | (new_a0[i*4 + 1] << 16) | (new_a0[i*4]);
+        p->MOV(GenRegister::retype(GenRegister::addr1(i*4), GEN_TYPE_UL), GenRegister::immuint64(addr));
+      }
+      p->pop();
+    }
+    memcpy(this->a0, new_a0, sizeof(uint16_t)*sz);  // remember what a0 now holds
+  }
+
 }
diff --git a/backend/src/backend/gen8_context.hpp b/backend/src/backend/gen8_context.hpp
index a047990..b296a3d 100644
--- a/backend/src/backend/gen8_context.hpp
+++ b/backend/src/backend/gen8_context.hpp
@@ -72,6 +72,7 @@ namespace gbe
     virtual void emitUnpackLongInstruction(const SelectionInstruction &insn);
 
   protected:
+    virtual void setA0Content(uint16_t new_a0[16], uint16_t max_offset = 0, int sz = 0);
     virtual GenEncoder* generateEncoder(void) {
       return GBE_NEW(Gen8Encoder, this->simdWidth, 8, deviceID);
     }
-- 
1.7.9.5


From junyan.he at inbox.com  Mon Mar  9 01:11:28 2015
From: junyan.he at inbox.com (junyan.he at inbox.com)
Date: Mon,  9 Mar 2015 16:11:28 +0800
Subject: [Beignet] [PATCH 8/9 V3] Backend: Delete bswap logic in the
	llvm_to_gen stage.
Message-ID: <1425888688-1328-1-git-send-email-junyan.he@inbox.com>

From: Junyan He <junyan.he at linux.intel.com>

We move the bswap logic from llvm_to_gen to the backend for
efficiency, using indirect mode.

Signed-off-by: Junyan He <junyan.he at linux.intel.com>
---
 backend/src/ir/instruction.hpp        |    2 +
 backend/src/ir/instruction.hxx        |    1 +
 backend/src/llvm/llvm_gen_backend.cpp |   85 +--------------------------------
 3 files changed, 5 insertions(+), 83 deletions(-)

diff --git a/backend/src/ir/instruction.hpp b/backend/src/ir/instruction.hpp
index 6963111..24d27aa 100644
--- a/backend/src/ir/instruction.hpp
+++ b/backend/src/ir/instruction.hpp
@@ -620,6 +620,8 @@ namespace ir {
   Instruction RNDU(Type type, Register dst, Register src);
   /*! rndz.type dst src */
   Instruction RNDZ(Type type, Register dst, Register src);
+  /*! bswap.type dst src */
+  Instruction BSWAP(Type type, Register dst, Register src);
   /*! pow.type dst src0 src1 */
   Instruction POW(Type type, Register dst, Register src0, Register src1);
   /*! mul.type dst src0 src1 */
diff --git a/backend/src/ir/instruction.hxx b/backend/src/ir/instruction.hxx
index b52673e..de4abfb 100644
--- a/backend/src/ir/instruction.hxx
+++ b/backend/src/ir/instruction.hxx
@@ -40,6 +40,7 @@ DECL_INSN(RNDU, UnaryInstruction)
 DECL_INSN(RNDZ, UnaryInstruction)
 DECL_INSN(SIMD_ANY, UnaryInstruction)
 DECL_INSN(SIMD_ALL, UnaryInstruction)
+DECL_INSN(BSWAP, UnaryInstruction)
 DECL_INSN(POW, BinaryInstruction)
 DECL_INSN(MUL, BinaryInstruction)
 DECL_INSN(ADD, BinaryInstruction)
diff --git a/backend/src/llvm/llvm_gen_backend.cpp b/backend/src/llvm/llvm_gen_backend.cpp
index aad638f..74c80ee 100644
--- a/backend/src/llvm/llvm_gen_backend.cpp
+++ b/backend/src/llvm/llvm_gen_backend.cpp
@@ -2943,89 +2943,6 @@ namespace gbe
           case Intrinsic::umul_with_overflow:
           NOT_IMPLEMENTED;
           break;
-          case Intrinsic::bswap:
-          {
-            // FIXME, this is an unoptimized version, could be optimized by
-            // leveraging GEN's register region/indirect address feature.
-            Type *llvmDstType = I.getType();
-            uint32_t elementSize = getTypeByteSize(unit, llvmDstType);
-
-            const ir::Register dst0  = this->getRegister(&I);
-            const ir::Register src0 = this->getRegister(I.getOperand(0));
-            switch(elementSize)
-            {
-              case 2:
-                {
-                  ir::Type srcType = getUnsignedType(ctx, llvmDstType);
-                  ir::Register tmp1 = ctx.reg(getFamily(srcType));
-                  ir::Register tmp2 = ctx.reg(getFamily(srcType));
-
-                  ir::Register regWMask = ctx.reg( ir::FAMILY_WORD );
-                  const ir::ImmediateIndex wMask = ctx.newIntegerImmediate(0x00FF, ir::TYPE_S16);
-                  ir::Register regShift = ctx.reg( ir::FAMILY_WORD );
-                  const ir::ImmediateIndex shift = ctx.newIntegerImmediate(8, ir::TYPE_S16);
-
-                  ctx.LOADI(ir::TYPE_S16, regWMask, wMask);
-                  ctx.AND(srcType, tmp1, src0, regWMask);
-
-                  ctx.LOADI(ir::TYPE_S16, regShift, shift);
-                  ctx.SHL(srcType, tmp2, tmp1, regShift);
-
-                  ir::Register tmp3 = ctx.reg( getFamily(srcType) );
-                  ctx.SHR(srcType, tmp3, src0, regShift);
-
-                  ctx.OR(srcType, dst0, tmp2, tmp3);
-                }
-                break;
-              case 4:
-                {
-                  ir::Type srcType = getType(ctx, llvmDstType);
-                  ir::Register tmp1 = ctx.reg(getFamily(srcType));
-                  ir::Register tmp2 = ctx.reg(getFamily(srcType));
-                  ir::Register tmp3 = ctx.reg(getFamily(srcType));
-                  ir::Register tmp4 = ctx.reg(getFamily(srcType));
-                  ir::Register tmp5 = ctx.reg(getFamily(srcType));
-                  ir::Register tmp6 = ctx.reg(getFamily(srcType));
-
-                  ir::Register regDWMask = ctx.reg( ir::FAMILY_DWORD );
-                  ir::Register regShift_8 = ctx.reg( ir::FAMILY_DWORD );
-                  ir::Register regShift_24 = ctx.reg( ir::FAMILY_DWORD );
-                  ir::ImmediateIndex wMask_L = ctx.newIntegerImmediate(0x0000FF00, ir::TYPE_S32);
-                  ir::ImmediateIndex wMask_H = ctx.newIntegerImmediate(0x00FF0000, ir::TYPE_S32);
-                  ir::ImmediateIndex shift_8 = ctx.newIntegerImmediate(8, ir::TYPE_S32);
-                  ir::ImmediateIndex shift_24 = ctx.newIntegerImmediate(24, ir::TYPE_S32);
-
-                  ctx.LOADI(ir::TYPE_S32, regShift_24, shift_24);
-                  ctx.SHL(srcType, tmp1, src0, regShift_24);
-
-                  ctx.LOADI(ir::TYPE_S32, regDWMask, wMask_L);
-                  ctx.AND(srcType, tmp2, src0, regDWMask);
-                  ctx.LOADI(ir::TYPE_S32, regShift_8, shift_8);
-                  ctx.SHL(srcType, tmp3, tmp2, regShift_8);
-
-                  ctx.LOADI(ir::TYPE_S32, regDWMask, wMask_H);
-                  ctx.AND(srcType, tmp4, src0, regDWMask);
-                  ctx.LOADI(ir::TYPE_S32, regShift_8, shift_8);
-                  ctx.SHR(makeTypeUnsigned(srcType), tmp5, tmp4, regShift_8);
-
-                  ctx.LOADI(ir::TYPE_S32, regShift_24, shift_24);
-                  ctx.SHR(makeTypeUnsigned(srcType), tmp6, src0, regShift_24);
-
-                  ir::Register tmp7 = ctx.reg(getFamily(srcType));
-                  ir::Register tmp8 = ctx.reg(getFamily(srcType));
-                  ctx.OR(srcType, tmp7, tmp1, tmp3);
-                  ctx.OR(srcType, tmp8, tmp5, tmp6);
-                  ctx.OR(srcType, dst0, tmp7, tmp8);
-                }
-                break;
-              case 8:
-                NOT_IMPLEMENTED;
-                break;
-              default:
-                GBE_ASSERT(0);
-            }
-          }
-          break;
           case Intrinsic::ctlz:
           {
             Type *llvmDstType = I.getType();
@@ -3085,6 +3002,8 @@ namespace gbe
           case Intrinsic::cos: this->emitUnaryCallInst(I,CS,ir::OP_COS); break;
           case Intrinsic::log2: this->emitUnaryCallInst(I,CS,ir::OP_LOG); break;
           case Intrinsic::exp2: this->emitUnaryCallInst(I,CS,ir::OP_EXP); break;
+          case Intrinsic::bswap:
+            this->emitUnaryCallInst(I,CS,ir::OP_BSWAP, getUnsignedType(ctx, I.getType())); break;
           default: NOT_IMPLEMENTED;
         }
       } else {
-- 
1.7.9.5


From junyan.he at inbox.com  Mon Mar  9 01:11:35 2015
From: junyan.he at inbox.com (junyan.he at inbox.com)
Date: Mon,  9 Mar 2015 16:11:35 +0800
Subject: [Beignet] [PATCH 9/9 V3] Modify the utest case for bswap.
Message-ID: <1425888695-1367-1-git-send-email-junyan.he@inbox.com>

From: Junyan He <junyan.he at linux.intel.com>

We add a test case for uniform values when doing the bswap.

Signed-off-by: Junyan He <junyan.he at linux.intel.com>
---
 kernels/compiler_bswap.cl |   24 +++---
 utests/compiler_bswap.cpp |  203 +++++++++++++++++++++++++++++++--------------
 2 files changed, 156 insertions(+), 71 deletions(-)

diff --git a/kernels/compiler_bswap.cl b/kernels/compiler_bswap.cl
index 97313b1..3a0a373 100644
--- a/kernels/compiler_bswap.cl
+++ b/kernels/compiler_bswap.cl
@@ -1,13 +1,17 @@
-#define TEST_TYPE(TYPE, LENGTH)                                       \
-kernel void compiler_bswap_##TYPE(global TYPE * src, global TYPE * dst){ \
-   dst[get_global_id(0)]= __builtin_bswap##LENGTH(src[get_global_id(0)]); \
-   dst[get_global_id(0)]= __builtin_bswap##LENGTH(dst[get_global_id(0)] -1 ); \
-}
+kernel void compiler_bswap(global uint * src0, global uint * dst0, global ushort * src1, global ushort * dst1,
+    int src2, global int * dst2,  short src3, global short * dst3) {
+  if (get_global_id(0) % 2 == 0) {
+    dst0[get_global_id(0)] = __builtin_bswap32(src0[get_global_id(0)]);
+  } else {
+    dst0[get_global_id(0)] = src0[get_global_id(0)];
+  }
 
+  dst1[get_global_id(0)] = __builtin_bswap16(src1[get_global_id(0)]);
+  if (get_global_id(0) % 2 == 1) {
+    dst1[get_global_id(0)] = __builtin_bswap16(dst1[get_global_id(0)] + 1);
+  }
 
-TEST_TYPE(short, 16)
-TEST_TYPE(ushort, 16)
-TEST_TYPE(int, 32)
-TEST_TYPE(uint, 32)
+  dst2[get_global_id(0)] = __builtin_bswap32(src2);
+  dst3[get_global_id(0)] = __builtin_bswap16(src3);
+}
 
-#undef TEST_TYPE
diff --git a/utests/compiler_bswap.cpp b/utests/compiler_bswap.cpp
index 9475b99..3af9ef5 100644
--- a/utests/compiler_bswap.cpp
+++ b/utests/compiler_bswap.cpp
@@ -1,7 +1,6 @@
 #include "utest_helper.hpp"
 #include "string.h"
 
-namespace {
 #define cpu_htons(A)     ((((uint16_t)(A) & 0xff00) >> 8) | \
     (((uint16_t)(A) & 0x00ff) << 8))
 #define cpu_htonl(A)     ((((uint32_t)(A) & 0xff000000) >> 24) | \
@@ -9,108 +8,190 @@ namespace {
     (((uint32_t)(A) & 0x0000ff00) << 8) | \
     (((uint32_t)(A) & 0x000000ff) << 24))
 
+
+template <typename T> static void gen_rand_val(T & val)
+{
+  val = static_cast<T>(rand());//(0xAABBCCDD);//
+}
+
 template <typename T> static void cpu(int global_id, T *src, T *dst)
 {
-    T f = src[global_id];
-    T g = 0;
-    if(sizeof(T) == sizeof(int16_t))
-      g = cpu_htons(f);
-    else if(sizeof(T) == sizeof(int32_t))
-      g = cpu_htonl(f);
-    dst[global_id] = g;
+  T f = src[global_id];
+  T g = 0;
+  if (sizeof(T) == sizeof(int16_t))
+    g = cpu_htons(f);
+  else if (sizeof(T) == sizeof(int32_t))
+    g = cpu_htonl(f);
+  dst[global_id] = g;
 }
 
-template <typename T> static void gen_rand_val (T & val)
+template <typename T> static void cpu(int global_id, T src, T *dst)
 {
-    val = static_cast<T>(rand() );
+  T f = src;
+  T g = 0;
+  if (sizeof(T) == sizeof(int16_t))
+    g = cpu_htons(f);
+  else if (sizeof(T) == sizeof(int32_t))
+    g = cpu_htonl(f);
+  dst[global_id] = g;
 }
 
-template <typename T>
-inline static void print_data (T& val)
+template <typename T> inline static void print_data(T& val)
 {
-    if(sizeof(T) == sizeof(uint16_t))
-        printf(" %hx", val);
-    else
-        printf(" %x", val);
+  if(sizeof(T) == sizeof(uint16_t))
+    printf(" 0x%hx", val);
+  else
+    printf(" 0x%x", val);
 }
 
-template <typename T> static void dump_data (T* src, T* dst, int n)
+template <typename T> static void dump_data(T* raw, T* cpu, T* gpu, int n)
 {
-    printf("\nRaw: \n");
-    for (int32_t i = 0; i < (int32_t) n; ++i) {
-        print_data(((T *)buf_data[0])[i]);
-    }
+  printf("\nRaw: \n");
+  for (int32_t i = 0; i < (int32_t) n; ++i) {
+    print_data(raw[i]);
+  }
 
-    printf("\nCPU: \n");
-    for (int32_t i = 0; i < (int32_t) n; ++i) {
-        print_data(dst[i]);
-    }
-    printf("\nGPU: \n");
-    for (int32_t i = 0; i < (int32_t) n; ++i) {
-        print_data(((T *)buf_data[1])[i]);
-    }
+  printf("\nCPU: \n");
+  for (int32_t i = 0; i < (int32_t) n; ++i) {
+    print_data(cpu[i]);
+  }
+  printf("\nGPU: \n");
+  for (int32_t i = 0; i < (int32_t) n; ++i) {
+    print_data(gpu[i]);
+  }
 }
 
-template<typename T>
-void test(const char *kernel_name)
+template <typename T> static void dump_data(T raw, T* cpu, T* gpu, int n)
 {
-  const size_t n = 64;
-  T cpu_dst[n];
-  T cpu_src[n];
+  printf("\nRaw: \n");
+  print_data(raw);
+
+  printf("\nCPU: \n");
+  for (int32_t i = 0; i < (int32_t) n; ++i) {
+    print_data(cpu[i]);
+  }
+  printf("\nGPU: \n");
+  for (int32_t i = 0; i < (int32_t) n; ++i) {
+    print_data(gpu[i]);
+  }
+}
+
+void compiler_bswap(void)
+{
+  const size_t n = 32;
+  uint32_t src0[n];
+  uint16_t src1[n];
+  uint32_t dst0[n];
+  uint16_t dst1[n];
+  int32_t src2 = static_cast<int32_t>(rand());
+  int32_t dst2[n];
+  int16_t src3 = static_cast<int16_t>(rand());
+  int16_t dst3[n];
 
   // Setup kernel and buffers
-  OCL_CREATE_KERNEL_FROM_FILE("compiler_bswap", kernel_name);
-  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(T), NULL);
-  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(T), NULL);
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_bswap", "compiler_bswap");
+  OCL_CREATE_BUFFER(buf[0], 0, sizeof(src0), NULL);
   OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_CREATE_BUFFER(buf[1], 0, sizeof(dst0), NULL);
   OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
 
+  OCL_CREATE_BUFFER(buf[2], 0, sizeof(src1), NULL);
+  OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+  OCL_CREATE_BUFFER(buf[3], 0, sizeof(dst1), NULL);
+  OCL_SET_ARG(3, sizeof(cl_mem), &buf[3]);
+
+  OCL_SET_ARG(4, sizeof(int32_t), &src2);
+  OCL_CREATE_BUFFER(buf[4], 0, sizeof(dst2), NULL);
+  OCL_SET_ARG(5, sizeof(cl_mem), &buf[4]);
+
+  OCL_SET_ARG(6, sizeof(int16_t), &src3);
+  OCL_CREATE_BUFFER(buf[5], 0, sizeof(dst3), NULL);
+  OCL_SET_ARG(7, sizeof(cl_mem), &buf[5]);
+
   OCL_MAP_BUFFER(0);
   for (int32_t i = 0; i < (int32_t) n; ++i) {
-    gen_rand_val(cpu_src[i]);
+    gen_rand_val(src0[i]);
   }
-
-  memcpy(buf_data[0], cpu_src, sizeof(T) * n);
+  memcpy(buf_data[0], src0, sizeof(src0));
+  OCL_UNMAP_BUFFER(0);
 
   /* Clear the dst buffer to avoid random data. */
   OCL_MAP_BUFFER(1);
-  memset(buf_data[1], 0, sizeof(T) * n);
+  memset(buf_data[1], 0, sizeof(dst0));
   OCL_UNMAP_BUFFER(1);
 
+  OCL_MAP_BUFFER(2);
+  for (int32_t i = 0; i < (int32_t) n; ++i) {
+    gen_rand_val(src1[i]);
+  }
+  memcpy(buf_data[2], src1, sizeof(src1));
+  OCL_UNMAP_BUFFER(2);
+
+  /* Clear the dst buffer to avoid random data. */
+  OCL_MAP_BUFFER(3);
+  memset(buf_data[3], 0, sizeof(dst1));
+  OCL_UNMAP_BUFFER(3);
+
+  /* Clear the dst buffer to avoid random data. */
+  OCL_MAP_BUFFER(4);
+  memset(buf_data[4], 0, sizeof(dst2));
+  OCL_UNMAP_BUFFER(4);
+
+  /* Clear the dst buffer to avoid random data. */
+  OCL_MAP_BUFFER(5);
+  memset(buf_data[5], 0, sizeof(dst3));
+  OCL_UNMAP_BUFFER(5);
+
   globals[0] = n;
   locals[0] = 16;
   OCL_NDRANGE(1);
 
   // Run on CPU
-  for (int32_t i = 0; i < (int32_t) n; ++i)
-    cpu(i, cpu_src, cpu_dst);
+  for (int32_t i = 0; i < (int32_t) n; ++i) {
+    if (i%2) {
+      dst0[i] = src0[i];
+      continue;
+    }
+    cpu(i, src0, dst0);
+  }
+
+  // Run on CPU
+  for (int32_t i = 0; i < (int32_t) n; ++i) {
+    cpu(i, src1, dst1);
+
+    if (i%2) {
+      dst1[i] = dst1[i] + 1;
+      cpu(i, dst1, dst1);
+    }
+  }
 
+  // Run on CPU
   for (int32_t i = 0; i < (int32_t) n; ++i)
-    cpu_dst[i] = cpu_dst[i] -1;
+    cpu(i, src2, dst2);
 
   // Run on CPU
   for (int32_t i = 0; i < (int32_t) n; ++i)
-    cpu(i, cpu_dst, cpu_dst);
+    cpu(i, src3, dst3);
 
   OCL_MAP_BUFFER(1);
- // dump_data(cpu_src, cpu_dst, n);
+  //dump_data(src0, dst0, (uint32_t *)buf_data[1], n);
+  OCL_ASSERT(!memcmp(buf_data[1], dst0, sizeof(dst0)));
+  OCL_UNMAP_BUFFER(1);
 
-  OCL_ASSERT(!memcmp(buf_data[1], cpu_dst, sizeof(T) * n));
+  OCL_MAP_BUFFER(3);
+  //dump_data(src1, dst1, (uint16_t *)buf_data[3], n);
+  OCL_ASSERT(!memcmp(buf_data[3], dst1, sizeof(dst1)));
+  OCL_UNMAP_BUFFER(3);
 
-  OCL_UNMAP_BUFFER(1);
-  OCL_UNMAP_BUFFER(0);
-}
+  OCL_MAP_BUFFER(4);
+  //dump_data(src2, dst2, (int32_t *)buf_data[4], n);
+  OCL_ASSERT(!memcmp(buf_data[4], dst2, sizeof(dst2)));
+  OCL_UNMAP_BUFFER(4);
 
+  OCL_MAP_BUFFER(5);
+  //dump_data(src3, dst3, (int16_t *)buf_data[5], n);
+  OCL_ASSERT(!memcmp(buf_data[5], dst3, sizeof(dst3)));
+  OCL_UNMAP_BUFFER(5);
 }
 
-#define compiler_bswap(type, kernel) \
-static void compiler_bswap_ ##type(void)\
-{\
-  test<type>(# kernel);\
-}\
-MAKE_UTEST_FROM_FUNCTION(compiler_bswap_ ## type);
-
-compiler_bswap(int16_t, compiler_bswap_short)
-compiler_bswap(uint16_t, compiler_bswap_ushort)
-compiler_bswap(int32_t, compiler_bswap_int)
-compiler_bswap(uint32_t, compiler_bswap_uint)
+MAKE_UTEST_FROM_FUNCTION(compiler_bswap);
-- 
1.7.9.5


From zhigang.gong at linux.intel.com  Mon Mar  9 01:29:50 2015
From: zhigang.gong at linux.intel.com (Zhigang Gong)
Date: Mon, 9 Mar 2015 16:29:50 +0800
Subject: [Beignet] [PATCH 2/4] change the workitem related api to
 OVERLOABABLE.
In-Reply-To: <1425871468-11096-2-git-send-email-xionghu.luo@intel.com>
References: <1425871468-11096-1-git-send-email-xionghu.luo@intel.com>
 <1425871468-11096-2-git-send-email-xionghu.luo@intel.com>
Message-ID: <20150309082950.GD20578@ivb-gt2-rev4>

On Mon, Mar 09, 2015 at 11:24:26AM +0800, xionghu.luo at intel.com wrote:
> From: Luo Xionghu <xionghu.luo at intel.com>
> 
> the SPIR header file requires these functions to be overloadable.
> (https://github.com/KhronosGroup/SPIR-Tools/blob/master/headers/opencl_spir.h)
> 
> Signed-off-by: Luo Xionghu <xionghu.luo at intel.com>
> ---
>  backend/src/libocl/include/ocl_async.h    |  2 +-
>  backend/src/libocl/include/ocl_sync.h     |  2 +-
>  backend/src/libocl/include/ocl_types.h    |  2 --
>  backend/src/libocl/include/ocl_workitem.h | 16 ++++++++--------
>  backend/src/libocl/src/ocl_async.cl       |  2 +-
>  backend/src/libocl/src/ocl_barrier.ll     |  2 +-
>  backend/src/libocl/src/ocl_workitem.cl    |  6 +++---
>  kernels/compiler_async_copy.cl            |  4 ++--
>  8 files changed, 17 insertions(+), 19 deletions(-)
> 
> diff --git a/backend/src/libocl/include/ocl_async.h b/backend/src/libocl/include/ocl_async.h
> index dd89942..9d5cc06 100644
> --- a/backend/src/libocl/include/ocl_async.h
> +++ b/backend/src/libocl/include/ocl_async.h
> @@ -45,7 +45,7 @@ DEF(double)
>  #undef DEFN
>  #undef DEF
>  
> -void wait_group_events (int num_events, event_t *event_list);
> +OVERLOADABLE void wait_group_events (int num_events, event_t *event_list);
>  
>  #define DEFN(TYPE) \
>  OVERLOADABLE void prefetch(const global TYPE *p, size_t num);
> diff --git a/backend/src/libocl/include/ocl_sync.h b/backend/src/libocl/include/ocl_sync.h
> index ed7c6e4..18090d5 100644
> --- a/backend/src/libocl/include/ocl_sync.h
> +++ b/backend/src/libocl/include/ocl_sync.h
> @@ -27,7 +27,7 @@
>  #define CLK_GLOBAL_MEM_FENCE (1 << 1)
>  
>  typedef uint cl_mem_fence_flags;
> -void barrier(cl_mem_fence_flags flags);
> +OVERLOADABLE void barrier(cl_mem_fence_flags flags);
>  void mem_fence(cl_mem_fence_flags flags);
>  void read_mem_fence(cl_mem_fence_flags flags);
>  void write_mem_fence(cl_mem_fence_flags flags);
> diff --git a/backend/src/libocl/include/ocl_types.h b/backend/src/libocl/include/ocl_types.h
> index 487fe68..ae0236b 100644
> --- a/backend/src/libocl/include/ocl_types.h
> +++ b/backend/src/libocl/include/ocl_types.h
> @@ -87,7 +87,5 @@ DEF(double);
>  // FIXME:
>  // This is a transitional hack to bypass the LLVM 3.3 built-in types.
>  // See the Khronos SPIR specification for handling of these types.
> -typedef size_t __event_t;
> -#define event_t __event_t;
This change requires modifying the corresponding utest cases (async_work_group_strided_copy).
It's trivial; I will do that before pushing the whole patchset. All the other parts LGTM.

Thanks,
Zhigang Gong.

>  
>  #endif /* __OCL_TYPES_H__ */
> diff --git a/backend/src/libocl/include/ocl_workitem.h b/backend/src/libocl/include/ocl_workitem.h
> index 7534ee8..84bb1fb 100644
> --- a/backend/src/libocl/include/ocl_workitem.h
> +++ b/backend/src/libocl/include/ocl_workitem.h
> @@ -20,13 +20,13 @@
>  
>  #include "ocl_types.h"
>  
> -uint get_work_dim(void);
> -uint get_global_size(uint dimindx);
> -uint get_global_id(uint dimindx);
> -uint get_local_size(uint dimindx);
> -uint get_local_id(uint dimindx);
> -uint get_num_groups(uint dimindx);
> -uint get_group_id(uint dimindx);
> -uint get_global_offset(uint dimindx);
> +OVERLOADABLE uint get_work_dim(void);
> +OVERLOADABLE uint get_global_size(uint dimindx);
> +OVERLOADABLE uint get_global_id(uint dimindx);
> +OVERLOADABLE uint get_local_size(uint dimindx);
> +OVERLOADABLE uint get_local_id(uint dimindx);
> +OVERLOADABLE uint get_num_groups(uint dimindx);
> +OVERLOADABLE uint get_group_id(uint dimindx);
> +OVERLOADABLE uint get_global_offset(uint dimindx);
>  
>  #endif  /* __OCL_WORKITEM_H__ */
> diff --git a/backend/src/libocl/src/ocl_async.cl b/backend/src/libocl/src/ocl_async.cl
> index 041aaf2..10d0aa4 100644
> --- a/backend/src/libocl/src/ocl_async.cl
> +++ b/backend/src/libocl/src/ocl_async.cl
> @@ -66,7 +66,7 @@ DEF(double)
>  #undef DEFN
>  #undef DEF
>  
> -void wait_group_events (int num_events, event_t *event_list) {
> +OVERLOADABLE void wait_group_events (int num_events, event_t *event_list) {
>    barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);
>  }
>  
> diff --git a/backend/src/libocl/src/ocl_barrier.ll b/backend/src/libocl/src/ocl_barrier.ll
> index 4e55fcb..dc3579c 100644
> --- a/backend/src/libocl/src/ocl_barrier.ll
> +++ b/backend/src/libocl/src/ocl_barrier.ll
> @@ -10,7 +10,7 @@ declare void @__gen_ocl_barrier_local() nounwind alwaysinline noduplicate
>  declare void @__gen_ocl_barrier_global() nounwind alwaysinline noduplicate
>  declare void @__gen_ocl_barrier_local_and_global() nounwind alwaysinline noduplicate
>  
> -define void @barrier(i32 %flags) nounwind noduplicate alwaysinline {
> +define void @_Z7barrierj(i32 %flags) nounwind noduplicate alwaysinline {
>    %1 = icmp eq i32 %flags, 3
>    br i1 %1, label %barrier_local_global, label %barrier_local_check
>  
> diff --git a/backend/src/libocl/src/ocl_workitem.cl b/backend/src/libocl/src/ocl_workitem.cl
> index f4629f8..6ddc406 100644
> --- a/backend/src/libocl/src/ocl_workitem.cl
> +++ b/backend/src/libocl/src/ocl_workitem.cl
> @@ -18,7 +18,7 @@
>  #include "ocl_workitem.h"
>  
>  PURE CONST uint __gen_ocl_get_work_dim(void);
> -uint get_work_dim(void)
> +OVERLOADABLE uint get_work_dim(void)
>  {
>    return __gen_ocl_get_work_dim();
>  }
> @@ -37,7 +37,7 @@ DECL_INTERNAL_WORK_ITEM_FN(get_num_groups)
>  #undef DECL_INTERNAL_WORK_ITEM_FN
>  
>  #define DECL_PUBLIC_WORK_ITEM_FN(NAME, OTHER_RET)    \
> -unsigned NAME(unsigned int dim) {             \
> +OVERLOADABLE unsigned NAME(unsigned int dim) {             \
>    if (dim == 0) return __gen_ocl_##NAME##0();        \
>    else if (dim == 1) return __gen_ocl_##NAME##1();   \
>    else if (dim == 2) return __gen_ocl_##NAME##2();   \
> @@ -52,6 +52,6 @@ DECL_PUBLIC_WORK_ITEM_FN(get_global_offset, 0)
>  DECL_PUBLIC_WORK_ITEM_FN(get_num_groups, 1)
>  #undef DECL_PUBLIC_WORK_ITEM_FN
>  
> -uint get_global_id(uint dim) {
> +OVERLOADABLE uint get_global_id(uint dim) {
>    return get_local_id(dim) + get_local_size(dim) * get_group_id(dim) + get_global_offset(dim);
>  }
> diff --git a/kernels/compiler_async_copy.cl b/kernels/compiler_async_copy.cl
> index dddde44..4beb436 100644
> --- a/kernels/compiler_async_copy.cl
> +++ b/kernels/compiler_async_copy.cl
> @@ -5,10 +5,10 @@ compiler_async_copy_##TYPE(__global TYPE *dst, __global TYPE *src, __local TYPE
>    event_t event; \
>    int copiesPerWorkgroup = copiesPerWorkItem * get_local_size(0); \
>    int i; \
> -  event = async_work_group_copy((__local TYPE*)localBuffer, (__global const TYPE*)(src+copiesPerWorkgroup*get_group_id(0)), (size_t)copiesPerWorkgroup, (event_t)0 ); \
> +  event = async_work_group_copy((__local TYPE*)localBuffer, (__global const TYPE*)(src+copiesPerWorkgroup*get_group_id(0)), (size_t)copiesPerWorkgroup, 0 ); \
>    wait_group_events( 1, &event ); \
>  \
> -  event = async_work_group_copy((__global TYPE*)(dst+copiesPerWorkgroup*get_group_id(0)), (__local const TYPE*)localBuffer, (size_t)copiesPerWorkgroup, (event_t)0 ); \
> +  event = async_work_group_copy((__global TYPE*)(dst+copiesPerWorkgroup*get_group_id(0)), (__local const TYPE*)localBuffer, (size_t)copiesPerWorkgroup, 0 ); \
>    wait_group_events( 1, &event ); \
>  }
>  
> -- 
> 1.9.1
> 
> _______________________________________________
> Beignet mailing list
> Beignet at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/beignet

From zhigang.gong at linux.intel.com  Mon Mar  9 01:32:16 2015
From: zhigang.gong at linux.intel.com (Zhigang Gong)
Date: Mon, 9 Mar 2015 16:32:16 +0800
Subject: [Beignet] [PATCH 1/9 V3] Backend: Add the indirect fields and
 functions for gen register.
In-Reply-To: <1425888644-1066-1-git-send-email-junyan.he@inbox.com>
References: <1425888644-1066-1-git-send-email-junyan.he@inbox.com>
Message-ID: <20150309083215.GE20578@ivb-gt2-rev4>

This patch LGTM. And just as we discussed, I merged the 5th and 6th patches,
and will push the whole patchset later.

Thanks.

On Mon, Mar 09, 2015 at 04:10:44PM +0800, junyan.he at inbox.com wrote:
> From: Junyan He <junyan.he at linux.intel.com>
> 
> Add a0_subnr and addr_imm to GenRegister, in order to
> represent the indirect register, which may be some
> imm offset from a0.x subregister's base address.
> Also add to_indirect1xN helper function to convert a register
> to an indirect 1XN register.
> 
> V3:
>    1. Add Gen8 encoder setting.
>    2. Reorder the patches.
>    3. Add logic for gen8 context, using 16 a0 sub-registers.
>    4. Fix some bugs of uniform src.
> 
> Signed-off-by: Junyan He <junyan.he at linux.intel.com>
> ---
>  backend/src/backend/gen_register.hpp |   30 +++++++++++++++++++++++++++++-
>  1 file changed, 29 insertions(+), 1 deletion(-)
> 
> diff --git a/backend/src/backend/gen_register.hpp b/backend/src/backend/gen_register.hpp
> index 08c7277..3b40b67 100644
> --- a/backend/src/backend/gen_register.hpp
> +++ b/backend/src/backend/gen_register.hpp
> @@ -205,6 +205,8 @@ namespace gbe
>        this->quarter = 0;
>        this->nr = this->subnr = 0;
>        this->address_mode = GEN_ADDRESS_DIRECT;
> +      this->a0_subnr = 0;
> +      this->addr_imm = 0;
>      }
>  
>      /*! For specific physical registers only */
> @@ -229,6 +231,8 @@ namespace gbe
>        this->hstride = hstride;
>        this->quarter = 0;
>        this->address_mode = GEN_ADDRESS_DIRECT;
> +      this->a0_subnr = 0;
> +      this->addr_imm = 0;
>      }
>  
>      /*! Return the IR virtual register */
> @@ -258,6 +262,8 @@ namespace gbe
>      uint32_t hstride:2;      //!< Horizontal stride
>      uint32_t quarter:1;      //!< To choose which part we want (Q1 / Q2)
>      uint32_t address_mode:1; //!< direct or indirect
> +    uint32_t a0_subnr:4;     //!< In indirect mode, use a0.nr as the base.
> +    int32_t addr_imm:10;     //!< In indirect mode, the imm as address offset from a0.
>  
>      static INLINE GenRegister offset(GenRegister reg, int nr, int subnr = 0) {
>        GenRegister r = reg;
> @@ -835,6 +841,28 @@ namespace gbe
>        return reg;
>      }
>  
> +    /*! convert one register to indirectly mode */
> +    static INLINE GenRegister to_indirect1xN(GenRegister reg, uint32_t base_addr,
> +                                          int32_t imm_off = 4096, int a0_subnr = 0) {
> +      GenRegister r = reg;
> +      int32_t offset;
> +      if (imm_off > 4095) {
> +        offset = (r.nr*32 + r.subnr) - base_addr;
> +      } else {
> +        offset = imm_off;
> +      }
> +
> +      GBE_ASSERT(offset <= 511 && offset>=-512);
> +      r.a0_subnr = a0_subnr;
> +      r.addr_imm = offset;
> +      r.address_mode = GEN_ADDRESS_REGISTER_INDIRECT_REGISTER;
> +
> +      r.width = GEN_WIDTH_1;
> +      r.vstride = GEN_VERTICAL_STRIDE_ONE_DIMENSIONAL;
> +      r.hstride = GEN_HORIZONTAL_STRIDE_0;
> +      return r;
> +    }
> +
>      static INLINE GenRegister vec16(uint32_t file, uint32_t nr, uint32_t subnr) {
>        return GenRegister(file,
>                           nr,
> @@ -953,7 +981,7 @@ namespace gbe
>      }
>  
>      static INLINE GenRegister uw1(uint32_t file, uint32_t nr, uint32_t subnr) {
> -      return suboffset(retype(vec1(file, nr, 0), GEN_TYPE_UW), subnr);
> +      return offset(retype(vec1(file, nr, 0), GEN_TYPE_UW), 0, typeSize(GEN_TYPE_UW)*subnr);
>      }
>  
>      static INLINE GenRegister ub16(uint32_t file, uint32_t nr, uint32_t subnr) {
> -- 
> 1.7.9.5
> 
> _______________________________________________
> Beignet mailing list
> Beignet at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/beignet

From chris at chris-wilson.co.uk  Mon Mar  9 05:02:18 2015
From: chris at chris-wilson.co.uk (Chris Wilson)
Date: Mon, 9 Mar 2015 12:02:18 +0000
Subject: [Beignet] [Intel-gfx] Preventing zero GPU virtual address
 allocation
In-Reply-To: <DF876E69000F0E4DB19B760E3EBA5C7501C607BC@SHSMSX103.ccr.corp.intel.com>
References: <148B1B7A67D1C24B9EF0BE42EA4977062B7DCBFB@SHSMSX101.ccr.corp.intel.com>
 <20150305125251.GA18775@phenom.ffwll.local>
 <20150305130121.GA18784@nuc-i3427.alporthouse.com>
 <20150305152758.GI18775@phenom.ffwll.local>
 <DF876E69000F0E4DB19B760E3EBA5C7501C5F9C1@SHSMSX103.ccr.corp.intel.com>
 <20150306083919.GH18784@nuc-i3427.alporthouse.com>
 <DF876E69000F0E4DB19B760E3EBA5C7501C607BC@SHSMSX103.ccr.corp.intel.com>
Message-ID: <20150309120218.GD23680@nuc-i3427.alporthouse.com>

On Mon, Mar 09, 2015 at 02:34:46AM +0000, Zou, Nanhai wrote:
> We don't need MAP_FIXED, we just want to avoid address 0 to be allocated.
> 
> Though I think using MAP_FIXED is overkill, will bring much unnecessary complexity on both kernel and beignet side.
> I don't mind if people can provide stable MAP_FIXED patches to resolve this problem a few months or years later.
> 
> At that time, kernel driver can revert the reserve page 0 patch.
> Before that reserve page 0 can benefit all the Beignet user without breaking anything.

The point is that it becomes ABI. So no, the kernel can't just revert it.
There is nothing special about address 0 in either GTT or virtual memory.
If you require a special object allocated at address 0, allocate a
special object at address 0.
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre

From jbarnes at virtuousgeek.org  Mon Mar  9 08:46:15 2015
From: jbarnes at virtuousgeek.org (Jesse Barnes)
Date: Mon, 09 Mar 2015 08:46:15 -0700
Subject: [Beignet] [Intel-gfx] Preventing zero GPU virtual address
	allocation
In-Reply-To: <20150305210702.GC18784@nuc-i3427.alporthouse.com>
References: <148B1B7A67D1C24B9EF0BE42EA4977062B7DCBFB@SHSMSX101.ccr.corp.intel.com>
 <20150305125251.GA18775@phenom.ffwll.local>
 <20150305130121.GA18784@nuc-i3427.alporthouse.com>
 <20150305152758.GI18775@phenom.ffwll.local>
 <20150305210702.GC18784@nuc-i3427.alporthouse.com>
Message-ID: <54FDC047.9090202@virtuousgeek.org>

On 03/05/2015 01:07 PM, Chris Wilson wrote:
> On Thu, Mar 05, 2015 at 04:27:59PM +0100, Daniel Vetter wrote:
>> I recommended exposing the PIN_BIAS since that will work without full
>> ppgtt too. And yeah for full ppgtt we could just use svm where userspace
>> controls the address, but since that's still a bit out we might need a
>> quick interim solution?
> 
> Letting userspace control the address of bo used in a batch is about 2
> patches each of ~100 lines. And it could be used with full-ppgtt before
> svm if mesa wants to take complete control of its layout. I think it is
> one of those useful tools that is likely to find uses far beyond the
> initial justification.

Well we need someone to pick it up and do it; we've already shafted
userspace for several years due to foot dragging on the command
parser...  I hope something as simple as this doesn't stall out.

Jesse


From jbarnes at virtuousgeek.org  Mon Mar  9 08:49:15 2015
From: jbarnes at virtuousgeek.org (Jesse Barnes)
Date: Mon, 09 Mar 2015 08:49:15 -0700
Subject: [Beignet] [Intel-gfx] Preventing zero GPU virtual address
	allocation
In-Reply-To: <54FDC047.9090202@virtuousgeek.org>
References: <148B1B7A67D1C24B9EF0BE42EA4977062B7DCBFB@SHSMSX101.ccr.corp.intel.com>
 <20150305125251.GA18775@phenom.ffwll.local>
 <20150305130121.GA18784@nuc-i3427.alporthouse.com>
 <20150305152758.GI18775@phenom.ffwll.local>
 <20150305210702.GC18784@nuc-i3427.alporthouse.com>
 <54FDC047.9090202@virtuousgeek.org>
Message-ID: <54FDC0FB.40808@virtuousgeek.org>

On 03/09/2015 08:46 AM, Jesse Barnes wrote:
> On 03/05/2015 01:07 PM, Chris Wilson wrote:
>> On Thu, Mar 05, 2015 at 04:27:59PM +0100, Daniel Vetter wrote:
>>> I recommended exposing the PIN_BIAS since that will work without full
>>> ppgtt too. And yeah for full ppgtt we could just use svm where userspace
>>> controls the address, but since that's still a bit out we might need a
>>> quick interim solution?
>>
>> Letting userspace control the address of bo used in a batch is about 2
>> patches each of ~100 lines. And it could be used with full-ppgtt before
>> svm if mesa wants to take complete control of its layout. I think it is
>> one of those useful tools that is likely to find uses far beyond the
>> initial justification.
> 
> Well we need someone to pick it up and do it; we've already shafted
> userspace for several years due to foot dragging on the command
> parser...  I hope something as simple as this doesn't stall out.

Nevermind, I see you already did it.  Ruiling and Nanhai, can you make
sure Chris's proposed interface will work for you and provide a review
if so?

Thanks,
Jesse


From jeff.mcgee at intel.com  Mon Mar  9 16:06:54 2015
From: jeff.mcgee at intel.com (jeff.mcgee at intel.com)
Date: Mon,  9 Mar 2015 16:06:54 -0700
Subject: [Beignet] [PATCH v2] drm/i915: Export total subslice and EU counts
In-Reply-To: <1425339452-18875-1-git-send-email-jeff.mcgee@intel.com>
References: <1425339452-18875-1-git-send-email-jeff.mcgee@intel.com>
Message-ID: <1425942414-27780-1-git-send-email-jeff.mcgee@intel.com>

From: Jeff McGee <jeff.mcgee at intel.com>

Setup new I915_GETPARAM ioctl entries for subslice total and
EU total. Userspace drivers need these values when constructing
GPGPU commands. This kernel query method is intended to replace
the PCI ID-based tables that userspace drivers currently maintain.
The kernel driver can employ fuse register reads as needed to
ensure the most accurate determination of GT config attributes.
This first became important with Cherryview in which the config
could differ between devices with the same PCI ID.

The kernel detection of these values is device-specific and not
included in this patch. Because zero is not a valid value for any of
these parameters, a value of zero is interpreted as unknown for the
device. Userspace drivers should continue to maintain ID-based tables
for older devices not supported by the new query method.

v2: Increment our I915_GETPARAM indices to fit after REVISION
    which was merged ahead of us.

For: VIZ-4636
Signed-off-by: Jeff McGee <jeff.mcgee at intel.com>
---
 drivers/gpu/drm/i915/i915_dma.c | 10 ++++++++++
 include/uapi/drm/i915_drm.h     |  2 ++
 2 files changed, 12 insertions(+)

diff --git a/drivers/gpu/drm/i915/i915_dma.c b/drivers/gpu/drm/i915/i915_dma.c
index 8e91430..d49ed68 100644
--- a/drivers/gpu/drm/i915/i915_dma.c
+++ b/drivers/gpu/drm/i915/i915_dma.c
@@ -153,6 +153,16 @@ static int i915_getparam(struct drm_device *dev, void *data,
 	case I915_PARAM_MMAP_VERSION:
 		value = 1;
 		break;
+	case I915_PARAM_SUBSLICE_TOTAL:
+		value = INTEL_INFO(dev)->subslice_total;
+		if (!value)
+			return -ENODEV;
+		break;
+	case I915_PARAM_EU_TOTAL:
+		value = INTEL_INFO(dev)->eu_total;
+		if (!value)
+			return -ENODEV;
+		break;
 	default:
 		DRM_DEBUG("Unknown parameter %d\n", param->param);
 		return -EINVAL;
diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
index b768f3b..8d1be90 100644
--- a/include/uapi/drm/i915_drm.h
+++ b/include/uapi/drm/i915_drm.h
@@ -348,6 +348,8 @@ typedef struct drm_i915_irq_wait {
 #define I915_PARAM_MMAP_VERSION          30
 #define I915_PARAM_HAS_BSD2		 31
 #define I915_PARAM_REVISION              32
+#define I915_PARAM_SUBSLICE_TOTAL	 33
+#define I915_PARAM_EU_TOTAL		 34
 
 typedef struct drm_i915_getparam {
 	int param;
-- 
2.3.0


From jeff.mcgee at intel.com  Mon Mar  9 16:13:03 2015
From: jeff.mcgee at intel.com (jeff.mcgee at intel.com)
Date: Mon,  9 Mar 2015 16:13:03 -0700
Subject: [Beignet] [PATCH 1/2 v2] intel: Export total subslice and EU counts
In-Reply-To: <1425339567-18933-1-git-send-email-jeff.mcgee@intel.com>
References: <1425339567-18933-1-git-send-email-jeff.mcgee@intel.com>
Message-ID: <1425942784-27957-1-git-send-email-jeff.mcgee@intel.com>

From: Jeff McGee <jeff.mcgee at intel.com>

Update kernel interface with new I915_GETPARAM ioctl entries for
subslice total and EU total. Add a wrapping function for each
parameter. Userspace drivers need these values when constructing
GPGPU commands. This kernel query method is intended to replace
the PCI ID-based tables that userspace drivers currently maintain.
The kernel driver can employ fuse register reads as needed to
ensure the most accurate determination of GT config attributes.
This first became important with Cherryview in which the config
could differ between devices with the same PCI ID.

The kernel detection of these values is device-specific. Userspace
drivers should continue to maintain ID-based tables for older
devices which return ENODEV when using this query.

v2: remove unnecessary include of <stdbool.h> and increment the
    I915_GETPARAM indices to match updated kernel patch.

For: VIZ-4636
Signed-off-by: Jeff McGee <jeff.mcgee at intel.com>
---
 include/drm/i915_drm.h   |  2 ++
 intel/intel_bufmgr.h     |  3 +++
 intel/intel_bufmgr_gem.c | 31 +++++++++++++++++++++++++++++++
 3 files changed, 36 insertions(+)

diff --git a/include/drm/i915_drm.h b/include/drm/i915_drm.h
index 15dd01d..b037e56 100644
--- a/include/drm/i915_drm.h
+++ b/include/drm/i915_drm.h
@@ -340,6 +340,8 @@ typedef struct drm_i915_irq_wait {
 #define I915_PARAM_HAS_EXEC_HANDLE_LUT   26
 #define I915_PARAM_HAS_WT     	 	 27
 #define I915_PARAM_CMD_PARSER_VERSION	 28
+#define I915_PARAM_SUBSLICE_TOTAL	 33
+#define I915_PARAM_EU_TOTAL		 34
 
 typedef struct drm_i915_getparam {
 	int param;
diff --git a/intel/intel_bufmgr.h b/intel/intel_bufmgr.h
index be83a56..285919e 100644
--- a/intel/intel_bufmgr.h
+++ b/intel/intel_bufmgr.h
@@ -264,6 +264,9 @@ int drm_intel_get_reset_stats(drm_intel_context *ctx,
 			      uint32_t *active,
 			      uint32_t *pending);
 
+int drm_intel_get_subslice_total(int fd, unsigned int *subslice_total);
+int drm_intel_get_eu_total(int fd, unsigned int *eu_total);
+
 /** @{ Compatibility defines to keep old code building despite the symbol rename
  * from dri_* to drm_intel_*
  */
diff --git a/intel/intel_bufmgr_gem.c b/intel/intel_bufmgr_gem.c
index acbfd4a..5a67f53 100644
--- a/intel/intel_bufmgr_gem.c
+++ b/intel/intel_bufmgr_gem.c
@@ -3295,6 +3295,37 @@ drm_intel_reg_read(drm_intel_bufmgr *bufmgr,
 	return ret;
 }
 
+drm_public int
+drm_intel_get_subslice_total(int fd, unsigned int *subslice_total)
+{
+	drm_i915_getparam_t gp;
+	int ret;
+
+	memclear(gp);
+	gp.value = (int*)subslice_total;
+	gp.param = I915_PARAM_SUBSLICE_TOTAL;
+	ret = drmIoctl(fd, DRM_IOCTL_I915_GETPARAM, &gp);
+	if (ret)
+		return -errno;
+
+	return 0;
+}
+
+drm_public int
+drm_intel_get_eu_total(int fd, unsigned int *eu_total)
+{
+	drm_i915_getparam_t gp;
+	int ret;
+
+	memclear(gp);
+	gp.value = (int*)eu_total;
+	gp.param = I915_PARAM_EU_TOTAL;
+	ret = drmIoctl(fd, DRM_IOCTL_I915_GETPARAM, &gp);
+	if (ret)
+		return -errno;
+
+	return 0;
+}
 
 /**
  * Annotate the given bo for use in aub dumping.
-- 
2.3.0


From jeff.mcgee at intel.com  Mon Mar  9 16:13:04 2015
From: jeff.mcgee at intel.com (jeff.mcgee at intel.com)
Date: Mon,  9 Mar 2015 16:13:04 -0700
Subject: [Beignet] [PATCH 2/2] configure.ac: bump version to 2.4.60 for
	release
In-Reply-To: <1425942784-27957-1-git-send-email-jeff.mcgee@intel.com>
References: <1425339567-18933-1-git-send-email-jeff.mcgee@intel.com>
 <1425942784-27957-1-git-send-email-jeff.mcgee@intel.com>
Message-ID: <1425942784-27957-2-git-send-email-jeff.mcgee@intel.com>

From: Jeff McGee <jeff.mcgee at intel.com>

Signed-off-by: Jeff McGee <jeff.mcgee at intel.com>
---
 configure.ac | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/configure.ac b/configure.ac
index 8afee83..278f29b 100644
--- a/configure.ac
+++ b/configure.ac
@@ -20,7 +20,7 @@
 
 AC_PREREQ([2.63])
 AC_INIT([libdrm],
-        [2.4.59],
+        [2.4.60],
         [https://bugs.freedesktop.org/enter_bug.cgi?product=DRI],
         [libdrm])
 
-- 
2.3.0


From jeff.mcgee at intel.com  Mon Mar  9 16:19:31 2015
From: jeff.mcgee at intel.com (jeff.mcgee at intel.com)
Date: Mon,  9 Mar 2015 16:19:31 -0700
Subject: [Beignet] [PATCH i-g-t 1/2] tests/core_getparams: Create new test
	core_getparams
In-Reply-To: <1425339642-18988-1-git-send-email-jeff.mcgee@intel.com>
References: <1425339642-18988-1-git-send-email-jeff.mcgee@intel.com>
Message-ID: <1425943172-28040-1-git-send-email-jeff.mcgee@intel.com>

From: Jeff McGee <jeff.mcgee at intel.com>

New test core_getparams consists of 2 subtests, each one testing
the ability of userspace to query the correct value of a GT config
attribute: subslice total or EU total. drm/i915 implementation of
these queries is required for Cherryview and Gen9+ devices (non-
simulated).

For: VIZ-4636
Signed-off-by: Jeff McGee <jeff.mcgee at intel.com>
---
 tests/.gitignore       |   1 +
 tests/Makefile.sources |   1 +
 tests/core_getparams.c | 145 +++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 147 insertions(+)
 create mode 100644 tests/core_getparams.c

diff --git a/tests/.gitignore b/tests/.gitignore
index 7b4dd94..39b4e28 100644
--- a/tests/.gitignore
+++ b/tests/.gitignore
@@ -1,6 +1,7 @@
 # Please keep sorted alphabetically
 core_get_client_auth
 core_getclient
+core_getparams
 core_getstats
 core_getversion
 drm_import_export
diff --git a/tests/Makefile.sources b/tests/Makefile.sources
index 51e8376..999c8f8 100644
--- a/tests/Makefile.sources
+++ b/tests/Makefile.sources
@@ -15,6 +15,7 @@ NOUVEAU_TESTS_M = \
 
 TESTS_progs_M = \
 	core_get_client_auth \
+	core_getparams \
 	drv_suspend \
 	drv_hangman \
 	gem_bad_reloc \
diff --git a/tests/core_getparams.c b/tests/core_getparams.c
new file mode 100644
index 0000000..37a4f63
--- /dev/null
+++ b/tests/core_getparams.c
@@ -0,0 +1,145 @@
+/*
+ * Copyright © 2014 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Jeff McGee <jeff.mcgee at intel.com>
+ *
+ */
+
+#include <unistd.h>
+#include <errno.h>
+#include <xf86drm.h>
+#include "drmtest.h"
+#include "intel_chipset.h"
+#include "intel_bufmgr.h"
+
+int drm_fd;
+int devid;
+
+static void
+init(void)
+{
+	drm_fd = drm_open_any();
+	devid = intel_get_drm_devid(drm_fd);
+}
+
+static void
+deinit(void)
+{
+	close(drm_fd);
+}
+
+static void
+subslice_total(void)
+{
+	unsigned int subslice_total = 0;
+	int ret;
+
+	ret = drm_intel_get_subslice_total(drm_fd, &subslice_total);
+
+	if (ret) {
+		/*
+		 * These devices are not required to implement the
+		 * interface. If they do not, -ENODEV must be returned.
+		*/
+		if ((intel_gen(devid) < 8) ||
+		    IS_BROADWELL(devid) ||
+		    igt_run_in_simulation()) {
+			igt_assert(ret == -ENODEV);
+			igt_info("subslice total: unknown\n");
+		/*
+		 * All other devices must implement the interface, so
+		 * fail them if we are here.
+		*/
+		} else {
+			igt_assert(ret != EINVAL); /* request not recognized? */
+			igt_assert(ret != ENODEV); /* device not supported? */
+			igt_assert(ret == 0); /* other error? */
+		}
+	} else {
+		/*
+		 * On success, just make sure the returned count value is
+		 * non-zero. The validity of the count value for the given
+		 * device is not checked.
+		*/
+		igt_assert(subslice_total != 0);
+		igt_info("subslice total: %u\n", subslice_total);
+	}
+}
+
+static void
+eu_total(void)
+{
+	unsigned int eu_total = 0;
+	int ret;
+
+	ret = drm_intel_get_eu_total(drm_fd, &eu_total);
+
+	if (ret) {
+		/*
+		 * These devices are not required to implement the
+		 * interface. If they do not, -ENODEV must be returned.
+		*/
+		if ((intel_gen(devid) < 8) ||
+		    IS_BROADWELL(devid) ||
+		    igt_run_in_simulation()) {
+			igt_assert(ret == -ENODEV);
+			igt_info("EU total: unknown\n");
+		/*
+		 * All other devices must implement the interface, so
+		 * fail them if we are here.
+		*/
+		} else {
+			igt_assert(ret != EINVAL); /* request not recognized? */
+			igt_assert(ret != ENODEV); /* device not supported? */
+			igt_assert(ret == 0); /* other error? */
+		}
+	} else {
+		/*
+		 * On success, just make sure the returned count value is
+		 * non-zero. The validity of the count value for the given
+		 * device is not checked.
+		*/
+		igt_assert(eu_total != 0);
+		igt_info("EU total: %u\n", eu_total);
+	}
+}
+
+static void
+exit_handler(int sig)
+{
+	deinit();
+}
+
+igt_main
+{
+	igt_fixture {
+		igt_install_exit_handler(exit_handler);
+		init();
+	}
+
+	igt_subtest("subslice-total")
+		subslice_total();
+
+	igt_subtest("eu-total")
+		eu_total();
+}
-- 
2.3.0


From jeff.mcgee at intel.com  Mon Mar  9 16:19:32 2015
From: jeff.mcgee at intel.com (jeff.mcgee at intel.com)
Date: Mon,  9 Mar 2015 16:19:32 -0700
Subject: [Beignet] [PATCH i-g-t 2/2] configure: Bump required libdrm version
	to 2.4.60
In-Reply-To: <1425943172-28040-1-git-send-email-jeff.mcgee@intel.com>
References: <1425339642-18988-1-git-send-email-jeff.mcgee@intel.com>
 <1425943172-28040-1-git-send-email-jeff.mcgee@intel.com>
Message-ID: <1425943172-28040-2-git-send-email-jeff.mcgee@intel.com>

From: Jeff McGee <jeff.mcgee at intel.com>

tests/core_getparams needs the new libdrm interfaces for
querying subslice and EU counts.

For: VIZ-4636
Signed-off-by: Jeff McGee <jeff.mcgee at intel.com>
---
 configure.ac | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/configure.ac b/configure.ac
index 16d6a2e..88a1c3d 100644
--- a/configure.ac
+++ b/configure.ac
@@ -82,7 +82,7 @@ if test "x$GCC" = "xyes"; then
 fi
 AC_SUBST(ASSEMBLER_WARN_CFLAGS)
 
-PKG_CHECK_MODULES(DRM, [libdrm_intel >= 2.4.52 libdrm])
+PKG_CHECK_MODULES(DRM, [libdrm_intel >= 2.4.60 libdrm])
 PKG_CHECK_MODULES(PCIACCESS, [pciaccess >= 0.10])
 PKG_CHECK_MODULES(OVERLAY_XVLIB, [xv x11 xext dri2proto >= 2.6], enable_overlay_xvlib=yes, enable_overlay_xvlib=no)
 PKG_CHECK_MODULES(OVERLAY_XLIB, [cairo-xlib dri2proto >= 2.6], enable_overlay_xlib=yes, enable_overlay_xlib=no)
-- 
2.3.0


From jeff.mcgee at intel.com  Mon Mar  9 16:30:06 2015
From: jeff.mcgee at intel.com (jeff.mcgee at intel.com)
Date: Mon,  9 Mar 2015 16:30:06 -0700
Subject: [Beignet] [PATCH i-g-t 2/2] configure: Bump required libdrm version
	to 2.4.60
In-Reply-To: <1425943172-28040-1-git-send-email-jeff.mcgee@intel.com>
References: <1425943172-28040-1-git-send-email-jeff.mcgee@intel.com>
Message-ID: <1425943806-28142-1-git-send-email-jeff.mcgee@intel.com>

From: Jeff McGee <jeff.mcgee at intel.com>

tests/core_getparams needs the new libdrm interfaces for
querying subslice and EU counts.

For: VIZ-4636
Signed-off-by: Jeff McGee <jeff.mcgee at intel.com>
---
 configure.ac | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/configure.ac b/configure.ac
index 16d6a2e..88a1c3d 100644
--- a/configure.ac
+++ b/configure.ac
@@ -82,7 +82,7 @@ if test "x$GCC" = "xyes"; then
 fi
 AC_SUBST(ASSEMBLER_WARN_CFLAGS)
 
-PKG_CHECK_MODULES(DRM, [libdrm_intel >= 2.4.52 libdrm])
+PKG_CHECK_MODULES(DRM, [libdrm_intel >= 2.4.60 libdrm])
 PKG_CHECK_MODULES(PCIACCESS, [pciaccess >= 0.10])
 PKG_CHECK_MODULES(OVERLAY_XVLIB, [xv x11 xext dri2proto >= 2.6], enable_overlay_xvlib=yes, enable_overlay_xvlib=no)
 PKG_CHECK_MODULES(OVERLAY_XLIB, [cairo-xlib dri2proto >= 2.6], enable_overlay_xlib=yes, enable_overlay_xlib=no)
-- 
2.3.0


From jeff.mcgee at intel.com  Mon Mar  9 16:35:58 2015
From: jeff.mcgee at intel.com (jeff.mcgee at intel.com)
Date: Mon,  9 Mar 2015 16:35:58 -0700
Subject: [Beignet] [PATCH 2/2 v2] Query the driver directly for compute
	units and subslice
In-Reply-To: <1425339759-19027-2-git-send-email-jeff.mcgee@intel.com>
References: <1425339759-19027-2-git-send-email-jeff.mcgee@intel.com>
Message-ID: <1425944158-28223-1-git-send-email-jeff.mcgee@intel.com>

From: Jeff McGee <jeff.mcgee at intel.com>

Values of device max compute units and max subslice obtained
directly from the driver should be more accurate than our own
ID-based lookup values. This is particularly important when a
single device ID may encompass more than one configuration. If
the driver cannot provide a valid value for the given device,
we fall back on the ID-based lookup value.

This query requires libdrm 2.4.60. For now we will consider
the use of this query to be optional and exclude it from
compilation when building against older libdrm. Later we may
want to consider requiring the query or at least warning
more strongly when it is not supported.

v2: Make feature use conditional on libdrm version (Zhigang).

Signed-off-by: Jeff McGee <jeff.mcgee at intel.com>
---
 CMakeLists.txt           |  9 +++++++++
 src/CMakeLists.txt       | 10 ++++++++++
 src/intel/intel_driver.c | 25 +++++++++++++++++++++----
 3 files changed, 40 insertions(+), 4 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 65f2c70..bb03566 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -131,6 +131,15 @@ IF(DRM_INTEL_FOUND)
   ELSE(DRM_INTEL_VERSION VERSION_GREATER 2.4.57)
     MESSAGE(STATUS "Disable userptr support")
   ENDIF(DRM_INTEL_VERSION VERSION_GREATER 2.4.57)
+  IF(DRM_INTEL_VERSION VERSION_GREATER 2.4.59)
+    MESSAGE(STATUS "Enable EU total query support")
+    SET(DRM_INTEL_EU_TOTAL "enable")
+    MESSAGE(STATUS "Enable subslice total query support")
+    SET(DRM_INTEL_SUBSLICE_TOTAL "enable")
+  ELSE(DRM_INTEL_VERSION VERSION_GREATER 2.4.59)
+    MESSAGE(STATUS "Disable EU total query support")
+    MESSAGE(STATUS "Disable subslice total query support")
+  ENDIF(DRM_INTEL_VERSION VERSION_GREATER 2.4.59)
 ELSE(DRM_INTEL_FOUND)
   MESSAGE(FATAL_ERROR "Looking for DRM Intel (>= 2.4.52) - not found")
 ENDIF(DRM_INTEL_FOUND)
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index d4181d8..464765f 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -118,6 +118,16 @@ SET(CMAKE_CXX_FLAGS "-DHAS_USERPTR ${CMAKE_CXX_FLAGS}")
 SET(CMAKE_C_FLAGS "-DHAS_USERPTR ${CMAKE_C_FLAGS}")
 endif (DRM_INTEL_USERPTR)
 
+if (DRM_INTEL_EU_TOTAL)
+SET(CMAKE_CXX_FLAGS "-DHAS_EU_TOTAL ${CMAKE_CXX_FLAGS}")
+SET(CMAKE_C_FLAGS "-DHAS_EU_TOTAL ${CMAKE_C_FLAGS}")
+endif (DRM_INTEL_EU_TOTAL)
+
+if (DRM_INTEL_SUBSLICE_TOTAL)
+SET(CMAKE_CXX_FLAGS "-DHAS_SUBSLICE_TOTAL ${CMAKE_CXX_FLAGS}")
+SET(CMAKE_C_FLAGS "-DHAS_SUBSLICE_TOTAL ${CMAKE_C_FLAGS}")
+endif (DRM_INTEL_SUBSLICE_TOTAL)
+
 set(GIT_SHA1 "git_sha1.h")
 add_custom_target(${GIT_SHA1} ALL
   COMMAND chmod +x ${CMAKE_CURRENT_SOURCE_DIR}/git_sha1.sh
diff --git a/src/intel/intel_driver.c b/src/intel/intel_driver.c
index d61988c..755ab6b 100644
--- a/src/intel/intel_driver.c
+++ b/src/intel/intel_driver.c
@@ -757,10 +757,7 @@ static int intel_buffer_set_tiling(cl_buffer bo,
 static void
 intel_update_device_info(cl_device_id device)
 {
-#ifdef HAS_USERPTR
   intel_driver_t *driver;
-  const size_t sz = 4096;
-  void *host_ptr;
 
   driver = intel_driver_new();
   assert(driver != NULL);
@@ -769,6 +766,10 @@ intel_update_device_info(cl_device_id device)
     return;
   }
 
+#ifdef HAS_USERPTR
+  const size_t sz = 4096;
+  void *host_ptr;
+
   host_ptr = cl_aligned_malloc(sz, 4096);
   if (host_ptr != NULL) {
     cl_buffer bo = intel_buffer_alloc_userptr((cl_buffer_mgr)driver->bufmgr,
@@ -781,12 +782,28 @@ intel_update_device_info(cl_device_id device)
   }
   else
     device->host_unified_memory = CL_FALSE;
+#endif
+
+#ifdef HAS_EU_TOTAL
+  unsigned int eu_total;
+
+  /* Prefer driver-queried max compute units if supported */
+  if (!drm_intel_get_eu_total(driver->fd, &eu_total))
+    device->max_compute_unit = eu_total;
+#endif
+
+#ifdef HAS_SUBSLICE_TOTAL
+  unsigned int subslice_total;
+
+  /* Prefer driver-queried subslice count if supported */
+  if (!drm_intel_get_subslice_total(driver->fd, &subslice_total))
+    device->sub_slice_count = subslice_total;
+#endif
 
   intel_driver_context_destroy(driver);
   intel_driver_close(driver);
   intel_driver_terminate(driver);
   intel_driver_delete(driver);
-#endif
 }
 
 LOCAL void
-- 
2.3.0


From jeff.mcgee at intel.com  Mon Mar  9 16:41:02 2015
From: jeff.mcgee at intel.com (jeff.mcgee at intel.com)
Date: Mon,  9 Mar 2015 16:41:02 -0700
Subject: [Beignet] [PATCH i-g-t 2/2] configure: Bump required libdrm version
	to 2.4.60
In-Reply-To: <1425943172-28040-1-git-send-email-jeff.mcgee@intel.com>
References: <1425943172-28040-1-git-send-email-jeff.mcgee@intel.com>
Message-ID: <1425944462-28293-1-git-send-email-jeff.mcgee@intel.com>

From: Jeff McGee <jeff.mcgee at intel.com>

tests/core_getparams needs the new libdrm interfaces for
querying subslice and EU counts.

For: VIZ-4636
Signed-off-by: Jeff McGee <jeff.mcgee at intel.com>
---
 configure.ac | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/configure.ac b/configure.ac
index 16d6a2e..88a1c3d 100644
--- a/configure.ac
+++ b/configure.ac
@@ -82,7 +82,7 @@ if test "x$GCC" = "xyes"; then
 fi
 AC_SUBST(ASSEMBLER_WARN_CFLAGS)
 
-PKG_CHECK_MODULES(DRM, [libdrm_intel >= 2.4.52 libdrm])
+PKG_CHECK_MODULES(DRM, [libdrm_intel >= 2.4.60 libdrm])
 PKG_CHECK_MODULES(PCIACCESS, [pciaccess >= 0.10])
 PKG_CHECK_MODULES(OVERLAY_XVLIB, [xv x11 xext dri2proto >= 2.6], enable_overlay_xvlib=yes, enable_overlay_xvlib=no)
 PKG_CHECK_MODULES(OVERLAY_XLIB, [cairo-xlib dri2proto >= 2.6], enable_overlay_xlib=yes, enable_overlay_xlib=no)
-- 
2.3.0


From nanhai.zou at intel.com  Mon Mar  9 18:57:23 2015
From: nanhai.zou at intel.com (Zou, Nanhai)
Date: Tue, 10 Mar 2015 01:57:23 +0000
Subject: [Beignet] [Intel-gfx] Preventing zero GPU virtual address
 allocation
In-Reply-To: <20150309120218.GD23680@nuc-i3427.alporthouse.com>
References: <148B1B7A67D1C24B9EF0BE42EA4977062B7DCBFB@SHSMSX101.ccr.corp.intel.com>
 <20150305125251.GA18775@phenom.ffwll.local>
 <20150305130121.GA18784@nuc-i3427.alporthouse.com>
 <20150305152758.GI18775@phenom.ffwll.local>
 <DF876E69000F0E4DB19B760E3EBA5C7501C5F9C1@SHSMSX103.ccr.corp.intel.com>
 <20150306083919.GH18784@nuc-i3427.alporthouse.com>
 <DF876E69000F0E4DB19B760E3EBA5C7501C607BC@SHSMSX103.ccr.corp.intel.com>
 <20150309120218.GD23680@nuc-i3427.alporthouse.com>
Message-ID: <DF876E69000F0E4DB19B760E3EBA5C7501C6F469@SHSMSX103.ccr.corp.intel.com>

> -----Original Message-----
> From: Chris Wilson [mailto:chris at chris-wilson.co.uk]
> Sent: Monday, March 09, 2015 8:02 PM
> To: Zou, Nanhai
> Cc: Daniel Vetter; Song, Ruiling; Vetter, Daniel; intel-gfx at lists.freedesktop.org;
> Yang, Rong R; beignet at lists.freedesktop.org; Weinehall, David
> Subject: Re: [Beignet] [Intel-gfx] Preventing zero GPU virtual address allocation
> 
> On Mon, Mar 09, 2015 at 02:34:46AM +0000, Zou, Nanhai wrote:
> > We don't need MAP_FIXED, we just want to avoid address 0 to be allocated.
> >
> > Though I think using MAP_FIXED is overkill, will bring much unnecessary
> complexity on both kernel and beignet side.
> > I don't mind if people can provide stable MAP_FIXED patches to resolve this
> problem a few months or years later.
> >
> > At that time, kernel driver can revert the reserve page 0 patch.
> > Before that, reserving page 0 can benefit all Beignet users without breaking
> anything.
> 
> The point is that it becomes ABI. So no, the kernel can't just revert it.
> There is nothing special about address 0 in either GTT or virtual memory.
> If you require a special object allocated at address 0, allocate a special object
> at address 0.
> -Chris


Hi,
	Zero page is not an ABI; it is only a strategy of virtual space allocation.
	Nobody would need a page at exactly virtual address 0, so there is no dependency.
	
	Zero page is a very commonly used method to avoid the NULL pointer issue.
	See http://en.wikipedia.org/wiki/Zero_page

	I can see a lot of issues with doing that via a MAP_FIXED zero page in user space.

	Say a program combines libva, beignet and mesa or other graphics components together,
	which component should be responsible for allocating this zero page?

	What if a component happened to allocate a page at offset 0 before beignet and released it later?
	Using a zero page just removes the unnecessary mess and resolves the problem in a clean, explicit way.


Thanks
Zou Nanhai


> --
> Chris Wilson, Intel Open Source Technology Centre

From xionghu.luo at intel.com  Mon Mar  9 22:59:42 2015
From: xionghu.luo at intel.com (xionghu.luo at intel.com)
Date: Tue, 10 Mar 2015 13:59:42 +0800
Subject: [Beignet] [PATCH 1/7] replace fabs with llvm intrinsic.
Message-ID: <1425967188-22075-1-git-send-email-xionghu.luo@intel.com>

From: Luo Xionghu <xionghu.luo at intel.com>

translate native fabs to llvm.fabs for fast path.

Signed-off-by: Luo Xionghu <xionghu.luo at intel.com>
---
 backend/src/libocl/tmpl/ocl_math.tmpl.cl   | 2 +-
 backend/src/llvm/llvm_gen_backend.cpp      | 2 --
 backend/src/llvm/llvm_gen_ocl_function.hxx | 1 -
 3 files changed, 1 insertion(+), 4 deletions(-)

diff --git a/backend/src/libocl/tmpl/ocl_math.tmpl.cl b/backend/src/libocl/tmpl/ocl_math.tmpl.cl
index 2ed7b31..681e70c 100644
--- a/backend/src/libocl/tmpl/ocl_math.tmpl.cl
+++ b/backend/src/libocl/tmpl/ocl_math.tmpl.cl
@@ -23,7 +23,7 @@
 
 extern constant int __ocl_math_fastpath_flag;
 
-PURE CONST float __gen_ocl_fabs(float x);
+CONST float __gen_ocl_fabs(float x) __asm("llvm.fabs" ".f32");
 CONST float __gen_ocl_sin(float x) __asm("llvm.sin" ".f32");
 CONST float __gen_ocl_cos(float x) __asm("llvm.cos" ".f32");
 CONST float __gen_ocl_sqrt(float x) __asm("llvm.sqrt" ".f32");
diff --git a/backend/src/llvm/llvm_gen_backend.cpp b/backend/src/llvm/llvm_gen_backend.cpp
index c0ff1d1..a42ee40 100644
--- a/backend/src/llvm/llvm_gen_backend.cpp
+++ b/backend/src/llvm/llvm_gen_backend.cpp
@@ -2673,7 +2673,6 @@ namespace gbe
       case GEN_OCL_POW:
       case GEN_OCL_RCP:
       case GEN_OCL_ABS:
-      case GEN_OCL_FABS:
       case GEN_OCL_RNDZ:
       case GEN_OCL_RNDE:
       case GEN_OCL_RNDU:
@@ -3077,7 +3076,6 @@ namespace gbe
           }
           case GEN_OCL_RSQ: this->emitUnaryCallInst(I,CS,ir::OP_RSQ); break;
           case GEN_OCL_RCP: this->emitUnaryCallInst(I,CS,ir::OP_RCP); break;
-          case GEN_OCL_FABS: this->emitUnaryCallInst(I,CS,ir::OP_ABS); break;
           case GEN_OCL_RNDZ: this->emitUnaryCallInst(I,CS,ir::OP_RNDZ); break;
           case GEN_OCL_RNDE: this->emitUnaryCallInst(I,CS,ir::OP_RNDE); break;
           case GEN_OCL_RNDU: this->emitUnaryCallInst(I,CS,ir::OP_RNDU); break;
diff --git a/backend/src/llvm/llvm_gen_ocl_function.hxx b/backend/src/llvm/llvm_gen_ocl_function.hxx
index 45358d0..8e37df9 100644
--- a/backend/src/llvm/llvm_gen_ocl_function.hxx
+++ b/backend/src/llvm/llvm_gen_ocl_function.hxx
@@ -19,7 +19,6 @@ DECL_LLVM_GEN_FUNCTION(GET_GLOBAL_OFFSET2, __gen_ocl_get_global_offset2)
 DECL_LLVM_GEN_FUNCTION(GET_WORK_DIM, __gen_ocl_get_work_dim)
 
 // Math function
-DECL_LLVM_GEN_FUNCTION(FABS, __gen_ocl_fabs)
 DECL_LLVM_GEN_FUNCTION(RSQ, __gen_ocl_rsqrt)
 DECL_LLVM_GEN_FUNCTION(POW, __gen_ocl_pow)
 DECL_LLVM_GEN_FUNCTION(RCP, __gen_ocl_rcp)
-- 
1.9.1


From xionghu.luo at intel.com  Mon Mar  9 22:59:43 2015
From: xionghu.luo at intel.com (xionghu.luo at intel.com)
Date: Tue, 10 Mar 2015 13:59:43 +0800
Subject: [Beignet] [PATCH 2/7] replace rndz with llvm intrinsic.
In-Reply-To: <1425967188-22075-1-git-send-email-xionghu.luo@intel.com>
References: <1425967188-22075-1-git-send-email-xionghu.luo@intel.com>
Message-ID: <1425967188-22075-2-git-send-email-xionghu.luo@intel.com>

From: Luo Xionghu <xionghu.luo at intel.com>

translate native rndz to llvm.trunc.

Signed-off-by: Luo Xionghu <xionghu.luo at intel.com>
---
 backend/src/libocl/tmpl/ocl_math.tmpl.cl   | 2 +-
 backend/src/llvm/llvm_gen_backend.cpp      | 2 --
 backend/src/llvm/llvm_gen_ocl_function.hxx | 1 -
 3 files changed, 1 insertion(+), 4 deletions(-)

diff --git a/backend/src/libocl/tmpl/ocl_math.tmpl.cl b/backend/src/libocl/tmpl/ocl_math.tmpl.cl
index 681e70c..a3e29bb 100644
--- a/backend/src/libocl/tmpl/ocl_math.tmpl.cl
+++ b/backend/src/libocl/tmpl/ocl_math.tmpl.cl
@@ -32,7 +32,7 @@ CONST float __gen_ocl_log(float x) __asm("llvm.log2" ".f32");
 CONST float __gen_ocl_exp(float x) __asm("llvm.exp2" ".f32");
 PURE CONST float __gen_ocl_pow(float x, float y);
 PURE CONST float __gen_ocl_rcp(float x);
-PURE CONST float __gen_ocl_rndz(float x);
+CONST float __gen_ocl_rndz(float x) __asm("llvm.trunc" ".f32");
 PURE CONST float __gen_ocl_rnde(float x);
 PURE CONST float __gen_ocl_rndu(float x);
 PURE CONST float __gen_ocl_rndd(float x);
diff --git a/backend/src/llvm/llvm_gen_backend.cpp b/backend/src/llvm/llvm_gen_backend.cpp
index a42ee40..288f6a3 100644
--- a/backend/src/llvm/llvm_gen_backend.cpp
+++ b/backend/src/llvm/llvm_gen_backend.cpp
@@ -2673,7 +2673,6 @@ namespace gbe
       case GEN_OCL_POW:
       case GEN_OCL_RCP:
       case GEN_OCL_ABS:
-      case GEN_OCL_RNDZ:
       case GEN_OCL_RNDE:
       case GEN_OCL_RNDU:
       case GEN_OCL_RNDD:
@@ -3076,7 +3075,6 @@ namespace gbe
           }
           case GEN_OCL_RSQ: this->emitUnaryCallInst(I,CS,ir::OP_RSQ); break;
           case GEN_OCL_RCP: this->emitUnaryCallInst(I,CS,ir::OP_RCP); break;
-          case GEN_OCL_RNDZ: this->emitUnaryCallInst(I,CS,ir::OP_RNDZ); break;
           case GEN_OCL_RNDE: this->emitUnaryCallInst(I,CS,ir::OP_RNDE); break;
           case GEN_OCL_RNDU: this->emitUnaryCallInst(I,CS,ir::OP_RNDU); break;
           case GEN_OCL_RNDD: this->emitUnaryCallInst(I,CS,ir::OP_RNDD); break;
diff --git a/backend/src/llvm/llvm_gen_ocl_function.hxx b/backend/src/llvm/llvm_gen_ocl_function.hxx
index 8e37df9..057f4c8 100644
--- a/backend/src/llvm/llvm_gen_ocl_function.hxx
+++ b/backend/src/llvm/llvm_gen_ocl_function.hxx
@@ -22,7 +22,6 @@ DECL_LLVM_GEN_FUNCTION(GET_WORK_DIM, __gen_ocl_get_work_dim)
 DECL_LLVM_GEN_FUNCTION(RSQ, __gen_ocl_rsqrt)
 DECL_LLVM_GEN_FUNCTION(POW, __gen_ocl_pow)
 DECL_LLVM_GEN_FUNCTION(RCP, __gen_ocl_rcp)
-DECL_LLVM_GEN_FUNCTION(RNDZ, __gen_ocl_rndz)
 DECL_LLVM_GEN_FUNCTION(RNDE, __gen_ocl_rnde)
 DECL_LLVM_GEN_FUNCTION(RNDU, __gen_ocl_rndu)
 DECL_LLVM_GEN_FUNCTION(RNDD, __gen_ocl_rndd)
-- 
1.9.1


From xionghu.luo at intel.com  Mon Mar  9 22:59:44 2015
From: xionghu.luo at intel.com (xionghu.luo at intel.com)
Date: Tue, 10 Mar 2015 13:59:44 +0800
Subject: [Beignet] [PATCH 3/7] replace rnde with llvm intrinsic.
In-Reply-To: <1425967188-22075-1-git-send-email-xionghu.luo@intel.com>
References: <1425967188-22075-1-git-send-email-xionghu.luo@intel.com>
Message-ID: <1425967188-22075-3-git-send-email-xionghu.luo@intel.com>

From: Luo Xionghu <xionghu.luo at intel.com>

translate native rnde to llvm.rint.

Signed-off-by: Luo Xionghu <xionghu.luo at intel.com>
---
 backend/src/libocl/tmpl/ocl_math.tmpl.cl   | 2 +-
 backend/src/llvm/llvm_gen_backend.cpp      | 4 ++--
 backend/src/llvm/llvm_gen_ocl_function.hxx | 1 -
 3 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/backend/src/libocl/tmpl/ocl_math.tmpl.cl b/backend/src/libocl/tmpl/ocl_math.tmpl.cl
index a3e29bb..d07e5d4 100644
--- a/backend/src/libocl/tmpl/ocl_math.tmpl.cl
+++ b/backend/src/libocl/tmpl/ocl_math.tmpl.cl
@@ -33,7 +33,7 @@ CONST float __gen_ocl_exp(float x) __asm("llvm.exp2" ".f32");
 PURE CONST float __gen_ocl_pow(float x, float y);
 PURE CONST float __gen_ocl_rcp(float x);
 CONST float __gen_ocl_rndz(float x) __asm("llvm.trunc" ".f32");
-PURE CONST float __gen_ocl_rnde(float x);
+CONST float __gen_ocl_rnde(float x) __asm("llvm.rint" ".f32");
 PURE CONST float __gen_ocl_rndu(float x);
 PURE CONST float __gen_ocl_rndd(float x);
 
diff --git a/backend/src/llvm/llvm_gen_backend.cpp b/backend/src/llvm/llvm_gen_backend.cpp
index 288f6a3..e358938 100644
--- a/backend/src/llvm/llvm_gen_backend.cpp
+++ b/backend/src/llvm/llvm_gen_backend.cpp
@@ -2612,6 +2612,7 @@ namespace gbe
           case Intrinsic::ceil:
           case Intrinsic::fma:
           case Intrinsic::trunc:
+          case Intrinsic::rint:
           case Intrinsic::sin:
           case Intrinsic::cos:
           case Intrinsic::log2:
@@ -2673,7 +2674,6 @@ namespace gbe
       case GEN_OCL_POW:
       case GEN_OCL_RCP:
       case GEN_OCL_ABS:
-      case GEN_OCL_RNDE:
       case GEN_OCL_RNDU:
       case GEN_OCL_RNDD:
       case GEN_OCL_GET_IMAGE_WIDTH:
@@ -2999,6 +2999,7 @@ namespace gbe
           case Intrinsic::ceil: this->emitUnaryCallInst(I,CS,ir::OP_RNDU); break;
           case Intrinsic::fabs: this->emitUnaryCallInst(I,CS,ir::OP_ABS); break;
           case Intrinsic::trunc: this->emitUnaryCallInst(I,CS,ir::OP_RNDZ); break;
+          case Intrinsic::rint: this->emitUnaryCallInst(I,CS,ir::OP_RNDE); break;
           case Intrinsic::sin: this->emitUnaryCallInst(I,CS,ir::OP_SIN); break;
           case Intrinsic::cos: this->emitUnaryCallInst(I,CS,ir::OP_COS); break;
           case Intrinsic::log2: this->emitUnaryCallInst(I,CS,ir::OP_LOG); break;
@@ -3075,7 +3076,6 @@ namespace gbe
           }
           case GEN_OCL_RSQ: this->emitUnaryCallInst(I,CS,ir::OP_RSQ); break;
           case GEN_OCL_RCP: this->emitUnaryCallInst(I,CS,ir::OP_RCP); break;
-          case GEN_OCL_RNDE: this->emitUnaryCallInst(I,CS,ir::OP_RNDE); break;
           case GEN_OCL_RNDU: this->emitUnaryCallInst(I,CS,ir::OP_RNDU); break;
           case GEN_OCL_RNDD: this->emitUnaryCallInst(I,CS,ir::OP_RNDD); break;
           case GEN_OCL_FORCE_SIMD8: ctx.setSimdWidth(8); break;
diff --git a/backend/src/llvm/llvm_gen_ocl_function.hxx b/backend/src/llvm/llvm_gen_ocl_function.hxx
index 057f4c8..ea75678 100644
--- a/backend/src/llvm/llvm_gen_ocl_function.hxx
+++ b/backend/src/llvm/llvm_gen_ocl_function.hxx
@@ -22,7 +22,6 @@ DECL_LLVM_GEN_FUNCTION(GET_WORK_DIM, __gen_ocl_get_work_dim)
 DECL_LLVM_GEN_FUNCTION(RSQ, __gen_ocl_rsqrt)
 DECL_LLVM_GEN_FUNCTION(POW, __gen_ocl_pow)
 DECL_LLVM_GEN_FUNCTION(RCP, __gen_ocl_rcp)
-DECL_LLVM_GEN_FUNCTION(RNDE, __gen_ocl_rnde)
 DECL_LLVM_GEN_FUNCTION(RNDU, __gen_ocl_rndu)
 DECL_LLVM_GEN_FUNCTION(RNDD, __gen_ocl_rndd)
 DECL_LLVM_GEN_FUNCTION(MAD, __gen_ocl_mad)
-- 
1.9.1


From xionghu.luo at intel.com  Mon Mar  9 22:59:45 2015
From: xionghu.luo at intel.com (xionghu.luo at intel.com)
Date: Tue, 10 Mar 2015 13:59:45 +0800
Subject: [Beignet] [PATCH 4/7] replace rndu with llvm intrinsic.
In-Reply-To: <1425967188-22075-1-git-send-email-xionghu.luo@intel.com>
References: <1425967188-22075-1-git-send-email-xionghu.luo@intel.com>
Message-ID: <1425967188-22075-4-git-send-email-xionghu.luo@intel.com>

From: Luo Xionghu <xionghu.luo at intel.com>

translate native rndu to llvm.ceil.

Signed-off-by: Luo Xionghu <xionghu.luo at intel.com>
---
 backend/src/libocl/tmpl/ocl_math.tmpl.cl   | 2 +-
 backend/src/llvm/llvm_gen_backend.cpp      | 2 --
 backend/src/llvm/llvm_gen_ocl_function.hxx | 1 -
 3 files changed, 1 insertion(+), 4 deletions(-)

diff --git a/backend/src/libocl/tmpl/ocl_math.tmpl.cl b/backend/src/libocl/tmpl/ocl_math.tmpl.cl
index d07e5d4..b3288b6 100644
--- a/backend/src/libocl/tmpl/ocl_math.tmpl.cl
+++ b/backend/src/libocl/tmpl/ocl_math.tmpl.cl
@@ -34,7 +34,7 @@ PURE CONST float __gen_ocl_pow(float x, float y);
 PURE CONST float __gen_ocl_rcp(float x);
 CONST float __gen_ocl_rndz(float x) __asm("llvm.trunc" ".f32");
 CONST float __gen_ocl_rnde(float x) __asm("llvm.rint" ".f32");
-PURE CONST float __gen_ocl_rndu(float x);
+CONST float __gen_ocl_rndu(float x) __asm("llvm.ceil" ".f32");
 PURE CONST float __gen_ocl_rndd(float x);
 
 
diff --git a/backend/src/llvm/llvm_gen_backend.cpp b/backend/src/llvm/llvm_gen_backend.cpp
index e358938..02d5d37 100644
--- a/backend/src/llvm/llvm_gen_backend.cpp
+++ b/backend/src/llvm/llvm_gen_backend.cpp
@@ -2674,7 +2674,6 @@ namespace gbe
       case GEN_OCL_POW:
       case GEN_OCL_RCP:
       case GEN_OCL_ABS:
-      case GEN_OCL_RNDU:
       case GEN_OCL_RNDD:
       case GEN_OCL_GET_IMAGE_WIDTH:
       case GEN_OCL_GET_IMAGE_HEIGHT:
@@ -3076,7 +3075,6 @@ namespace gbe
           }
           case GEN_OCL_RSQ: this->emitUnaryCallInst(I,CS,ir::OP_RSQ); break;
           case GEN_OCL_RCP: this->emitUnaryCallInst(I,CS,ir::OP_RCP); break;
-          case GEN_OCL_RNDU: this->emitUnaryCallInst(I,CS,ir::OP_RNDU); break;
           case GEN_OCL_RNDD: this->emitUnaryCallInst(I,CS,ir::OP_RNDD); break;
           case GEN_OCL_FORCE_SIMD8: ctx.setSimdWidth(8); break;
           case GEN_OCL_FORCE_SIMD16: ctx.setSimdWidth(16); break;
diff --git a/backend/src/llvm/llvm_gen_ocl_function.hxx b/backend/src/llvm/llvm_gen_ocl_function.hxx
index ea75678..83bd504 100644
--- a/backend/src/llvm/llvm_gen_ocl_function.hxx
+++ b/backend/src/llvm/llvm_gen_ocl_function.hxx
@@ -22,7 +22,6 @@ DECL_LLVM_GEN_FUNCTION(GET_WORK_DIM, __gen_ocl_get_work_dim)
 DECL_LLVM_GEN_FUNCTION(RSQ, __gen_ocl_rsqrt)
 DECL_LLVM_GEN_FUNCTION(POW, __gen_ocl_pow)
 DECL_LLVM_GEN_FUNCTION(RCP, __gen_ocl_rcp)
-DECL_LLVM_GEN_FUNCTION(RNDU, __gen_ocl_rndu)
 DECL_LLVM_GEN_FUNCTION(RNDD, __gen_ocl_rndd)
 DECL_LLVM_GEN_FUNCTION(MAD, __gen_ocl_mad)
 DECL_LLVM_GEN_FUNCTION(FMAX, __gen_ocl_fmax)
-- 
1.9.1


From xionghu.luo at intel.com  Mon Mar  9 22:59:46 2015
From: xionghu.luo at intel.com (xionghu.luo at intel.com)
Date: Tue, 10 Mar 2015 13:59:46 +0800
Subject: [Beignet] [PATCH 5/7] replace rndd with llvm intrinsic.
In-Reply-To: <1425967188-22075-1-git-send-email-xionghu.luo@intel.com>
References: <1425967188-22075-1-git-send-email-xionghu.luo@intel.com>
Message-ID: <1425967188-22075-5-git-send-email-xionghu.luo@intel.com>

From: Luo Xionghu <xionghu.luo at intel.com>

translate native rndd to llvm.floor.

Signed-off-by: Luo Xionghu <xionghu.luo at intel.com>
---
 backend/src/libocl/tmpl/ocl_math.tmpl.cl   | 2 +-
 backend/src/llvm/llvm_gen_backend.cpp      | 4 ++--
 backend/src/llvm/llvm_gen_ocl_function.hxx | 1 -
 3 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/backend/src/libocl/tmpl/ocl_math.tmpl.cl b/backend/src/libocl/tmpl/ocl_math.tmpl.cl
index b3288b6..40b6401 100644
--- a/backend/src/libocl/tmpl/ocl_math.tmpl.cl
+++ b/backend/src/libocl/tmpl/ocl_math.tmpl.cl
@@ -35,7 +35,7 @@ PURE CONST float __gen_ocl_rcp(float x);
 CONST float __gen_ocl_rndz(float x) __asm("llvm.trunc" ".f32");
 CONST float __gen_ocl_rnde(float x) __asm("llvm.rint" ".f32");
 CONST float __gen_ocl_rndu(float x) __asm("llvm.ceil" ".f32");
-PURE CONST float __gen_ocl_rndd(float x);
+CONST float __gen_ocl_rndd(float x) __asm("llvm.floor" ".f32");
 
 
 /* native functions */
diff --git a/backend/src/llvm/llvm_gen_backend.cpp b/backend/src/llvm/llvm_gen_backend.cpp
index 02d5d37..6549950 100644
--- a/backend/src/llvm/llvm_gen_backend.cpp
+++ b/backend/src/llvm/llvm_gen_backend.cpp
@@ -2613,6 +2613,7 @@ namespace gbe
           case Intrinsic::fma:
           case Intrinsic::trunc:
           case Intrinsic::rint:
+          case Intrinsic::floor:
           case Intrinsic::sin:
           case Intrinsic::cos:
           case Intrinsic::log2:
@@ -2674,7 +2675,6 @@ namespace gbe
       case GEN_OCL_POW:
       case GEN_OCL_RCP:
       case GEN_OCL_ABS:
-      case GEN_OCL_RNDD:
       case GEN_OCL_GET_IMAGE_WIDTH:
       case GEN_OCL_GET_IMAGE_HEIGHT:
       case GEN_OCL_GET_IMAGE_CHANNEL_DATA_TYPE:
@@ -2999,6 +2999,7 @@ namespace gbe
           case Intrinsic::fabs: this->emitUnaryCallInst(I,CS,ir::OP_ABS); break;
           case Intrinsic::trunc: this->emitUnaryCallInst(I,CS,ir::OP_RNDZ); break;
           case Intrinsic::rint: this->emitUnaryCallInst(I,CS,ir::OP_RNDE); break;
+          case Intrinsic::floor: this->emitUnaryCallInst(I,CS,ir::OP_RNDD); break;
           case Intrinsic::sin: this->emitUnaryCallInst(I,CS,ir::OP_SIN); break;
           case Intrinsic::cos: this->emitUnaryCallInst(I,CS,ir::OP_COS); break;
           case Intrinsic::log2: this->emitUnaryCallInst(I,CS,ir::OP_LOG); break;
@@ -3075,7 +3076,6 @@ namespace gbe
           }
           case GEN_OCL_RSQ: this->emitUnaryCallInst(I,CS,ir::OP_RSQ); break;
           case GEN_OCL_RCP: this->emitUnaryCallInst(I,CS,ir::OP_RCP); break;
-          case GEN_OCL_RNDD: this->emitUnaryCallInst(I,CS,ir::OP_RNDD); break;
           case GEN_OCL_FORCE_SIMD8: ctx.setSimdWidth(8); break;
           case GEN_OCL_FORCE_SIMD16: ctx.setSimdWidth(16); break;
           case GEN_OCL_LBARRIER: ctx.SYNC(ir::syncLocalBarrier); break;
diff --git a/backend/src/llvm/llvm_gen_ocl_function.hxx b/backend/src/llvm/llvm_gen_ocl_function.hxx
index 83bd504..2cc63bd 100644
--- a/backend/src/llvm/llvm_gen_ocl_function.hxx
+++ b/backend/src/llvm/llvm_gen_ocl_function.hxx
@@ -22,7 +22,6 @@ DECL_LLVM_GEN_FUNCTION(GET_WORK_DIM, __gen_ocl_get_work_dim)
 DECL_LLVM_GEN_FUNCTION(RSQ, __gen_ocl_rsqrt)
 DECL_LLVM_GEN_FUNCTION(POW, __gen_ocl_pow)
 DECL_LLVM_GEN_FUNCTION(RCP, __gen_ocl_rcp)
-DECL_LLVM_GEN_FUNCTION(RNDD, __gen_ocl_rndd)
 DECL_LLVM_GEN_FUNCTION(MAD, __gen_ocl_mad)
 DECL_LLVM_GEN_FUNCTION(FMAX, __gen_ocl_fmax)
 DECL_LLVM_GEN_FUNCTION(FMIN, __gen_ocl_fmin)
-- 
1.9.1


From xionghu.luo at intel.com  Mon Mar  9 22:59:47 2015
From: xionghu.luo at intel.com (xionghu.luo at intel.com)
Date: Tue, 10 Mar 2015 13:59:47 +0800
Subject: [Beignet] [PATCH 6/7] replace mad with llvm intrinsic.
In-Reply-To: <1425967188-22075-1-git-send-email-xionghu.luo@intel.com>
References: <1425967188-22075-1-git-send-email-xionghu.luo@intel.com>
Message-ID: <1425967188-22075-6-git-send-email-xionghu.luo@intel.com>

From: Luo Xionghu <xionghu.luo at intel.com>

translate native mad to llvm.fma.

Signed-off-by: Luo Xionghu <xionghu.luo at intel.com>
---
 backend/src/libocl/tmpl/ocl_math.tmpl.cl   | 2 +-
 backend/src/llvm/llvm_gen_backend.cpp      | 9 ---------
 backend/src/llvm/llvm_gen_ocl_function.hxx | 1 -
 3 files changed, 1 insertion(+), 11 deletions(-)

diff --git a/backend/src/libocl/tmpl/ocl_math.tmpl.cl b/backend/src/libocl/tmpl/ocl_math.tmpl.cl
index 40b6401..d9e677b 100644
--- a/backend/src/libocl/tmpl/ocl_math.tmpl.cl
+++ b/backend/src/libocl/tmpl/ocl_math.tmpl.cl
@@ -2616,7 +2616,7 @@ OVERLOADABLE float ldexp(float x, int n) {
   return __gen_ocl_internal_ldexp(x, n);
 }
 
-PURE CONST float __gen_ocl_mad(float a, float b, float c);
+CONST float __gen_ocl_mad(float a, float b, float c) __asm("llvm.fma" ".f32");
 PURE CONST float __gen_ocl_fmax(float a, float b);
 PURE CONST float __gen_ocl_fmin(float a, float b);
 
diff --git a/backend/src/llvm/llvm_gen_backend.cpp b/backend/src/llvm/llvm_gen_backend.cpp
index 6549950..d9ac6e0 100644
--- a/backend/src/llvm/llvm_gen_backend.cpp
+++ b/backend/src/llvm/llvm_gen_backend.cpp
@@ -2738,7 +2738,6 @@ namespace gbe
       case GEN_OCL_UPSAMPLE_SHORT:
       case GEN_OCL_UPSAMPLE_INT:
       case GEN_OCL_UPSAMPLE_LONG:
-      case GEN_OCL_MAD:
       case GEN_OCL_FMAX:
       case GEN_OCL_FMIN:
       case GEN_OCL_SADD_SAT_CHAR:
@@ -3323,14 +3322,6 @@ namespace gbe
             ctx.I64MADSAT(getUnsignedType(ctx, I.getType()), dst, src0, src1, src2);
             break;
            }
-          case GEN_OCL_MAD: {
-            GBE_ASSERT(AI != AE); const ir::Register src0 = this->getRegister(*AI); ++AI;
-            GBE_ASSERT(AI != AE); const ir::Register src1 = this->getRegister(*AI); ++AI;
-            GBE_ASSERT(AI != AE); const ir::Register src2 = this->getRegister(*AI); ++AI;
-            const ir::Register dst = this->getRegister(&I);
-            ctx.MAD(getType(ctx, I.getType()), dst, src0, src1, src2);
-            break;
-          }
           case GEN_OCL_FMAX:
           case GEN_OCL_FMIN:{
             GBE_ASSERT(AI != AE); const ir::Register src0 = this->getRegister(*AI); ++AI;
diff --git a/backend/src/llvm/llvm_gen_ocl_function.hxx b/backend/src/llvm/llvm_gen_ocl_function.hxx
index 2cc63bd..5f5451c 100644
--- a/backend/src/llvm/llvm_gen_ocl_function.hxx
+++ b/backend/src/llvm/llvm_gen_ocl_function.hxx
@@ -22,7 +22,6 @@ DECL_LLVM_GEN_FUNCTION(GET_WORK_DIM, __gen_ocl_get_work_dim)
 DECL_LLVM_GEN_FUNCTION(RSQ, __gen_ocl_rsqrt)
 DECL_LLVM_GEN_FUNCTION(POW, __gen_ocl_pow)
 DECL_LLVM_GEN_FUNCTION(RCP, __gen_ocl_rcp)
-DECL_LLVM_GEN_FUNCTION(MAD, __gen_ocl_mad)
 DECL_LLVM_GEN_FUNCTION(FMAX, __gen_ocl_fmax)
 DECL_LLVM_GEN_FUNCTION(FMIN, __gen_ocl_fmin)
 
-- 
1.9.1


From xionghu.luo at intel.com  Mon Mar  9 22:59:48 2015
From: xionghu.luo at intel.com (xionghu.luo at intel.com)
Date: Tue, 10 Mar 2015 13:59:48 +0800
Subject: [Beignet] [PATCH 7/7]  replace pow with llvm intrinsic.
In-Reply-To: <1425967188-22075-1-git-send-email-xionghu.luo@intel.com>
References: <1425967188-22075-1-git-send-email-xionghu.luo@intel.com>
Message-ID: <1425967188-22075-7-git-send-email-xionghu.luo@intel.com>

From: Luo Xionghu <xionghu.luo at intel.com>

 translate native pow to llvm.pow for fast path.

Signed-off-by: Luo Xionghu <xionghu.luo at intel.com>
---
 backend/src/libocl/tmpl/ocl_math.tmpl.cl   |  2 +-
 backend/src/llvm/llvm_gen_backend.cpp      | 18 +++++++++---------
 backend/src/llvm/llvm_gen_ocl_function.hxx |  1 -
 3 files changed, 10 insertions(+), 11 deletions(-)

diff --git a/backend/src/libocl/tmpl/ocl_math.tmpl.cl b/backend/src/libocl/tmpl/ocl_math.tmpl.cl
index d9e677b..da5b9a9 100644
--- a/backend/src/libocl/tmpl/ocl_math.tmpl.cl
+++ b/backend/src/libocl/tmpl/ocl_math.tmpl.cl
@@ -30,7 +30,7 @@ CONST float __gen_ocl_sqrt(float x) __asm("llvm.sqrt" ".f32");
 PURE CONST float __gen_ocl_rsqrt(float x);
 CONST float __gen_ocl_log(float x) __asm("llvm.log2" ".f32");
 CONST float __gen_ocl_exp(float x) __asm("llvm.exp2" ".f32");
-PURE CONST float __gen_ocl_pow(float x, float y);
+PURE CONST float __gen_ocl_pow(float x, float y) __asm("llvm.pow" ".f32");
 PURE CONST float __gen_ocl_rcp(float x);
 CONST float __gen_ocl_rndz(float x) __asm("llvm.trunc" ".f32");
 CONST float __gen_ocl_rnde(float x) __asm("llvm.rint" ".f32");
diff --git a/backend/src/llvm/llvm_gen_backend.cpp b/backend/src/llvm/llvm_gen_backend.cpp
index d9ac6e0..773300b 100644
--- a/backend/src/llvm/llvm_gen_backend.cpp
+++ b/backend/src/llvm/llvm_gen_backend.cpp
@@ -2618,6 +2618,7 @@ namespace gbe
           case Intrinsic::cos:
           case Intrinsic::log2:
           case Intrinsic::exp2:
+          case Intrinsic::pow:
             this->newRegister(&I);
           break;
           default:
@@ -2672,7 +2673,6 @@ namespace gbe
       case GEN_OCL_FBL:
       case GEN_OCL_CBIT:
       case GEN_OCL_RSQ:
-      case GEN_OCL_POW:
       case GEN_OCL_RCP:
       case GEN_OCL_ABS:
       case GEN_OCL_GET_IMAGE_WIDTH:
@@ -3005,6 +3005,14 @@ namespace gbe
           case Intrinsic::exp2: this->emitUnaryCallInst(I,CS,ir::OP_EXP); break;
           case Intrinsic::bswap:
             this->emitUnaryCallInst(I,CS,ir::OP_BSWAP, getUnsignedType(ctx, I.getType())); break;
+          case Intrinsic::pow:
+          {
+            const ir::Register src0 = this->getRegister(*AI); ++AI;
+            const ir::Register src1 = this->getRegister(*AI);
+            const ir::Register dst = this->getRegister(&I);
+            ctx.POW(ir::TYPE_FLOAT, dst, src0, src1);
+            break;
+          }
           default: NOT_IMPLEMENTED;
         }
       } else {
@@ -3021,14 +3029,6 @@ namespace gbe
 #endif /* GBE_DEBUG */
 
         switch (genIntrinsicID) {
-          case GEN_OCL_POW:
-          {
-            const ir::Register src0 = this->getRegister(*AI); ++AI;
-            const ir::Register src1 = this->getRegister(*AI);
-            const ir::Register dst = this->getRegister(&I);
-            ctx.POW(ir::TYPE_FLOAT, dst, src0, src1);
-            break;
-          }
           case GEN_OCL_FBH: this->emitUnaryCallInst(I,CS,ir::OP_FBH); break;
           case GEN_OCL_FBL: this->emitUnaryCallInst(I,CS,ir::OP_FBL); break;
           case GEN_OCL_CBIT: this->emitUnaryCallInst(I,CS,ir::OP_CBIT, getUnsignedType(ctx, (*AI)->getType())); break;
diff --git a/backend/src/llvm/llvm_gen_ocl_function.hxx b/backend/src/llvm/llvm_gen_ocl_function.hxx
index 5f5451c..9536a3c 100644
--- a/backend/src/llvm/llvm_gen_ocl_function.hxx
+++ b/backend/src/llvm/llvm_gen_ocl_function.hxx
@@ -20,7 +20,6 @@ DECL_LLVM_GEN_FUNCTION(GET_WORK_DIM, __gen_ocl_get_work_dim)
 
 // Math function
 DECL_LLVM_GEN_FUNCTION(RSQ, __gen_ocl_rsqrt)
-DECL_LLVM_GEN_FUNCTION(POW, __gen_ocl_pow)
 DECL_LLVM_GEN_FUNCTION(RCP, __gen_ocl_rcp)
 DECL_LLVM_GEN_FUNCTION(FMAX, __gen_ocl_fmax)
 DECL_LLVM_GEN_FUNCTION(FMIN, __gen_ocl_fmin)
-- 
1.9.1


From junyan.he at inbox.com  Tue Mar 10 00:04:37 2015
From: junyan.he at inbox.com (junyan.he at inbox.com)
Date: Tue, 10 Mar 2015 15:04:37 +0800
Subject: [Beignet] [PATCH 1/2] Backend: Add the logic to handle uniform src
	for BSwap Gen8.
Message-ID: <1425971077-5781-1-git-send-email-junyan.he@inbox.com>

From: Junyan He <junyan.he at linux.intel.com>

Signed-off-by: Junyan He <junyan.he at linux.intel.com>
---
 backend/src/backend/gen8_context.cpp | 90 +++++++++++++++++++++++++-----------
 1 file changed, 62 insertions(+), 28 deletions(-)

diff --git a/backend/src/backend/gen8_context.cpp b/backend/src/backend/gen8_context.cpp
index 0d4a40e..3f57cf6 100644
--- a/backend/src/backend/gen8_context.cpp
+++ b/backend/src/backend/gen8_context.cpp
@@ -117,7 +117,8 @@ namespace gbe
             }
           } else {
             if (src.type == GEN_TYPE_UD || src.type == GEN_TYPE_D) {
-              GBE_ASSERT(src.subnr == 0);
+              bool uniform_src = (src.hstride == GEN_HORIZONTAL_STRIDE_0);
+              GBE_ASSERT(uniform_src || src.subnr == 0);
               GBE_ASSERT(dst.subnr == 0);
               GBE_ASSERT(tmp.subnr == 0);
               GBE_ASSERT(start_addr >= 0);
@@ -125,18 +126,33 @@ namespace gbe
               new_a0[1] = start_addr + 2;
               new_a0[2] = start_addr + 1;
               new_a0[3] = start_addr;
-              new_a0[4] = start_addr + 7;
-              new_a0[5] = start_addr + 6;
-              new_a0[6] = start_addr + 5;
-              new_a0[7] = start_addr + 4;
-              new_a0[8] = start_addr + 11;
-              new_a0[9] = start_addr + 10;
-              new_a0[10] = start_addr + 9;
-              new_a0[11] = start_addr + 8;
-              new_a0[12] = start_addr + 15;
-              new_a0[13] = start_addr + 14;
-              new_a0[14] = start_addr + 13;
-              new_a0[15] = start_addr + 12;
+              if (!uniform_src) {
+                new_a0[4] = start_addr + 7;
+                new_a0[5] = start_addr + 6;
+                new_a0[6] = start_addr + 5;
+                new_a0[7] = start_addr + 4;
+                new_a0[8] = start_addr + 11;
+                new_a0[9] = start_addr + 10;
+                new_a0[10] = start_addr + 9;
+                new_a0[11] = start_addr + 8;
+                new_a0[12] = start_addr + 15;
+                new_a0[13] = start_addr + 14;
+                new_a0[14] = start_addr + 13;
+                new_a0[15] = start_addr + 12;
+              } else {
+                new_a0[4] = start_addr + 3;
+                new_a0[5] = start_addr + 2;
+                new_a0[6] = start_addr + 1;
+                new_a0[7] = start_addr;
+                new_a0[8] = start_addr + 3;
+                new_a0[9] = start_addr + 2;
+                new_a0[10] = start_addr + 1;
+                new_a0[11] = start_addr;
+                new_a0[12] = start_addr + 3;
+                new_a0[13] = start_addr + 2;
+                new_a0[14] = start_addr + 1;
+                new_a0[15] = start_addr;
+              }
               this->setA0Content(new_a0, 48);
 
               p->push();
@@ -158,26 +174,44 @@ namespace gbe
 
               p->MOV(dst, tmp);
             } else if (src.type == GEN_TYPE_UW || src.type == GEN_TYPE_W) {
-              GBE_ASSERT(src.subnr == 0 || src.subnr == 16);
+              bool uniform_src = (src.hstride == GEN_HORIZONTAL_STRIDE_0);
+              GBE_ASSERT(uniform_src || src.subnr == 0 || src.subnr == 16);
               GBE_ASSERT(dst.subnr == 0 || dst.subnr == 16);
               GBE_ASSERT(tmp.subnr == 0 || tmp.subnr == 16);
               GBE_ASSERT(start_addr >= 0);
               new_a0[0] = start_addr + 1;
               new_a0[1] = start_addr;
-              new_a0[2] = start_addr + 3;
-              new_a0[3] = start_addr + 2;
-              new_a0[4] = start_addr + 5;
-              new_a0[5] = start_addr + 4;
-              new_a0[6] = start_addr + 7;
-              new_a0[7] = start_addr + 6;
-              new_a0[8] = start_addr + 9;
-              new_a0[9] = start_addr + 8;
-              new_a0[10] = start_addr + 11;
-              new_a0[11] = start_addr + 10;
-              new_a0[12] = start_addr + 13;
-              new_a0[13] = start_addr + 12;
-              new_a0[14] = start_addr + 15;
-              new_a0[15] = start_addr + 14;
+              if (!uniform_src) {
+                new_a0[2] = start_addr + 3;
+                new_a0[3] = start_addr + 2;
+                new_a0[4] = start_addr + 5;
+                new_a0[5] = start_addr + 4;
+                new_a0[6] = start_addr + 7;
+                new_a0[7] = start_addr + 6;
+                new_a0[8] = start_addr + 9;
+                new_a0[9] = start_addr + 8;
+                new_a0[10] = start_addr + 11;
+                new_a0[11] = start_addr + 10;
+                new_a0[12] = start_addr + 13;
+                new_a0[13] = start_addr + 12;
+                new_a0[14] = start_addr + 15;
+                new_a0[15] = start_addr + 14;
+              } else {
+                new_a0[2] = start_addr + 1;
+                new_a0[3] = start_addr;
+                new_a0[4] = start_addr + 1;
+                new_a0[5] = start_addr;
+                new_a0[6] = start_addr + 1;
+                new_a0[7] = start_addr;
+                new_a0[8] = start_addr + 1;
+                new_a0[9] = start_addr;
+                new_a0[10] = start_addr + 1;
+                new_a0[11] = start_addr;
+                new_a0[12] = start_addr + 1;
+                new_a0[13] = start_addr;
+                new_a0[14] = start_addr + 1;
+                new_a0[15] = start_addr;
+              }
               this->setA0Content(new_a0, 48);
 
               p->push();
-- 
2.1.0


From junyan.he at inbox.com  Tue Mar 10 00:08:52 2015
From: junyan.he at inbox.com (junyan.he at inbox.com)
Date: Tue, 10 Mar 2015 15:08:52 +0800
Subject: [Beignet] [PATCH 2/2] Backend: Fix errors in disasm for indirect
	instruction Gen8.
Message-ID: <1425971332-5866-1-git-send-email-junyan.he@inbox.com>

From: Junyan He <junyan.he at linux.intel.com>

Signed-off-by: Junyan He <junyan.he at linux.intel.com>
---
 backend/src/backend/gen/gen_mesa_disasm.c | 30 +++++++++---------------------
 1 file changed, 9 insertions(+), 21 deletions(-)

diff --git a/backend/src/backend/gen/gen_mesa_disasm.c b/backend/src/backend/gen/gen_mesa_disasm.c
index e32aae6..711b943 100644
--- a/backend/src/backend/gen/gen_mesa_disasm.c
+++ b/backend/src/backend/gen/gen_mesa_disasm.c
@@ -143,7 +143,7 @@ static const char *_abs[2] = {
   [1] = "(abs)",
 };
 
-static const char *vert_stride_gen7[16] = {
+static const char *vert_stride[16] = {
   [0] = "0",
   [1] = "1",
   [2] = "2",
@@ -153,15 +153,6 @@ static const char *vert_stride_gen7[16] = {
   [6] = "32",
   [15] = "VxH",
 };
-static const char *vert_stride_gen8[16] = {
-  [0] = "0",
-  [1] = "1",
-  [2] = "2",
-  [3] = "4",
-  [4] = "8",
-  [5] = "16",
-  [6] = "32",
-};
 
 static const char *width[8] = {
   [0] = "1",
@@ -717,11 +708,7 @@ static int src_align1_region(FILE *file,
 {
   int err = 0;
   string(file, "<");
-  if (gen_version < 80) {
-    err |= control(file, "vert stride", vert_stride_gen7, _vert_stride, NULL);
-  } else {
-    err |= control(file, "vert stride", vert_stride_gen8, _vert_stride, NULL);
-  }
+  err |= control(file, "vert stride", vert_stride, _vert_stride, NULL);
   string(file, ",");
   err |= control(file, "width", width, _width, NULL);
   string(file, ",");
@@ -801,11 +788,7 @@ static int src_da16(FILE *file,
     format(file, ".%d", 16 / reg_type_size[_reg_type]);
   string(file, "<");
 
-  if (gen_version < 80) {
-    err |= control(file, "vert stride", vert_stride_gen7, _vert_stride, NULL);
-  } else {
-    err |= control(file, "vert stride", vert_stride_gen8, _vert_stride, NULL);
-  }
+  err |= control(file, "vert stride", vert_stride, _vert_stride, NULL);
   string(file, ",4,1>");
   /*
    * Three kinds of swizzle display:
@@ -1022,10 +1005,15 @@ static int src0(FILE *file, const void* inst)
                      GEN_BITS_FIELD(inst, bits2.da1.src0_abs),
                      GEN_BITS_FIELD(inst, bits2.da1.src0_negate));
     } else {
+      int32_t imm_off = GEN_BITS_FIELD(inst, bits2.ia1.src0_indirect_offset);
+      if (gen_version >= 80) {
+        imm_off = imm_off +
+          ((((const union Gen8NativeInstruction *)inst)->bits2.ia1.src0_indirect_offset_9) << 9);
+      }
       return src_ia1(file,
                      GEN_BITS_FIELD(inst, bits1.ia1.src0_reg_type),
                      GEN_BITS_FIELD(inst, bits1.ia1.src0_reg_file),
-                     GEN_BITS_FIELD(inst, bits2.ia1.src0_indirect_offset),
+                     imm_off,
                      GEN_BITS_FIELD(inst, bits2.ia1.src0_subreg_nr),
                      GEN_BITS_FIELD(inst, bits2.ia1.src0_negate),
                      GEN_BITS_FIELD(inst, bits2.ia1.src0_abs),
-- 
2.1.0


From daniel at ffwll.ch  Tue Mar 10 00:37:30 2015
From: daniel at ffwll.ch (Daniel Vetter)
Date: Tue, 10 Mar 2015 08:37:30 +0100
Subject: [Beignet] [Intel-gfx] [PATCH i-g-t 2/2] configure: Bump
 required libdrm version to 2.4.60
In-Reply-To: <1425944462-28293-1-git-send-email-jeff.mcgee@intel.com>
References: <1425943172-28040-1-git-send-email-jeff.mcgee@intel.com>
 <1425944462-28293-1-git-send-email-jeff.mcgee@intel.com>
Message-ID: <20150310073730.GA3800@phenom.ffwll.local>

On Mon, Mar 09, 2015 at 04:41:02PM -0700, jeff.mcgee at intel.com wrote:
> From: Jeff McGee <jeff.mcgee at intel.com>
> 
> tests/core_getparams needs the new libdrm interfaces for
> querying subslice and EU counts.
> 
> For: VIZ-4636
> Signed-off-by: Jeff McGee <jeff.mcgee at intel.com>
> ---
>  configure.ac | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/configure.ac b/configure.ac
> index 16d6a2e..88a1c3d 100644
> --- a/configure.ac
> +++ b/configure.ac
> @@ -82,7 +82,7 @@ if test "x$GCC" = "xyes"; then
>  fi
>  AC_SUBST(ASSEMBLER_WARN_CFLAGS)
>  
> -PKG_CHECK_MODULES(DRM, [libdrm_intel >= 2.4.52 libdrm])
> +PKG_CHECK_MODULES(DRM, [libdrm_intel >= 2.4.60 libdrm])

Please don't; instead, copy-paste the new structs/defines with a local_
prefix, as we do for all the other new igt testcases. Forcing libdrm
to be updated for igt all the time can get annoying fast.
-Daniel

>  PKG_CHECK_MODULES(PCIACCESS, [pciaccess >= 0.10])
>  PKG_CHECK_MODULES(OVERLAY_XVLIB, [xv x11 xext dri2proto >= 2.6], enable_overlay_xvlib=yes, enable_overlay_xvlib=no)
>  PKG_CHECK_MODULES(OVERLAY_XLIB, [cairo-xlib dri2proto >= 2.6], enable_overlay_xlib=yes, enable_overlay_xlib=no)
> -- 
> 2.3.0
> 
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/intel-gfx

-- 
Daniel Vetter
Software Engineer, Intel Corporation
+41 (0) 79 365 57 48 - http://blog.ffwll.ch

From zhigang.gong at linux.intel.com  Tue Mar 10 02:19:33 2015
From: zhigang.gong at linux.intel.com (Zhigang Gong)
Date: Tue, 10 Mar 2015 17:19:33 +0800
Subject: [Beignet] double precision support
In-Reply-To: <54F86D69.9080209@gmail.com>
References: <54F86D69.9080209@gmail.com>
Message-ID: <20150310091932.GF20578@ivb-gt2-rev4>

On Thu, Mar 05, 2015 at 03:51:21PM +0100, Malcolm Roberts wrote:
> Hello.
> 
> I just installed the latest version of beignet from git
> (74390cabe5d2958fec5806a5099bad44c74798f5) and I notice that
> cl_khr_fp64 is not available.  I am a bit late for the conversation
> in July
> (http://lists.freedesktop.org/archives/beignet/2014-July/003599.html)
> but I thought that I would add my voice.
> 
> I have a project to do scientific computation using the OpenCL
> platform, and double precision is vital for my work.  I understand
> that the beignet teams has limited resources and that one must
> prioritize, but I feel that double-precision would be a great
> addition.

So far, double precision support is still not a high-priority
task for Beignet. The two major reasons we already discussed in
the above thread remain unchanged:
1. There is not much demand for it from the community.
2. Double support is not complete. For example, none of the math
   functions, and not even the divide instruction, is supported.

That's why we still haven't started to implement double support.

Thanks,
Zhigang.

> 
> Best,
> 
> ~Malcolm Roberts
> http://malcolmiwroberts.com/
> _______________________________________________
> Beignet mailing list
> Beignet at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/beignet

From chuanbo.weng at intel.com  Tue Mar 10 08:04:02 2015
From: chuanbo.weng at intel.com (Chuanbo Weng)
Date: Tue, 10 Mar 2015 23:04:02 +0800
Subject: [Beignet] [PATCH] Add example to show v4l2 buffer sharing with
	extension clGetMemObjectFdIntel.
Message-ID: <1425999842-9386-1-git-send-email-chuanbo.weng@intel.com>

This example captures YUY2 frames directly into a CL buffer
object by way of DMA, processes them with an OpenCL kernel,
then converts them to NV12 format and displays them with libva.

Signed-off-by: Chuanbo Weng <chuanbo.weng at intel.com>
---
 CMakeLists.txt                                     |  35 +-
 examples/CMakeLists.txt                            |  29 +-
 .../v4l2_buffer_sharing/v4l2_buffer_sharing.cpp    | 571 +++++++++++++++++++++
 kernels/runtime_yuy2_processing.cl                 |  15 +
 4 files changed, 626 insertions(+), 24 deletions(-)
 create mode 100644 examples/v4l2_buffer_sharing/v4l2_buffer_sharing.cpp
 create mode 100644 kernels/runtime_yuy2_processing.cl

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 9c398e9..3674dc5 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -186,23 +186,30 @@ IF(BUILD_EXAMPLES)
 IF(NOT X11_FOUND)
   MESSAGE(FATAL_ERROR "XLib is necessary for examples - not found")
 ENDIF(NOT X11_FOUND)
-# libva
-pkg_check_modules(LIBVA REQUIRED libva>=0.36.0)
-IF(LIBVA_FOUND)
+# libva & libva-x11
+#pkg_check_modules(LIBVA REQUIRED libva>=0.36.0)
+pkg_check_modules(LIBVA REQUIRED libva)
+pkg_check_modules(LIBVA-X11 REQUIRED libva-x11)
+set(LIBVA_BUF_SH_DEP false)
+set(V4L2_BUF_SH_DEP false)
+IF(LIBVA_FOUND AND LIBVA-X11_FOUND)
   MESSAGE(STATUS "Looking for LIBVA - found at ${LIBVA_PREFIX} ${LIBVA_VERSION}")
-  INCLUDE_DIRECTORIES(${LIBVA_INCLUDE_DIRS})
-ELSE(LIBVA_FOUND)
-  MESSAGE(STATUS "Looking for LIBVA (>= 0.36.0) - not found")
-ENDIF(LIBVA_FOUND)
-
-# libva-x11
-pkg_check_modules(LIBVA-X11 REQUIRED libva-x11>=0.36.0)
-IF(LIBVA-X11_FOUND)
   MESSAGE(STATUS "Looking for LIBVA-X11 - found at ${LIBVA-X11_PREFIX} ${LIBVA-X11_VERSION}")
+  INCLUDE_DIRECTORIES(${LIBVA_INCLUDE_DIRS})
   INCLUDE_DIRECTORIES(${LIBVA-X11_INCLUDE_DIRS})
-ELSE(LIBVA-X11_FOUND)
-  MESSAGE(STATUS "Looking for LIBVA-X11 (>= 0.36.0) - not found")
-ENDIF(LIBVA-X11_FOUND)
+  set(V4L2_BUF_SH_DEP true)
+  IF(LIBVA_VERSION VERSION_LESS "0.36.0" OR LIBVA-X11_VERSION VERSION_LESS "0.36.0")
+    IF(LIBVA_VERSION VERSION_LESS "0.36.0")
+      MESSAGE(STATUS "Looking for LIBVA (>= 0.36.0) - not found")
+    ENDIF(LIBVA_VERSION VERSION_LESS "0.36.0")
+    IF(LIBVA-X11_VERSION VERSION_LESS "0.36.0")
+      MESSAGE(STATUS "Looking for LIBVA-X11 (>= 0.36.0) - not found")
+    ENDIF(LIBVA-X11_VERSION VERSION_LESS "0.36.0")
+    MESSAGE(STATUS "Example libva_buffer_sharing will not be built")
+  ELSE(LIBVA_VERSION VERSION_LESS "0.36.0" OR LIBVA-X11_VERSION VERSION_LESS "0.36.0")
+    set(LIBVA_BUF_SH_DEP true)
+  ENDIF(LIBVA_VERSION VERSION_LESS "0.36.0" OR LIBVA-X11_VERSION VERSION_LESS "0.36.0")
+ENDIF(LIBVA_FOUND AND LIBVA-X11_FOUND)
 ENDIF(BUILD_EXAMPLES)
 
 ADD_SUBDIRECTORY(include)
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index 904f259..ab31fe7 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -1,3 +1,9 @@
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}
+                    ${CMAKE_CURRENT_SOURCE_DIR}/../utests
+                    ${CMAKE_CURRENT_SOURCE_DIR}/../include
+                    ${X11_INCLUDE_DIR})
+
+IF(LIBVA_BUF_SH_DEP OR V4L2_BUF_SH_DEP)
 EXEC_PROGRAM(ls ARGS "${CMAKE_CURRENT_SOURCE_DIR}/thirdparty/libva" OUTPUT_VARIABLE LS_OUTPUT)
 IF(NOT LS_OUTPUT)
 EXEC_PROGRAM(git "${CMAKE_CURRENT_SOURCE_DIR}/.." ARGS "submodule init")
@@ -5,17 +11,13 @@ EXEC_PROGRAM(git "${CMAKE_CURRENT_SOURCE_DIR}/.." ARGS "submodule update")
 EXEC_PROGRAM(git "${CMAKE_CURRENT_SOURCE_DIR}/thirdparty/libva" ARGS "checkout master")
 ENDIF(NOT LS_OUTPUT)
 
-INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}
-                    ${CMAKE_CURRENT_SOURCE_DIR}/../utests
-                    ${CMAKE_CURRENT_SOURCE_DIR}/../include
-                    ${CMAKE_CURRENT_SOURCE_DIR}/thirdparty/libva/va
-                    ${CMAKE_CURRENT_SOURCE_DIR}/thirdparty/libva/test/common
-                    ${X11_INCLUDE_DIR})
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}/thirdparty/libva/va
+                    ${CMAKE_CURRENT_SOURCE_DIR}/thirdparty/libva/test/common)
 
 link_directories (${LIBVA_LIBDIR}
                   ${LIBVA-X11_LIBDIR})
 
-set (examples_sources
+set (va_ocl_basic_sources
   ../utests/utest_error.c
   ../utests/utest_assert.cpp
   ../utests/utest_file_map.cpp
@@ -23,13 +25,20 @@ set (examples_sources
   ./thirdparty/libva/test/common/va_display.c
   ./thirdparty/libva/test/common/va_display_x11.c)
 
-
 ADD_DEFINITIONS(-DHAVE_VA_X11)
-ADD_DEFINITIONS(-DINPUT_NV12_DEFAULT="${CMAKE_CURRENT_SOURCE_DIR}/libva_buffer_sharing/256_128.nv12")
 
-ADD_LIBRARY(va_ocl_basic SHARED ${examples_sources})
+ADD_LIBRARY(va_ocl_basic SHARED ${va_ocl_basic_sources})
 
 TARGET_LINK_LIBRARIES(va_ocl_basic cl m va va-x11 ${X11_X11_LIB})
 
+IF(LIBVA_BUF_SH_DEP)
+ADD_DEFINITIONS(-DINPUT_NV12_DEFAULT="${CMAKE_CURRENT_SOURCE_DIR}/libva_buffer_sharing/256_128.nv12")
 ADD_EXECUTABLE(example-libva_buffer_sharing ./libva_buffer_sharing/libva_buffer_sharing.cpp)
 TARGET_LINK_LIBRARIES(example-libva_buffer_sharing va_ocl_basic)
+ENDIF(LIBVA_BUF_SH_DEP)
+
+IF(V4L2_BUF_SH_DEP)
+ADD_EXECUTABLE(example-v4l2_buffer_sharing ./v4l2_buffer_sharing/v4l2_buffer_sharing.cpp)
+TARGET_LINK_LIBRARIES(example-v4l2_buffer_sharing va_ocl_basic)
+ENDIF(V4L2_BUF_SH_DEP)
+ENDIF(LIBVA_BUF_SH_DEP OR V4L2_BUF_SH_DEP)
diff --git a/examples/v4l2_buffer_sharing/v4l2_buffer_sharing.cpp b/examples/v4l2_buffer_sharing/v4l2_buffer_sharing.cpp
new file mode 100644
index 0000000..bf0dbdf
--- /dev/null
+++ b/examples/v4l2_buffer_sharing/v4l2_buffer_sharing.cpp
@@ -0,0 +1,571 @@
+/*
+ ** Copyright (c) 2012, 2015 Intel Corporation. All Rights Reserved.
+ **
+ ** Permission is hereby granted, free of charge, to any person obtaining a
+ ** copy of this software and associated documentation files (the
+ ** "Software"), to deal in the Software without restriction, including
+ ** without limitation the rights to use, copy, modify, merge, publish,
+ ** distribute, sub license, and/or sell copies of the Software, and to
+ ** permit persons to whom the Software is furnished to do so, subject to
+ ** the following conditions:
+ **
+ ** The above copyright notice and this permission notice (including the
+ ** next paragraph) shall be included in all copies or substantial portions
+ ** of the Software.
+ **
+ ** THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ ** OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ ** MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ ** IN NO EVENT SHALL PRECISION INSIGHT AND/OR ITS SUPPLIERS BE LIABLE FOR
+ ** ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ ** TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ ** SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ **/
+
+#include <getopt.h>
+#include <errno.h>
+#include <assert.h>
+#include <fcntl.h>
+#include <linux/videodev2.h>
+#include <poll.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+#include <sys/time.h>
+#include <time.h>
+
+#include <inttypes.h>
+#include <ctype.h>
+
+#include <va/va.h>
+#include <va/va_drmcommon.h>
+
+#include "va_display.h"
+#include "utest_helper.hpp"
+
+using namespace std;
+
+#define BUFFER_NUM_DEFAULT 5
+#define VIDEO_NODE_DEFAULT "/dev/video0"
+#define WIDTH_DEFAULT 640
+#define HEIGHT_DEFAULT 480
+
+#define CHECK_VASTATUS(va_status,func)                                  \
+  if (va_status != VA_STATUS_SUCCESS) {                                   \
+    fprintf(stderr, "status = %d, %s: %s(line %d) failed, exit\n",va_status, __func__, func, __LINE__); \
+    exit(1);                                                            \
+  }
+
+#define CHECK_CLSTATUS(status,func)                                  \
+  if (status != CL_SUCCESS) {                                   \
+    fprintf(stderr, "status = %d, %s: %s(line %d) failed, exit\n", status, __func__, func, __LINE__); \
+    exit(1);                                                            \
+  }
+
+#define CHECK_V4L2ERROR(ret, STR)                               \
+  if (ret){                             \
+    fprintf(stderr, STR);            \
+    perror(" ");                            \
+    fprintf(stderr, "ret = %d, %s: %s(line %d) failed, exit\n", ret, __func__, STR, __LINE__);      \
+    exit(1);                                  \
+  }
+
+VADisplay	va_dpy;
+cl_int cl_status;
+VAStatus va_status;
+VASurfaceID nv12_surface_id;
+VAImage nv12_image;
+
+int dev_fd;
+uint64_t image_size;
+unsigned int pitch;
+cl_int status;  
+cl_mem *input_buffer = NULL;
+typedef cl_int (OCLGETMEMOBJECTFD)(cl_context, cl_mem, int *);
+OCLGETMEMOBJECTFD *oclGetMemObjectFd = NULL;
+
+int frame_count = 0;
+struct v4l2_options{
+  const char *dev_name;
+  unsigned int width, height;
+  unsigned int spec_res;
+  unsigned int buffer_num;
+  unsigned int do_list;
+} vo;
+int *import_buf_fd = NULL;
+
+static const char short_options[] = "d:r:b:lh";
+
+static const struct option
+long_options[] = {
+  { "device", required_argument, NULL, 'd' },
+  { "help",   no_argument,       NULL, 'h' },
+  { "resolution", required_argument,       NULL, 'r' },
+  { "buffer_num",  required_argument, NULL, 'b' },
+  { "list",  no_argument, NULL, 'l' },
+  { 0, 0, 0, 0 }
+};
+
+static void usage(FILE *fp, int argc, char **argv)
+{
+  fprintf(fp,
+      "Usage: %s [options]\n\n"
+      "Options:\n"
+      "-d | --device=<dev>  Specify device by <dev> instead of /dev/video0\n"
+      "-h | --help          Print this message\n"
+      "-r | --resolution=<width,height>    Set image resolution\n"
+      "-b | --buffer_num=<num>  Set number of buffers\n"
+      "-l | --list  List available resolution of format 'V4L2_PIX_FMT_YUYV'\n"
+      "",
+      argv[0]);
+}
+
+/*
+ * Print the frame sizes the capture device reports for V4L2_PIX_FMT_YUYV.
+ *
+ * Opens vo.dev_name into the global dev_fd, requires the VIDEO_CAPTURE and
+ * STREAMING capabilities, enumerates frame sizes with VIDIOC_ENUM_FRAMESIZES
+ * and closes the device again. Open/close failures and missing capabilities
+ * print a message and exit(1).
+ */
+static void list_resolution(){
+  int ret;
+  struct v4l2_capability cap;
+  struct v4l2_frmsizeenum frm_sz;
+
+  dev_fd = open(vo.dev_name, O_RDWR | O_NONBLOCK, 0);
+  if (dev_fd < 0) {
+    fprintf(stderr, "Can not open %s: %s\n",
+        vo.dev_name, strerror(errno));
+    exit(1);
+  }
+
+  memset(&cap, 0, sizeof(cap));
+  ret = ioctl(dev_fd, VIDIOC_QUERYCAP, &cap);
+  CHECK_V4L2ERROR(ret, "VIDIOC_QUERYCAP");
+
+  if(!(cap.capabilities & V4L2_CAP_VIDEO_CAPTURE)){
+    fprintf(stderr, "The device is not video capture device\n");
+    exit(1);
+  }
+  if(!(cap.capabilities & V4L2_CAP_STREAMING)){
+    fprintf(stderr, "The device does not support streaming i/o\n");
+    exit(1);
+  }
+
+  printf("Supported resolution under pixel format 'V4L2_PIX_FMT_YUYV':\n");
+  // Zero the whole request so fields we do not set are not stack garbage.
+  memset(&frm_sz, 0, sizeof(frm_sz));
+  frm_sz.pixel_format = V4L2_PIX_FMT_YUYV;
+  frm_sz.index = 0;
+  bool extra_info = true;
+  while (ioctl(dev_fd, VIDIOC_ENUM_FRAMESIZES, &frm_sz) == 0) {
+    if (frm_sz.type == V4L2_FRMSIZE_TYPE_DISCRETE) {
+      // Print the column header once, before the first discrete entry.
+      if(extra_info){
+        printf("(width, height) = \n");
+        extra_info = false;
+      }
+      printf("(%d, %d)", frm_sz.discrete.width, frm_sz.discrete.height);
+      printf("\n");
+    }
+    else if (frm_sz.type == V4L2_FRMSIZE_TYPE_STEPWISE) {
+      printf("(width, height) from (%d, %d) to (%d, %d) with step (%d, %d)",
+          frm_sz.stepwise.min_width,
+          frm_sz.stepwise.min_height,
+          frm_sz.stepwise.max_width,
+          frm_sz.stepwise.max_height,
+          frm_sz.stepwise.step_width,
+          frm_sz.stepwise.step_height);
+      printf("\n");
+      // A stepwise range is reported as a single entry. The previous
+      // `continue` re-issued the ioctl with the same index, which succeeds
+      // every time and never terminates, so stop enumerating here.
+      break;
+    }
+    frm_sz.index++;
+  }
+
+  ret = close(dev_fd);
+  if (ret) {
+    fprintf(stderr, "Failed to close %s: %s\n",
+        vo.dev_name, strerror(errno));
+    exit(1);
+  }
+}
+
+/*
+ * Parse the command line into the global option block `vo`.
+ *
+ * Defaults are applied first; -d/-r/-b/-l override them. -h (and '?') print
+ * the usage text and exit(0). A malformed or non-positive resolution, a zero
+ * buffer count, or an unexpected getopt result prints a message to stderr and
+ * exits with status 1. When -l was given the supported resolutions are listed
+ * and the process exits instead of returning.
+ */
+static void analyse_args(int argc, char *argv[])
+{
+  vo.dev_name = NULL;
+  vo.width = 0;
+  vo.height = 0;
+  vo.spec_res = 0;
+  vo.buffer_num = BUFFER_NUM_DEFAULT;
+  vo.do_list = 0;
+
+  int c, idx;
+  for (;;) {
+
+    c = getopt_long(argc, argv,
+        short_options, long_options, &idx);
+
+    if (-1 == c)
+      break;
+
+    switch (c) {
+      case 0:
+        break;
+
+      case 'd':
+        vo.dev_name = optarg;
+        break;
+
+      case '?':
+      case 'h':
+        usage(stdout, argc, argv);
+        exit(0);
+
+      case 'r':
+        // Both numbers must parse; previously a malformed argument was
+        // accepted and the program carried on with a 0 width/height.
+        if (sscanf(optarg, "%d,%d", &vo.width, &vo.height) != 2 ||
+            vo.width <= 0 || vo.height <= 0) {
+          fprintf(stderr, "Invalid resolution '%s', expected <width,height>\n",
+              optarg);
+          usage(stderr, argc, argv);
+          exit(1);
+        }
+        vo.spec_res = 1;
+        break;
+
+      case 'b':
+        vo.buffer_num = strtoul(optarg, NULL, 0);
+        // strtoul yields 0 for non-numeric input; capturing needs a buffer.
+        if (vo.buffer_num == 0) {
+          fprintf(stderr, "Invalid buffer number '%s'\n", optarg);
+          usage(stderr, argc, argv);
+          exit(1);
+        }
+        break;
+
+      case 'l':
+        vo.do_list = 1;
+        break;
+
+      default:
+        usage(stderr, argc, argv);
+        exit(1);
+    }
+  }
+
+  if(!vo.dev_name){
+    printf("Haven't specified device, use default device: %s\n",
+        VIDEO_NODE_DEFAULT);
+    vo.dev_name = VIDEO_NODE_DEFAULT;
+  }
+  if(vo.do_list){
+    list_resolution();
+    exit(0);
+  }
+  if(!vo.spec_res){
+    printf("Haven't specified resolution, use default resolution: (width,height) = (%d, %d)\n",
+        WIDTH_DEFAULT, HEIGHT_DEFAULT);
+    vo.width = WIDTH_DEFAULT;
+    vo.height = HEIGHT_DEFAULT;
+  }
+  return;
+}
+
+/*
+ * Bring up libva and OpenCL for the example.
+ *
+ * libva: opens the display, initialises it, and creates the NV12 surface and
+ * NV12 image (both vo.width x vo.height) used later for display.
+ * OpenCL: builds the "runtime_yuy2_processing" kernel through cl_test_init and
+ * resolves the clGetMemObjectFdIntel extension entry point into
+ * oclGetMemObjectFd. A failed cl_test_init or a missing extension exits(1).
+ */
+static void initialize_va_ocl(){
+  int major_ver, minor_ver;
+
+  printf("\n***********************libva info: ***********************\n");
+  fflush(stdout);
+  va_dpy = va_open_display();
+  va_status = vaInitialize(va_dpy, &major_ver, &minor_ver);
+  CHECK_VASTATUS(va_status, "vaInitialize");
+
+  // Ask for an NV12-backed surface via a pixel-format attribute.
+  VASurfaceAttrib forcc;
+  forcc.type = VASurfaceAttribPixelFormat;
+  forcc.flags = VA_SURFACE_ATTRIB_SETTABLE;
+  forcc.value.type = VAGenericValueTypeInteger;
+  forcc.value.value.i = VA_FOURCC_NV12;
+  va_status = vaCreateSurfaces(va_dpy, VA_RT_FORMAT_YUV420,
+                               vo.width, vo.height,
+                               &nv12_surface_id, 1, &forcc, 1);
+  CHECK_VASTATUS(va_status, "vaCreateSurfaces");
+
+  VAImageFormat image_fmt;
+  // Only three fields are assigned below; zero the rest so the driver is not
+  // handed indeterminate values for the fields we leave alone.
+  memset(&image_fmt, 0, sizeof(image_fmt));
+  image_fmt.fourcc = VA_FOURCC_NV12;
+  image_fmt.byte_order = VA_LSB_FIRST;
+  image_fmt.bits_per_pixel = 12;
+  va_status = vaCreateImage(va_dpy, &image_fmt, vo.width, vo.height, &nv12_image);
+  CHECK_VASTATUS(va_status, "vaCreateImage");
+
+  //ocl initialization: basic & create kernel & get extension
+  printf("\n***********************OpenCL info: ***********************\n");
+  if ((cl_status = cl_test_init("runtime_yuy2_processing.cl", "runtime_yuy2_processing", SOURCE)) != 0){
+    fprintf(stderr, "cl_test_init error\n");
+    exit(1);
+  }
+
+  // The platform-scoped lookup is used when the headers provide OpenCL 1.2.
+#ifdef CL_VERSION_1_2
+  oclGetMemObjectFd = (OCLGETMEMOBJECTFD *)clGetExtensionFunctionAddressForPlatform(platform, "clGetMemObjectFdIntel");
+#else
+  oclGetMemObjectFd = (OCLGETMEMOBJECTFD *)clGetExtensionFunctionAddress("clGetMemObjectFdIntel");
+#endif
+  if(!oclGetMemObjectFd){
+    fprintf(stderr, "Failed to get extension clGetMemObjectFdIntel\n");
+    exit(1);
+  }
+  printf("\n***********************************************************\n");
+}
+
+/*
+ * Allocate the per-buffer tables (import_buf_fd, input_buffer) on first use,
+ * then create vo.buffer_num OpenCL buffers of image_size bytes and fetch a
+ * file descriptor for each through the clGetMemObjectFdIntel extension.
+ * A failed table allocation exits(1); OpenCL failures go through
+ * CHECK_CLSTATUS.
+ */
+static void create_dmasharing_buffers()
+{
+  if(import_buf_fd == NULL)
+    import_buf_fd = (int *)malloc(sizeof(int) * vo.buffer_num);
+  if(input_buffer == NULL)
+    input_buffer = (cl_mem *)malloc(sizeof(cl_mem) * vo.buffer_num);
+  // malloc(0) may legitimately return NULL, so only treat NULL as a failure
+  // when at least one buffer was requested.
+  if(vo.buffer_num > 0 && (import_buf_fd == NULL || input_buffer == NULL)){
+    fprintf(stderr, "Failed to allocate dma buffer tables\n");
+    exit(1);
+  }
+
+  for (unsigned int i = 0; i < vo.buffer_num; ++i){
+    input_buffer[i] = clCreateBuffer(ctx, CL_MEM_READ_WRITE, image_size, NULL, &cl_status);
+    // Check the creation status before it is overwritten by the next call.
+    CHECK_CLSTATUS(cl_status, "clCreateBuffer");
+
+    //get cl buffer object's fd
+    cl_status = oclGetMemObjectFd(ctx, input_buffer[i], &import_buf_fd[i]);
+    CHECK_CLSTATUS(cl_status, "clGetMemObjectFdIntel");
+  }
+}
+
+/*
+ * Release the libva objects (NV12 surface, NV12 image, display) and then, for
+ * each of the vo.buffer_num buffers, close its file descriptor and release the
+ * OpenCL memory object.
+ */
+static void release_va_ocl(){
+  vaDestroySurfaces(va_dpy, &nv12_surface_id, 1);
+  vaDestroyImage(va_dpy, nv12_image.image_id);
+  vaTerminate(va_dpy);
+  va_close_display(va_dpy);
+
+  unsigned int idx = 0;
+  while (idx < vo.buffer_num) {
+    close(import_buf_fd[idx]);
+    clReleaseMemObject(input_buffer[idx]);
+    ++idx;
+  }
+}
+
+/*
+ * Process captured buffer `index` with the OpenCL kernel and display it.
+ *
+ * Steps: run the kernel on input_buffer[index]; wrap the same memory (via its
+ * fd in import_buf_fd[index]) in a temporary YUY2 VASurface; copy it through
+ * the NV12 image into the NV12 surface; show that surface; destroy the
+ * temporary surface.
+ */
+static void process_show_frame(int index)
+{
+  //process input_buffer[index] by ocl
+  size_t global_size[2];
+  global_size[0] = vo.width * 2 / 4;
+  // The kernel in runtime_yuy2_processing.cl addresses row height/2 + gy, so
+  // only the rows from height/2 to height-1 may be launched; launching
+  // `height` rows made it index past the end of the image.
+  global_size[1] = vo.height - vo.height / 2;
+  OCL_CALL (clSetKernelArg, kernel, 0, sizeof(cl_mem), &input_buffer[index]);
+  OCL_CALL (clSetKernelArg, kernel, 1, sizeof(int), &vo.height);
+  OCL_CALL (clSetKernelArg, kernel, 2, sizeof(int), &pitch);
+  OCL_CALL (clEnqueueNDRangeKernel, queue, kernel, 2, NULL,
+            global_size, NULL, 0, NULL, NULL);
+  OCL_CALL (clFinish, queue);
+  printf("\nSuccessfully use ocl to do processing...\n");
+
+  //create corresponding VASurface
+  VASurfaceID yuy2_surface_id;
+  VASurfaceAttrib sa[2];
+  sa[0].type = VASurfaceAttribMemoryType;
+  sa[0].flags = VA_SURFACE_ATTRIB_SETTABLE;
+  sa[0].value.type = VAGenericValueTypeInteger;
+  sa[0].value.value.i = VA_SURFACE_ATTRIB_MEM_TYPE_DRM_PRIME;
+  sa[1].type = VASurfaceAttribExternalBufferDescriptor;
+  sa[1].flags = VA_SURFACE_ATTRIB_SETTABLE;
+  sa[1].value.type = VAGenericValueTypePointer;
+
+  // One handle is all that is needed, so keep it on the stack instead of an
+  // unchecked per-frame malloc; the temporary surface is destroyed before
+  // this function returns.
+  unsigned long buffer_handle = import_buf_fd[index];
+  VASurfaceAttribExternalBuffers sa_eb;
+  // Zero the descriptor so members not assigned below are not indeterminate.
+  memset(&sa_eb, 0, sizeof(sa_eb));
+  sa_eb.pixel_format = VA_FOURCC_YUY2;
+  sa_eb.width = vo.width;
+  sa_eb.height = vo.height;
+  sa_eb.data_size = image_size;
+  sa_eb.num_planes = 1;
+  sa_eb.pitches[0] = pitch;
+  sa_eb.offsets[0] = 0;
+  sa_eb.num_buffers = 1;
+  sa_eb.buffers = &buffer_handle;
+  sa_eb.flags = 0;
+  sa[1].value.value.p = &sa_eb;
+  va_status = vaCreateSurfaces(va_dpy, VA_RT_FORMAT_YUV422,
+      vo.width, vo.height,
+      &yuy2_surface_id, 1, sa, 2);
+  // Previously this status was overwritten by vaGetImage without a check.
+  CHECK_VASTATUS(va_status, "vaCreateSurfaces");
+
+  //convert to NV12 format
+  va_status = vaGetImage (va_dpy, yuy2_surface_id, 0, 0, vo.width, vo.height, nv12_image.image_id);
+  CHECK_VASTATUS(va_status, "vaGetImage");
+  va_status = vaPutImage(va_dpy, nv12_surface_id, nv12_image.image_id, 0, 0, vo.width, vo.height, 0, 0, vo.width, vo.height);
+  CHECK_VASTATUS(va_status, "vaPutImage");
+
+  //show by vaPutsurface
+  VARectangle src_rect, dst_rect;
+  src_rect.x      = 0;
+  src_rect.y      = 0;
+  src_rect.width  = vo.width;
+  src_rect.height = vo.height;
+  dst_rect        = src_rect;
+  va_status = va_put_surface(va_dpy, nv12_surface_id, &src_rect, &dst_rect);
+  CHECK_VASTATUS(va_status, "vaPutSurface");
+
+  vaDestroySurfaces(va_dpy,&yuy2_surface_id,1);
+  return;
+}
+
+/*
+ * Request vo.buffer_num DMABUF capture buffers from the device (dev_fd) and
+ * then create the matching OpenCL buffers. EINVAL from VIDIOC_REQBUFS is
+ * reported as "not supported" and exits(1); any other result is handed to
+ * CHECK_V4L2ERROR.
+ */
+static void init_dmabuf(void){
+  struct v4l2_requestbuffers reqbuf;
+  memset(&reqbuf, 0, sizeof(reqbuf));
+  reqbuf.count = vo.buffer_num;
+  reqbuf.memory = V4L2_MEMORY_DMABUF;
+  reqbuf.type = V4L2_BUF_TYPE_VIDEO_CAPTURE;
+
+  int ret = ioctl(dev_fd, VIDIOC_REQBUFS, &reqbuf);
+  if(ret == -1 && errno == EINVAL){
+    fprintf(stderr, "Video capturing or DMABUF streaming is not supported\n");
+    exit(1);
+  }
+  // Reached only when the EINVAL case above did not apply.
+  CHECK_V4L2ERROR(ret, "VIDIOC_REQBUFS");
+
+  create_dmasharing_buffers();
+  printf("Succeed to create %d dma buffers \n", vo.buffer_num);
+}
+
+/*
+ * Open the capture device and configure it for YUYV at vo.width x vo.height.
+ *
+ * The device must advertise VIDEO_CAPTURE and STREAMING. After VIDIOC_S_FMT
+ * the format is read back with VIDIOC_G_FMT; if the size differs from the
+ * request the program exits(1). On success the globals image_size and pitch
+ * are taken from the reported sizeimage and bytesperline.
+ */
+static void init_device(void){
+  dev_fd = open(vo.dev_name, O_RDWR | O_NONBLOCK, 0);
+  if (dev_fd < 0) {
+    fprintf(stderr, "Can not open %s: %s\n",
+        vo.dev_name, strerror(errno));
+    exit(1);
+  }
+
+  int ret;
+  struct v4l2_capability cap;
+  memset(&cap, 0, sizeof(cap));
+  ret = ioctl(dev_fd, VIDIOC_QUERYCAP, &cap);
+  CHECK_V4L2ERROR(ret, "VIDIOC_QUERYCAP");
+
+  if((cap.capabilities & V4L2_CAP_VIDEO_CAPTURE) == 0){
+    fprintf(stderr, "The device is not video capture device\n");
+    exit(1);
+  }
+  if((cap.capabilities & V4L2_CAP_STREAMING) == 0){
+    fprintf(stderr, "The device does not support streaming i/o\n");
+    exit(1);
+  }
+
+  struct v4l2_format format;
+  memset(&format, 0, sizeof(format));
+  format.type = V4L2_BUF_TYPE_VIDEO_CAPTURE;
+  format.fmt.pix.pixelformat = V4L2_PIX_FMT_YUYV;
+  format.fmt.pix.field = V4L2_FIELD_ANY;
+  format.fmt.pix.width = vo.width;
+  format.fmt.pix.height = vo.height;
+
+  ret = ioctl(dev_fd, VIDIOC_S_FMT, &format);
+  CHECK_V4L2ERROR(ret, "VIDIOC_S_FMT");
+
+  // Read the format back and compare it with what was asked for.
+  ret = ioctl(dev_fd, VIDIOC_G_FMT, &format);
+  CHECK_V4L2ERROR(ret, "VIDIOC_G_FMT");
+  if(format.fmt.pix.width != vo.width  || format.fmt.pix.height != vo.height){
+    fprintf(stderr, "This resolution is not supported, please go through supported resolution by command './main -l'\n");
+    exit(1);
+  }
+
+  printf("Input image format: (width, height) = (%u, %u), pixel format = %.4s\n",
+      format.fmt.pix.width, format.fmt.pix.height, (char*)&format.fmt.pix.pixelformat);
+  image_size = format.fmt.pix.sizeimage;
+  pitch = format.fmt.pix.bytesperline;
+}
+
+/*
+ * Queue every DMABUF buffer (fd taken from import_buf_fd) on the capture
+ * device and then switch streaming on.
+ */
+static void start_capturing(void){
+  int ret;
+
+  unsigned int slot = 0;
+  while (slot < vo.buffer_num) {
+    struct v4l2_buffer buf;
+    memset(&buf, 0, sizeof(buf));
+    buf.index = slot;
+    buf.type = V4L2_BUF_TYPE_VIDEO_CAPTURE;
+    buf.memory = V4L2_MEMORY_DMABUF;
+    buf.m.fd = import_buf_fd[slot];
+
+    ret = ioctl(dev_fd, VIDIOC_QBUF, &buf);
+    CHECK_V4L2ERROR(ret, "VIDIOC_QBUF");
+    ++slot;
+  }
+
+  int type = V4L2_BUF_TYPE_VIDEO_CAPTURE;
+  ret = ioctl(dev_fd, VIDIOC_STREAMON, &type);
+  CHECK_V4L2ERROR(ret, "VIDIOC_STREAMON");
+}
+
+/*
+ * Capture loop: wait (up to 2 seconds) for the device to become readable,
+ * dequeue a filled buffer, process and display it, then queue it again.
+ * The loop has no normal exit; a select timeout or select failure exits(1).
+ */
+static void mainloop(void){
+  int ret;
+  struct v4l2_buffer buf;
+  int index;
+
+  while (1) {
+    frame_count++;
+    printf("******************Frame %d\n", frame_count);
+    fd_set fds;
+    struct timeval tv;
+    int r;
+
+    FD_ZERO(&fds);
+    FD_SET(dev_fd, &fds);
+
+    /* Timeout. */
+    tv.tv_sec = 2;
+    tv.tv_usec = 0;
+
+
+    r = select(dev_fd + 1, &fds, NULL, NULL, &tv);
+
+    if (-1 == r) {
+      // An interrupted wait is simply retried.
+      if (EINTR == errno)
+        continue;
+      // Any other failure is fatal; previously execution fell through to
+      // VIDIOC_DQBUF even though select had failed.
+      perror("select");
+      exit(1);
+    }
+
+    if(r == 0){
+      fprintf(stderr, "Select timeout\n");
+      exit(1);
+    }
+
+    memset(&buf, 0, sizeof(buf));
+    buf.type = V4L2_BUF_TYPE_VIDEO_CAPTURE;
+    buf.memory = V4L2_MEMORY_DMABUF;
+    ret = ioctl(dev_fd, VIDIOC_DQBUF, &buf);
+    CHECK_V4L2ERROR(ret, "VIDIOC_DQBUF");
+    index = buf.index;
+
+    //process by ocl and show on screen by libva
+    process_show_frame(index);
+
+    //Then queue this buffer(buf.index) by QBUF
+    buf.type = V4L2_BUF_TYPE_VIDEO_CAPTURE;
+    buf.memory = V4L2_MEMORY_DMABUF;
+    buf.m.fd = import_buf_fd[index];
+    buf.index = index;
+
+    ret = ioctl(dev_fd, VIDIOC_QBUF, &buf);
+    CHECK_V4L2ERROR(ret, "VIDIOC_QBUF");
+  }
+}
+
+/* Switch video-capture streaming off on dev_fd. */
+static void stop_capturing(void)
+{
+  int type = V4L2_BUF_TYPE_VIDEO_CAPTURE;
+  int ret = ioctl(dev_fd, VIDIOC_STREAMOFF, &type);
+
+  CHECK_V4L2ERROR(ret, "VIDIOC_STREAMOFF");
+}
+
+/*
+ * Free the per-buffer tables and close the capture device; a failing close
+ * prints the reason and exits(1).
+ */
+static void uninit_device(void){
+  // create_dmasharing_buffers() decides whether to allocate by testing these
+  // globals against NULL, so do not leave them pointing at freed memory.
+  free(import_buf_fd);
+  import_buf_fd = NULL;
+  free(input_buffer);
+  input_buffer = NULL;
+
+  int ret = close(dev_fd);
+  if (ret) {
+    fprintf(stderr, "Failed to close %s: %s\n",
+        vo.dev_name, strerror(errno));
+    exit(1);
+  }
+}
+
+/*
+ * Program entry: parse options, set up the device, libva/OpenCL and the
+ * shared buffers, run the capture loop, then tear everything down.
+ */
+int main(int argc, char *argv[])
+{
+  // Option parsing (may list resolutions and exit).
+  analyse_args(argc, argv);
+
+  // Set-up.
+  init_device();
+  initialize_va_ocl();
+  init_dmabuf();
+
+  // Capture.
+  start_capturing();
+  mainloop();
+
+  // Tear-down.
+  stop_capturing();
+  release_va_ocl();
+  uninit_device();
+
+  return 0;
+}
diff --git a/kernels/runtime_yuy2_processing.cl b/kernels/runtime_yuy2_processing.cl
new file mode 100644
index 0000000..1478e65
--- /dev/null
+++ b/kernels/runtime_yuy2_processing.cl
@@ -0,0 +1,15 @@
+/*
+ * In-place row copy on a byte image: each work-item copies 4 bytes from row
+ * (image_height - src_y) to row src_y, where src_y = image_height/2 + gy.
+ *
+ * src          - image bytes, rows image_pitch bytes apart
+ * image_height - number of rows
+ * image_pitch  - row stride in bytes
+ *
+ * NOTE(review): the source row is image_height - src_y rather than
+ * image_height - 1 - src_y, so row image_height/2 maps onto itself when the
+ * height is even; confirm whether an exact mirror was intended.
+ */
+__kernel void
+runtime_yuy2_processing(__global uchar *src,
+                        int image_height,
+                        int image_pitch)
+{
+  int gx = get_global_id(0);
+  int gy = get_global_id(1);
+
+  int src_y = image_height / 2 + gy;
+  // Work-items whose destination row lies below the image must do nothing;
+  // without this guard a launch with more than image_height/2 rows reads and
+  // writes outside the image.
+  if (src_y >= image_height)
+    return;
+  int mirror_y = image_height - src_y;
+
+  uchar4 mirror_val = *(__global uchar4*)(src + mirror_y*image_pitch + gx*4);
+  *(__global uchar4*)(src + src_y*image_pitch + gx*4) = mirror_val;
+
+}
-- 
1.9.1


From jeff.mcgee at intel.com  Tue Mar 10 09:59:15 2015
From: jeff.mcgee at intel.com (Jeff McGee)
Date: Tue, 10 Mar 2015 09:59:15 -0700
Subject: [Beignet] [Intel-gfx] [PATCH i-g-t 2/2] configure: Bump
 required libdrm version to 2.4.60
In-Reply-To: <20150310073730.GA3800@phenom.ffwll.local>
References: <1425943172-28040-1-git-send-email-jeff.mcgee@intel.com>
 <1425944462-28293-1-git-send-email-jeff.mcgee@intel.com>
 <20150310073730.GA3800@phenom.ffwll.local>
Message-ID: <20150310165915.GG3263@jeffdesk>

On Tue, Mar 10, 2015 at 08:37:30AM +0100, Daniel Vetter wrote:
> On Mon, Mar 09, 2015 at 04:41:02PM -0700, jeff.mcgee at intel.com wrote:
> > From: Jeff McGee <jeff.mcgee at intel.com>
> > 
> > tests/core_getparams needs the new libdrm interfaces for
> > querying subslice and EU counts.
> > 
> > For: VIZ-4636
> > Signed-off-by: Jeff McGee <jeff.mcgee at intel.com>
> > ---
> >  configure.ac | 2 +-
> >  1 file changed, 1 insertion(+), 1 deletion(-)
> > 
> > diff --git a/configure.ac b/configure.ac
> > index 16d6a2e..88a1c3d 100644
> > --- a/configure.ac
> > +++ b/configure.ac
> > @@ -82,7 +82,7 @@ if test "x$GCC" = "xyes"; then
> >  fi
> >  AC_SUBST(ASSEMBLER_WARN_CFLAGS)
> >  
> > -PKG_CHECK_MODULES(DRM, [libdrm_intel >= 2.4.52 libdrm])
> > +PKG_CHECK_MODULES(DRM, [libdrm_intel >= 2.4.60 libdrm])
> 
> Please don't and instead copypaste the new structs/defines with a local_
> prefix like we do it for all the other new igt testcases. Forcing libdrm
> to get updated for igt all the time can get annoying fast.
> -Daniel
> 
In this case I'm trying to exercise new API functions in libdrm which
wrap the GETPARAM ioctl. Would you rather me bypass the wrapper to
avoid requiring updated libdrm? I can do that, but it fails to test the
complete path that client would use.
-Jeff

> >  PKG_CHECK_MODULES(PCIACCESS, [pciaccess >= 0.10])
> >  PKG_CHECK_MODULES(OVERLAY_XVLIB, [xv x11 xext dri2proto >= 2.6], enable_overlay_xvlib=yes, enable_overlay_xvlib=no)
> >  PKG_CHECK_MODULES(OVERLAY_XLIB, [cairo-xlib dri2proto >= 2.6], enable_overlay_xlib=yes, enable_overlay_xlib=no)
> > -- 
> > 2.3.0
> > 
> > _______________________________________________
> > Intel-gfx mailing list
> > Intel-gfx at lists.freedesktop.org
> > http://lists.freedesktop.org/mailman/listinfo/intel-gfx
> 
> -- 
> Daniel Vetter
> Software Engineer, Intel Corporation
> +41 (0) 79 365 57 48 - http://blog.ffwll.ch
> _______________________________________________
> Beignet mailing list
> Beignet at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/beignet

From mattst88 at gmail.com  Tue Mar 10 10:48:33 2015
From: mattst88 at gmail.com (Matt Turner)
Date: Tue, 10 Mar 2015 10:48:33 -0700
Subject: [Beignet] [PATCH 6/7] replace mad with llvm intrinsic.
In-Reply-To: <1425967188-22075-6-git-send-email-xionghu.luo@intel.com>
References: <1425967188-22075-1-git-send-email-xionghu.luo@intel.com>
 <1425967188-22075-6-git-send-email-xionghu.luo@intel.com>
Message-ID: <CAEdQ38GTpesq8qGadXkF9c6rdimNtyEnbxT2LNU3-7XjYrToXg@mail.gmail.com>

On Mon, Mar 9, 2015 at 10:59 PM,  <xionghu.luo at intel.com> wrote:
> From: Luo Xionghu <xionghu.luo at intel.com>
>
> translate native mad to llvm.fma.

I'm not sure that it matters for this patch, but do we know if Gen's
MAD instruction is a fused-multiply-add? That is, does it not do an
intermediate rounding step after the multiply?

From mattst88 at gmail.com  Tue Mar 10 10:59:05 2015
From: mattst88 at gmail.com (Matt Turner)
Date: Tue, 10 Mar 2015 10:59:05 -0700
Subject: [Beignet] double precision support
In-Reply-To: <20150310091932.GF20578@ivb-gt2-rev4>
References: <54F86D69.9080209@gmail.com> <20150310091932.GF20578@ivb-gt2-rev4>
Message-ID: <CAEdQ38HRVqZ4V3ik9sxvvJnM8wxj0us2S2gV8m=2aiESE-9vpA@mail.gmail.com>

On Tue, Mar 10, 2015 at 2:19 AM, Zhigang Gong
<zhigang.gong at linux.intel.com> wrote:
> 2. The double support is not fully supported. For example, all the math
>    functions and even the divide instruction is not supported.

You're right that the hardware doesn't natively do most of the math
operations on doubles (it even doesn't do floor/ceil/trunc!), but this
BSpec page [0] does describe using features new to Broadwell to get
IEEE-compliant fdiv and sqrt for both single-precision and
double-precision.

It uses the new INVM and RSQRTM math operations, the new MADM
instruction, and the additional accumulator registers.

It seems that INVM/RSQRTM always write the flag register (the
math.eo.f0 apparently means "early out", it only seems to be
documented in passing on that page) in order to skip some instructions
when not necessary.

[0] 3D-Media-GPGPU Engine > EU Overview > ISA Introduction >
Instruction Set Reference > EUISA Instructions > math – Extended Math
Function [SNB+]

From jeff.mcgee at intel.com  Tue Mar 10 11:34:58 2015
From: jeff.mcgee at intel.com (Jeff McGee)
Date: Tue, 10 Mar 2015 11:34:58 -0700
Subject: [Beignet] [Intel-gfx] [PATCH i-g-t 2/2] configure: Bump
 required libdrm version to 2.4.60
In-Reply-To: <CAF6AEGvzHihd72krmRbMR_dXHA3YdsmZyGu4+MSMvuu-52E4DQ@mail.gmail.com>
References: <1425943172-28040-1-git-send-email-jeff.mcgee@intel.com>
 <1425944462-28293-1-git-send-email-jeff.mcgee@intel.com>
 <20150310073730.GA3800@phenom.ffwll.local>
 <20150310165915.GG3263@jeffdesk>
 <CAF6AEGvzHihd72krmRbMR_dXHA3YdsmZyGu4+MSMvuu-52E4DQ@mail.gmail.com>
Message-ID: <20150310183458.GH3263@jeffdesk>

On Tue, Mar 10, 2015 at 01:58:52PM -0400, Rob Clark wrote:
> On Tue, Mar 10, 2015 at 12:59 PM, Jeff McGee <jeff.mcgee at intel.com> wrote:
> > On Tue, Mar 10, 2015 at 08:37:30AM +0100, Daniel Vetter wrote:
> >> On Mon, Mar 09, 2015 at 04:41:02PM -0700, jeff.mcgee at intel.com wrote:
> >> > From: Jeff McGee <jeff.mcgee at intel.com>
> >> >
> >> > tests/core_getparams needs the new libdrm interfaces for
> >> > querying subslice and EU counts.
> >> >
> >> > For: VIZ-4636
> >> > Signed-off-by: Jeff McGee <jeff.mcgee at intel.com>
> >> > ---
> >> >  configure.ac | 2 +-
> >> >  1 file changed, 1 insertion(+), 1 deletion(-)
> >> >
> >> > diff --git a/configure.ac b/configure.ac
> >> > index 16d6a2e..88a1c3d 100644
> >> > --- a/configure.ac
> >> > +++ b/configure.ac
> >> > @@ -82,7 +82,7 @@ if test "x$GCC" = "xyes"; then
> >> >  fi
> >> >  AC_SUBST(ASSEMBLER_WARN_CFLAGS)
> >> >
> >> > -PKG_CHECK_MODULES(DRM, [libdrm_intel >= 2.4.52 libdrm])
> >> > +PKG_CHECK_MODULES(DRM, [libdrm_intel >= 2.4.60 libdrm])
> >>
> >> Please don't and instead copypaste the new structs/defines with a local_
> >> prefix like we do it for all the other new igt testcases. Forcing libdrm
> >> to get updated for igt all the time can get annoying fast.
> >> -Daniel
> >>
> > In this case I'm trying to exercise new API functions in libdrm which
> > wrap the GETPARAM ioctl. Would you rather me bypass the wrapper to
> > avoid requiring updated libdrm? I can do that, but it fails to test the
> > complete path that client would use.
> 
> 
> Am I missing something, or does 2.4.60 not exist yet?
> 
> That said dependency bumps for igt seem like less of an issue than
> dependency bumps for mesa..  I mean if you are using igt you are
> probably on the latest anyways..  I'm not sure why Daniel is so
> concerned about that..
> 
> (but dependency bumps to something that doesn't exist yet should
> perhaps be avoided)
> 
> BR,
> -R
> 

Hi Rob. This igt change is contingent upon my libdrm changes which
would in fact bump that version to 2.4.60 after adding an API. That
change is also posted and waiting review. I guess I should have stated
that dependency here to begin with.

http://lists.freedesktop.org/archives/intel-gfx/2015-March/061101.html

Jeff
> 
> > -Jeff
> >
> >> >  PKG_CHECK_MODULES(PCIACCESS, [pciaccess >= 0.10])
> >> >  PKG_CHECK_MODULES(OVERLAY_XVLIB, [xv x11 xext dri2proto >= 2.6], enable_overlay_xvlib=yes, enable_overlay_xvlib=no)
> >> >  PKG_CHECK_MODULES(OVERLAY_XLIB, [cairo-xlib dri2proto >= 2.6], enable_overlay_xlib=yes, enable_overlay_xlib=no)
> >> > --
> >> > 2.3.0
> >> >
> >> > _______________________________________________
> >> > Intel-gfx mailing list
> >> > Intel-gfx at lists.freedesktop.org
> >> > http://lists.freedesktop.org/mailman/listinfo/intel-gfx
> >>
> >> --
> >> Daniel Vetter
> >> Software Engineer, Intel Corporation
> >> +41 (0) 79 365 57 48 - http://blog.ffwll.ch
> >> _______________________________________________
> >> Beignet mailing list
> >> Beignet at lists.freedesktop.org
> >> http://lists.freedesktop.org/mailman/listinfo/beignet
> > _______________________________________________
> > dri-devel mailing list
> > dri-devel at lists.freedesktop.org
> > http://lists.freedesktop.org/mailman/listinfo/dri-devel
> _______________________________________________
> dri-devel mailing list
> dri-devel at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/dri-devel

From daniel at ffwll.ch  Tue Mar 10 11:47:03 2015
From: daniel at ffwll.ch (Daniel Vetter)
Date: Tue, 10 Mar 2015 19:47:03 +0100
Subject: [Beignet] [Intel-gfx] [PATCH i-g-t 2/2] configure: Bump
 required libdrm version to 2.4.60
In-Reply-To: <CAF6AEGvzHihd72krmRbMR_dXHA3YdsmZyGu4+MSMvuu-52E4DQ@mail.gmail.com>
References: <1425943172-28040-1-git-send-email-jeff.mcgee@intel.com>
 <1425944462-28293-1-git-send-email-jeff.mcgee@intel.com>
 <20150310073730.GA3800@phenom.ffwll.local>
 <20150310165915.GG3263@jeffdesk>
 <CAF6AEGvzHihd72krmRbMR_dXHA3YdsmZyGu4+MSMvuu-52E4DQ@mail.gmail.com>
Message-ID: <20150310184703.GS3800@phenom.ffwll.local>

On Tue, Mar 10, 2015 at 01:58:52PM -0400, Rob Clark wrote:
> On Tue, Mar 10, 2015 at 12:59 PM, Jeff McGee <jeff.mcgee at intel.com> wrote:
> > On Tue, Mar 10, 2015 at 08:37:30AM +0100, Daniel Vetter wrote:
> >> On Mon, Mar 09, 2015 at 04:41:02PM -0700, jeff.mcgee at intel.com wrote:
> >> > From: Jeff McGee <jeff.mcgee at intel.com>
> >> >
> >> > tests/core_getparams needs the new libdrm interfaces for
> >> > querying subslice and EU counts.
> >> >
> >> > For: VIZ-4636
> >> > Signed-off-by: Jeff McGee <jeff.mcgee at intel.com>
> >> > ---
> >> >  configure.ac | 2 +-
> >> >  1 file changed, 1 insertion(+), 1 deletion(-)
> >> >
> >> > diff --git a/configure.ac b/configure.ac
> >> > index 16d6a2e..88a1c3d 100644
> >> > --- a/configure.ac
> >> > +++ b/configure.ac
> >> > @@ -82,7 +82,7 @@ if test "x$GCC" = "xyes"; then
> >> >  fi
> >> >  AC_SUBST(ASSEMBLER_WARN_CFLAGS)
> >> >
> >> > -PKG_CHECK_MODULES(DRM, [libdrm_intel >= 2.4.52 libdrm])
> >> > +PKG_CHECK_MODULES(DRM, [libdrm_intel >= 2.4.60 libdrm])
> >>
> >> Please don't and instead copypaste the new structs/defines with a local_
> >> prefix like we do it for all the other new igt testcases. Forcing libdrm
> >> to get updated for igt all the time can get annoying fast.
> >> -Daniel
> >>
> > In this case I'm trying to exercise new API functions in libdrm which
> > wrap the GETPARAM ioctl. Would you rather me bypass the wrapper to
> > avoid requiring updated libdrm? I can do that, but it fails to test the
> > complete path that client would use.
> 
> 
> Am I missing something, or does 2.4.60 not exist yet?
> 
> That said dependency bumps for igt seem like less of an issue than
> dependency bumps for mesa..  I mean if you are using igt you are
> probably on the latest anyways..  I'm not sure why Daniel is so
> concerned about that..
> 
> (but dependency bumps to something that doesn't exist yet should
> perhaps be avoided)

I'd like to avoid massive dependency loops for igt tests so that I can merge
the testcase right when the patches land in -nightly. Otherwise there's
always a small delay involved where regression can creep in. Also if I
have to update libdrm every time I update igt that's annoying since
without that I don't have to install/update anything at all - I run igt
in-place. And we've used the LOCAL_ prefixes for pretty much every abi
addition in igt tests thus far.
-Daniel
-- 
Daniel Vetter
Software Engineer, Intel Corporation
+41 (0) 79 365 57 48 - http://blog.ffwll.ch

From jeff.mcgee at intel.com  Tue Mar 10 13:06:44 2015
From: jeff.mcgee at intel.com (Jeff McGee)
Date: Tue, 10 Mar 2015 13:06:44 -0700
Subject: [Beignet] [Intel-gfx] [PATCH i-g-t 2/2] configure: Bump
 required libdrm version to 2.4.60
In-Reply-To: <20150310184703.GS3800@phenom.ffwll.local>
References: <1425943172-28040-1-git-send-email-jeff.mcgee@intel.com>
 <1425944462-28293-1-git-send-email-jeff.mcgee@intel.com>
 <20150310073730.GA3800@phenom.ffwll.local>
 <20150310165915.GG3263@jeffdesk>
 <CAF6AEGvzHihd72krmRbMR_dXHA3YdsmZyGu4+MSMvuu-52E4DQ@mail.gmail.com>
 <20150310184703.GS3800@phenom.ffwll.local>
Message-ID: <20150310200644.GI3263@jeffdesk>

On Tue, Mar 10, 2015 at 07:47:03PM +0100, Daniel Vetter wrote:
> On Tue, Mar 10, 2015 at 01:58:52PM -0400, Rob Clark wrote:
> > On Tue, Mar 10, 2015 at 12:59 PM, Jeff McGee <jeff.mcgee at intel.com> wrote:
> > > On Tue, Mar 10, 2015 at 08:37:30AM +0100, Daniel Vetter wrote:
> > >> On Mon, Mar 09, 2015 at 04:41:02PM -0700, jeff.mcgee at intel.com wrote:
> > >> > From: Jeff McGee <jeff.mcgee at intel.com>
> > >> >
> > >> > tests/core_getparams needs the new libdrm interfaces for
> > >> > querying subslice and EU counts.
> > >> >
> > >> > For: VIZ-4636
> > >> > Signed-off-by: Jeff McGee <jeff.mcgee at intel.com>
> > >> > ---
> > >> >  configure.ac | 2 +-
> > >> >  1 file changed, 1 insertion(+), 1 deletion(-)
> > >> >
> > >> > diff --git a/configure.ac b/configure.ac
> > >> > index 16d6a2e..88a1c3d 100644
> > >> > --- a/configure.ac
> > >> > +++ b/configure.ac
> > >> > @@ -82,7 +82,7 @@ if test "x$GCC" = "xyes"; then
> > >> >  fi
> > >> >  AC_SUBST(ASSEMBLER_WARN_CFLAGS)
> > >> >
> > >> > -PKG_CHECK_MODULES(DRM, [libdrm_intel >= 2.4.52 libdrm])
> > >> > +PKG_CHECK_MODULES(DRM, [libdrm_intel >= 2.4.60 libdrm])
> > >>
> > >> Please don't and instead copypaste the new structs/defines with a local_
> > >> prefix like we do it for all the other new igt testcases. Forcing libdrm
> > >> to get updated for igt all the time can get annoying fast.
> > >> -Daniel
> > >>
> > > In this case I'm trying to exercise new API functions in libdrm which
> > > wrap the GETPARAM ioctl. Would you rather me bypass the wrapper to
> > > avoid requiring updated libdrm? I can do that, but it fails to test the
> > > complete path that client would use.
> > 
> > 
> > Am I missing something, or does 2.4.60 not exist yet?
> > 
> > That said dependency bumps for igt seem like less of an issue than
> > dependency bumps for mesa..  I mean if you are using igt you are
> > probably on the latest anyways..  I'm not sure why Daniel is so
> > concerned about that..
> > 
> > (but dependency bumps to something that doesn't exist yet should
> > perhaps be avoided)
> 
> I'd like to avoid massive depency loops for igt tests so that I can merge
> the testcase right when the patches land in -nightly. Otherwise there's
> always a small delay involved where regression can creep in. Also if I
> have to update libdrm every time I update igt that's annoying since
> without that I don't have to install/update anything at all - I run igt
> in-place. And we've used the LOCAL_ prefixes for pretty much every abi
> addition in igt tests thus far.
> -Daniel

I understand that and it certainly makes sense when libdrm is only
providing defines or structs. But as I said, in this case there is
code in libdrm (the wrapper) that we could test as part of the
complete path. Are you recommending that I implement duplicate
wrapper functions in igt with the local prefix?
-Jeff

From ruiling.song at intel.com  Tue Mar 10 18:55:28 2015
From: ruiling.song at intel.com (Song, Ruiling)
Date: Wed, 11 Mar 2015 01:55:28 +0000
Subject: [Beignet] [PATCH 6/7] replace mad with llvm intrinsic.
In-Reply-To: <CAEdQ38GTpesq8qGadXkF9c6rdimNtyEnbxT2LNU3-7XjYrToXg@mail.gmail.com>
References: <1425967188-22075-1-git-send-email-xionghu.luo@intel.com>
 <1425967188-22075-6-git-send-email-xionghu.luo@intel.com>
 <CAEdQ38GTpesq8qGadXkF9c6rdimNtyEnbxT2LNU3-7XjYrToXg@mail.gmail.com>
Message-ID: <148B1B7A67D1C24B9EF0BE42EA4977062B7EA6BC@SHSMSX101.ccr.corp.intel.com>


> -----Original Message-----
> From: Beignet [mailto:beignet-bounces at lists.freedesktop.org] On Behalf Of
> Matt Turner
> Sent: Wednesday, March 11, 2015 1:49 AM
> To: Luo, Xionghu
> Cc: beignet at lists.freedesktop.org
> Subject: Re: [Beignet] [PATCH 6/7] replace mad with llvm intrinsic.
> 
> On Mon, Mar 9, 2015 at 10:59 PM,  <xionghu.luo at intel.com> wrote:
> > From: Luo Xionghu <xionghu.luo at intel.com>
> >
> > translate native mad to llvm.fma.
> 
> I'm not sure that it matters for this patch, but do we know if Gen's MAD
> instruction is a fused-multiply-add? That is, does it not do an intermediate
> rounding step after the multiply?
I also have such kind of concern, so I did a simple test:
on cpu side, I use "reference = (double)x1*(double)x2 + (double)x3;"
And on gpu side, I use "result = mad(x1, x2, x3);"
Then compare the result and reference, the bits are exactly the same, so I think gen's MAD does not do intermediate rounding after multiply.
We can use Gen's MAD as fma:)
> _______________________________________________
> Beignet mailing list
> Beignet at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/beignet

From mattst88 at gmail.com  Tue Mar 10 19:19:45 2015
From: mattst88 at gmail.com (Matt Turner)
Date: Tue, 10 Mar 2015 19:19:45 -0700
Subject: [Beignet] [PATCH 6/7] replace mad with llvm intrinsic.
In-Reply-To: <148B1B7A67D1C24B9EF0BE42EA4977062B7EA6BC@SHSMSX101.ccr.corp.intel.com>
References: <1425967188-22075-1-git-send-email-xionghu.luo@intel.com>
 <1425967188-22075-6-git-send-email-xionghu.luo@intel.com>
 <CAEdQ38GTpesq8qGadXkF9c6rdimNtyEnbxT2LNU3-7XjYrToXg@mail.gmail.com>
 <148B1B7A67D1C24B9EF0BE42EA4977062B7EA6BC@SHSMSX101.ccr.corp.intel.com>
Message-ID: <CAEdQ38GD8hYpJXMnfnvnKT+utOhDpN3D5d7JApzSyGT9QUPLbQ@mail.gmail.com>

On Tue, Mar 10, 2015 at 6:55 PM, Song, Ruiling <ruiling.song at intel.com> wrote:
>> I'm not sure that it matters for this patch, but do we know if Gen's MAD
>> instruction is a fused-multiply-add? That is, does it not do an intermediate
>> rounding step after the multiply?
> I also have such kind of concern, so I did a simple test:
> on cpu side, I use "reference = (double)x1*(double)x2 + (double)x3;"

Some recent CPUs have FMA instructions. You should make sure you know
whether your code is compiled using FMA or not.

> And on gpu side, I use "result = mad(x1, x2, x3);"
> Then compare the result and reference, the bits are exactly the same, so I think gen's MAD does not do intermediate rounding after multiply.

The intermediate rounding step will not affect many pairs of numbers
that are multiplied together. You need to make sure you're testing a
pair of numbers that are affected by the intermediate rounding step.

I wrote a small program to find cases where fmaf(x, y, z) != x*y+z
(attached). Compile with -std=c99 -O2 -march=native -lm. I'm testing
on Haswell which has FMA.

It shows that

fmaf(1, 0.333333, 0.666667) is 1 (0x1.000002p+0), but 1 * 0.333333 +
0.666667 is 1 (0x1p+0)

Please test that Gen's MAD instruction produces what fmaf() produces
for 1.0 * 0.333333 + 0.666667.

Assuming glibc's fmaf() is correct... I'm again surprised by
floating-point numbers. :)
-------------- next part --------------
A non-text attachment was scrubbed...
Name: fma.c
Type: text/x-csrc
Size: 493 bytes
Desc: not available
URL: <http://lists.freedesktop.org/archives/beignet/attachments/20150310/593e9593/attachment.c>

From ruiling.song at intel.com  Tue Mar 10 20:04:48 2015
From: ruiling.song at intel.com (Song, Ruiling)
Date: Wed, 11 Mar 2015 03:04:48 +0000
Subject: [Beignet] [PATCH 6/7] replace mad with llvm intrinsic.
In-Reply-To: <CAEdQ38GD8hYpJXMnfnvnKT+utOhDpN3D5d7JApzSyGT9QUPLbQ@mail.gmail.com>
References: <1425967188-22075-1-git-send-email-xionghu.luo@intel.com>
 <1425967188-22075-6-git-send-email-xionghu.luo@intel.com>
 <CAEdQ38GTpesq8qGadXkF9c6rdimNtyEnbxT2LNU3-7XjYrToXg@mail.gmail.com>
 <148B1B7A67D1C24B9EF0BE42EA4977062B7EA6BC@SHSMSX101.ccr.corp.intel.com>
 <CAEdQ38GD8hYpJXMnfnvnKT+utOhDpN3D5d7JApzSyGT9QUPLbQ@mail.gmail.com>
Message-ID: <148B1B7A67D1C24B9EF0BE42EA4977062B7EB738@SHSMSX101.ccr.corp.intel.com>


> -----Original Message-----
> From: Matt Turner [mailto:mattst88 at gmail.com]
> Sent: Wednesday, March 11, 2015 10:20 AM
> To: Song, Ruiling
> Cc: Luo, Xionghu; beignet at lists.freedesktop.org
> Subject: Re: [Beignet] [PATCH 6/7] replace mad with llvm intrinsic.
> 
> On Tue, Mar 10, 2015 at 6:55 PM, Song, Ruiling <ruiling.song at intel.com>
> wrote:
> >> I'm not sure that it matters for this patch, but do we know if Gen's
> >> MAD instruction is a fused-multiply-add? That is, does it not do an
> >> intermediate rounding step after the multiply?
> > I also have such kind of concern, so I did a simple test:
> > on cpu side, I use "reference = (double)x1*(double)x2 + (double)x3;"
> 
> Some recent CPUs have FMA instructions. You should make sure you know
> whether your code is compiled using FMA or not.
> 
> > And on gpu side, I use "result = mad(x1, x2, x3);"
> > Then compare the result and reference, the bits are exactly the same, so I
> think gen's MAD does not do intermediate rounding after multiply.
> 
> The intermediate rounding step will not affect many pairs of numbers that
> are multiplied together. You need to make sure you're testing a pair of
> numbers that are affected by the intermediate rounding step.
> 
> I wrote a small program to find cases where fmaf(x, y, z) != x*y+z (attached).
> Compile with -std=c99 -O2 -march=native -lm. I'm testing on Haswell which
> has FMA.
> 
> It shows that
> 
> fmaf(1, 0.333333, 0.666667) is 1 (0x1.000002p+0), but 1 * 0.333333 +
> 0.666667 is 1 (0x1p+0)
> 
> Please test that Gen's MAD instruction produces what fmaf() produces for
> 1.0 * 0.333333 + 0.666667.
I tried these numbers. The binary representation of 0.333333 is 0x1.55553ep-2.
The binary representation of 0.666667 is 0x1.5555p-1.
I summed them manually. The mantissa comes out as 24 one bits (not counting the hidden bit). As single-precision floating point only has a 23-bit mantissa,
I don't know how it is rounded here; if it rounds up, the result would be 0x1p0. I need to check the IEEE 754 spec.
But it cannot generate 0x1.000002p+0.
I think you'd better not print with %g, as %g does not show the exact binary representation. I always prefer the %a representation.
> 
> Assuming glibc's fmaf() is correct... I'm again surprised by floating-point
> numbers. :)

From ruiling.song at intel.com  Tue Mar 10 20:07:37 2015
From: ruiling.song at intel.com (Song, Ruiling)
Date: Wed, 11 Mar 2015 03:07:37 +0000
Subject: [Beignet] [PATCH 6/7] replace mad with llvm intrinsic.
In-Reply-To: <148B1B7A67D1C24B9EF0BE42EA4977062B7EB738@SHSMSX101.ccr.corp.intel.com>
References: <1425967188-22075-1-git-send-email-xionghu.luo@intel.com>
 <1425967188-22075-6-git-send-email-xionghu.luo@intel.com>
 <CAEdQ38GTpesq8qGadXkF9c6rdimNtyEnbxT2LNU3-7XjYrToXg@mail.gmail.com>
 <148B1B7A67D1C24B9EF0BE42EA4977062B7EA6BC@SHSMSX101.ccr.corp.intel.com>
 <CAEdQ38GD8hYpJXMnfnvnKT+utOhDpN3D5d7JApzSyGT9QUPLbQ@mail.gmail.com>
 <148B1B7A67D1C24B9EF0BE42EA4977062B7EB738@SHSMSX101.ccr.corp.intel.com>
Message-ID: <148B1B7A67D1C24B9EF0BE42EA4977062B7EB74F@SHSMSX101.ccr.corp.intel.com>


> -----Original Message-----
> From: Beignet [mailto:beignet-bounces at lists.freedesktop.org] On Behalf Of
> Song, Ruiling
> Sent: Wednesday, March 11, 2015 11:05 AM
> To: Matt Turner
> Cc: Luo, Xionghu; beignet at lists.freedesktop.org
> Subject: Re: [Beignet] [PATCH 6/7] replace mad with llvm intrinsic.
> 
> 
> 
> > -----Original Message-----
> > From: Matt Turner [mailto:mattst88 at gmail.com]
> > Sent: Wednesday, March 11, 2015 10:20 AM
> > To: Song, Ruiling
> > Cc: Luo, Xionghu; beignet at lists.freedesktop.org
> > Subject: Re: [Beignet] [PATCH 6/7] replace mad with llvm intrinsic.
> >
> > On Tue, Mar 10, 2015 at 6:55 PM, Song, Ruiling
> > <ruiling.song at intel.com>
> > wrote:
> > >> I'm not sure that it matters for this patch, but do we know if
> > >> Gen's MAD instruction is a fused-multiply-add? That is, does it not
> > >> do an intermediate rounding step after the multiply?
> > > I also have such kind of concern, so I did a simple test:
> > > on cpu side, I use "reference = (double)x1*(double)x2 + (double)x3;"
> >
> > Some recent CPUs have FMA instructions. You should make sure you know
> > whether your code is compiled using FMA or not.
> >
> > > And on gpu side, I use "result = mad(x1, x2, x3);"
> > > Then compare the result and reference, the bits are exactly the
> > > same, so I
> > think gen's MAD does not do intermediate rounding after multiply.
> >
> > The intermediate rounding step will not affect many pairs of numbers
> > that are multiplied together. You need to make sure you're testing a
> > pair of numbers that are affected by the intermediate rounding step.
> >
> > I wrote a small program to find cases where fmaf(x, y, z) != x*y+z
> (attached).
> > Compile with -std=c99 -O2 -march=native -lm. I'm testing on Haswell
> > which has FMA.
> >
> > It shows that
> >
> > fmaf(1, 0.333333, 0.666667) is 1 (0x1.000002p+0), but 1 * 0.333333 +
> > 0.666667 is 1 (0x1p+0)
> >
> > Please test that Gen's MAD instruction produces what fmaf() produces
> > for
> > 1.0 * 0.333333 + 0.666667.
> I tried these number, the binary representation of 0.333333 is 0x1.55553ep-2
> The binary representation of 0.666667 is 0x1.5555p-1 I manually sum it up.

Sorry, there is a typo here; it should be "binary representation of 0.666667 is 0x1.55556p-1".

> The mantissa bits is 24 bits ones (here not counting in the hidden one). As
> floating point only has 23 bits mantissa, I don't know how to round it here, if
> select to round up, the result would be 0x1p0. I need to check IEEE754 spec.
> But it cannot generate 0x1.000002p+0.
> I think you'd better not output using %g, using %g would not show its exact
> binary representation. I always like %a representation.
> >
> > Assuming glibc's fmaf() is correct... I'm again surprised by
> > floating-point numbers. :)
> _______________________________________________
> Beignet mailing list
> Beignet at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/beignet

From ruiling.song at intel.com  Tue Mar 10 20:38:14 2015
From: ruiling.song at intel.com (Song, Ruiling)
Date: Wed, 11 Mar 2015 03:38:14 +0000
Subject: [Beignet] [PATCH 6/7] replace mad with llvm intrinsic.
In-Reply-To: <CAEdQ38GD8hYpJXMnfnvnKT+utOhDpN3D5d7JApzSyGT9QUPLbQ@mail.gmail.com>
References: <1425967188-22075-1-git-send-email-xionghu.luo@intel.com>
 <1425967188-22075-6-git-send-email-xionghu.luo@intel.com>
 <CAEdQ38GTpesq8qGadXkF9c6rdimNtyEnbxT2LNU3-7XjYrToXg@mail.gmail.com>
 <148B1B7A67D1C24B9EF0BE42EA4977062B7EA6BC@SHSMSX101.ccr.corp.intel.com>
 <CAEdQ38GD8hYpJXMnfnvnKT+utOhDpN3D5d7JApzSyGT9QUPLbQ@mail.gmail.com>
Message-ID: <148B1B7A67D1C24B9EF0BE42EA4977062B7EB788@SHSMSX101.ccr.corp.intel.com>


> -----Original Message-----
> From: Matt Turner [mailto:mattst88 at gmail.com]
> Sent: Wednesday, March 11, 2015 10:20 AM
> To: Song, Ruiling
> Cc: Luo, Xionghu; beignet at lists.freedesktop.org
> Subject: Re: [Beignet] [PATCH 6/7] replace mad with llvm intrinsic.
> 
> On Tue, Mar 10, 2015 at 6:55 PM, Song, Ruiling <ruiling.song at intel.com>
> wrote:
> >> I'm not sure that it matters for this patch, but do we know if Gen's
> >> MAD instruction is a fused-multiply-add? That is, does it not do an
> >> intermediate rounding step after the multiply?
> > I also have such kind of concern, so I did a simple test:
> > on cpu side, I use "reference = (double)x1*(double)x2 + (double)x3;"
> 
> Some recent CPUs have FMA instructions. You should make sure you know
> whether your code is compiled using FMA or not.
> 
> > And on gpu side, I use "result = mad(x1, x2, x3);"
> > Then compare the result and reference, the bits are exactly the same, so I
> think gen's MAD does not do intermediate rounding after multiply.
> 
> The intermediate rounding step will not affect many pairs of numbers that
> are multiplied together. You need to make sure you're testing a pair of
> numbers that are affected by the intermediate rounding step.
> 
> I wrote a small program to find cases where fmaf(x, y, z) != x*y+z (attached).
> Compile with -std=c99 -O2 -march=native -lm. I'm testing on Haswell which
> has FMA.
> 
> It shows that
> 
> fmaf(1, 0.333333, 0.666667) is 1 (0x1.000002p+0), but 1 * 0.333333 +
> 0.666667 is 1 (0x1p+0)
> 
> Please test that Gen's MAD instruction produces what fmaf() produces for
> 1.0 * 0.333333 + 0.666667.
> 
> Assuming glibc's fmaf() is correct... I'm again surprised by floating-point
> numbers. :)

My gcc doesn't have definitions for nextafterf and fmaf, so I used g++ to build on my ivb machine.
g++  -O2 -march=native -lm -o fma fma.c
Its output (I changed the printf to use "%a"):
fmaf(0x1.000002p+0, 0x1.555556p-2, 0x1.555556p-1) is 1 (0x1.000002p+0), but 0x1.000002p+0 * 0x1.555556p-2 + 0x1.555556p-1 is 1 (0x1p+0)
I also tried using Gen's MAD, and its result is the same as fmaf's. You can try it on your Haswell machine; I think the result would be the same.

From ruiling.song at intel.com  Tue Mar 10 23:39:24 2015
From: ruiling.song at intel.com (Ruiling Song)
Date: Wed, 11 Mar 2015 14:39:24 +0800
Subject: [Beignet] [PATCH] GBE: Only emit multiply when immediate is not one.
Message-ID: <1426055964-8851-1-git-send-email-ruiling.song@intel.com>

As constant propagation will introduce constantExpr and gep instruction,
I choose not to run constant propagation pass after RemoveGep pass.
So, here only generate Multiply as needed.
We may do such kind of optimization in Gen IR level in the future.

This could fix the performance regression introduced by:
"GBE: Import constantexpr lower pass from pNaCl"
to the opencv case:
opencv_perf_imgproc/OCL_BilateralFixture_Bilateral

Signed-off-by: Ruiling Song <ruiling.song at intel.com>
---
 backend/src/llvm/llvm_passes.cpp |    8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/backend/src/llvm/llvm_passes.cpp b/backend/src/llvm/llvm_passes.cpp
index 1b40389..223f61b 100644
--- a/backend/src/llvm/llvm_passes.cpp
+++ b/backend/src/llvm/llvm_passes.cpp
@@ -346,9 +346,11 @@ namespace gbe
           }
         }
 
-        BinaryOperator* tmpMul = 
-          BinaryOperator::Create(Instruction::Mul, newConstSize, operand,
-              "", GEPInst);
+        Value* tmpMul = operand;
+        if (size != 1) {
+          tmpMul = BinaryOperator::Create(Instruction::Mul, newConstSize, operand,
+                                         "", GEPInst);
+        }
         currentAddrInst = 
           BinaryOperator::Create(Instruction::Add, currentAddrInst, tmpMul,
               "", GEPInst);
-- 
1.7.10.4


From zhenyuw at linux.intel.com  Tue Mar 10 23:41:25 2015
From: zhenyuw at linux.intel.com (Zhenyu Wang)
Date: Wed, 11 Mar 2015 14:41:25 +0800
Subject: [Beignet] [Intel-gfx] [PATCH] drm/i915: Add soft-pinning API
	for execbuffer
In-Reply-To: <1425635047-25214-1-git-send-email-chris@chris-wilson.co.uk>
References: <1425635047-25214-1-git-send-email-chris@chris-wilson.co.uk>
Message-ID: <20150311064125.GC1872@zhen-hp.sh.intel.com>

On 2015.03.06 09:44:07 +0000, Chris Wilson wrote:
> Userspace can pass in an offset that it presumes the object is located
> at. The kernel will then do its utmost to fit the object into that
> location. The assumption is that userspace is handling its own object
> locations (for example along with full-ppgtt) and that the kernel will
> rarely have to make space for the user's requests.
> 

Chris, would you add libdrm support for this? E.g. beignet doesn't
handle exec objects itself but uses libdrm.

-- 
Open Source Technology Center, Intel ltd.

$gpg --keyserver wwwkeys.pgp.net --recv-keys 4D781827
-------------- next part --------------
A non-text attachment was scrubbed...
Name: signature.asc
Type: application/pgp-signature
Size: 181 bytes
Desc: Digital signature
URL: <http://lists.freedesktop.org/archives/beignet/attachments/20150311/38f389c4/attachment.sig>

From daniel at ffwll.ch  Wed Mar 11 00:21:36 2015
From: daniel at ffwll.ch (Daniel Vetter)
Date: Wed, 11 Mar 2015 08:21:36 +0100
Subject: [Beignet] [Intel-gfx] [PATCH i-g-t 2/2] configure: Bump
 required libdrm version to 2.4.60
In-Reply-To: <20150310200644.GI3263@jeffdesk>
References: <1425943172-28040-1-git-send-email-jeff.mcgee@intel.com>
 <1425944462-28293-1-git-send-email-jeff.mcgee@intel.com>
 <20150310073730.GA3800@phenom.ffwll.local>
 <20150310165915.GG3263@jeffdesk>
 <CAF6AEGvzHihd72krmRbMR_dXHA3YdsmZyGu4+MSMvuu-52E4DQ@mail.gmail.com>
 <20150310184703.GS3800@phenom.ffwll.local>
 <20150310200644.GI3263@jeffdesk>
Message-ID: <20150311072136.GW3800@phenom.ffwll.local>

On Tue, Mar 10, 2015 at 01:06:44PM -0700, Jeff McGee wrote:
> On Tue, Mar 10, 2015 at 07:47:03PM +0100, Daniel Vetter wrote:
> > On Tue, Mar 10, 2015 at 01:58:52PM -0400, Rob Clark wrote:
> > > On Tue, Mar 10, 2015 at 12:59 PM, Jeff McGee <jeff.mcgee at intel.com> wrote:
> > > > On Tue, Mar 10, 2015 at 08:37:30AM +0100, Daniel Vetter wrote:
> > > >> On Mon, Mar 09, 2015 at 04:41:02PM -0700, jeff.mcgee at intel.com wrote:
> > > >> > From: Jeff McGee <jeff.mcgee at intel.com>
> > > >> >
> > > >> > tests/core_getparams needs the new libdrm interfaces for
> > > >> > querying subslice and EU counts.
> > > >> >
> > > >> > For: VIZ-4636
> > > >> > Signed-off-by: Jeff McGee <jeff.mcgee at intel.com>
> > > >> > ---
> > > >> >  configure.ac | 2 +-
> > > >> >  1 file changed, 1 insertion(+), 1 deletion(-)
> > > >> >
> > > >> > diff --git a/configure.ac b/configure.ac
> > > >> > index 16d6a2e..88a1c3d 100644
> > > >> > --- a/configure.ac
> > > >> > +++ b/configure.ac
> > > >> > @@ -82,7 +82,7 @@ if test "x$GCC" = "xyes"; then
> > > >> >  fi
> > > >> >  AC_SUBST(ASSEMBLER_WARN_CFLAGS)
> > > >> >
> > > >> > -PKG_CHECK_MODULES(DRM, [libdrm_intel >= 2.4.52 libdrm])
> > > >> > +PKG_CHECK_MODULES(DRM, [libdrm_intel >= 2.4.60 libdrm])
> > > >>
> > > >> Please don't and instead copypaste the new structs/defines with a local_
> > > >> prefix like we do it for all the other new igt testcases. Forcing libdrm
> > > >> to get updated for igt all the time can get annoying fast.
> > > >> -Daniel
> > > >>
> > > > In this case I'm trying to exercise new API functions in libdrm which
> > > > wrap the GETPARAM ioctl. Would you rather me bypass the wrapper to
> > > > avoid requiring updated libdrm? I can do that, but it fails to test the
> > > > complete path that client would use.
> > > 
> > > 
> > > Am I missing something, or does 2.4.60 not exist yet?
> > > 
> > > That said dependency bumps for igt seem like less of an issue than
> > > dependency bumps for mesa..  I mean if you are using igt you are
> > > probably on the latest anyways..  I'm not sure why Daniel is so
> > > concerned about that..
> > > 
> > > (but dependency bumps to something that doesn't exist yet should
> > > perhaps be avoided)
> > 
> > I'd like to avoid massive dependency loops for igt tests so that I can merge
> > the testcase right when the patches land in -nightly. Otherwise there's
> > always a small delay involved where regression can creep in. Also if I
> > have to update libdrm every time I update igt that's annoying since
> > without that I don't have to install/update anything at all - I run igt
> > in-place. And we've used the LOCAL_ prefixes for pretty much every abi
> > addition in igt tests thus far.
> > -Daniel
> 
> I understand that and it certainly makes sense when libdrm is only
> providing defines or structs. But as I said, in this case there is
> code in libdrm (the wrapper) that we could test as part of the
> complete path. Are you recommending that I implement duplicate
> wrapper functions in igt with the local prefix?

Sorry, I totally didn't realize that. Generally we don't have a lot of igt
testcases for libdrm really; imo it's usually simpler to just add the
interface to each part. Since this is such a simple one there's no need to
have a low-level test and the libdrm test on top, but that's what I'd do
if there's something worth testing in libdrm. Because for complex
functionality I really want to get the bare-metal tests in together with
the kernel part. Stalling for libdrm release could take longer.

And yes, personally I'd just have open-coded the getparam call here in
igt, but that's just a bikeshed.
-Daniel
-- 
Daniel Vetter
Software Engineer, Intel Corporation
+41 (0) 79 365 57 48 - http://blog.ffwll.ch

From zhigang.gong at linux.intel.com  Tue Mar 10 23:35:35 2015
From: zhigang.gong at linux.intel.com (Zhigang Gong)
Date: Wed, 11 Mar 2015 14:35:35 +0800
Subject: [Beignet] double precision support
In-Reply-To: <CAEdQ38HRVqZ4V3ik9sxvvJnM8wxj0us2S2gV8m=2aiESE-9vpA@mail.gmail.com>
References: <54F86D69.9080209@gmail.com> <20150310091932.GF20578@ivb-gt2-rev4>
 <CAEdQ38HRVqZ4V3ik9sxvvJnM8wxj0us2S2gV8m=2aiESE-9vpA@mail.gmail.com>
Message-ID: <20150311063534.GG20578@ivb-gt2-rev4>

On Tue, Mar 10, 2015 at 10:59:05AM -0700, Matt Turner wrote:
> On Tue, Mar 10, 2015 at 2:19 AM, Zhigang Gong
> <zhigang.gong at linux.intel.com> wrote:
> > 2. The double support is not fully supported. For example, all the math
> >    functions and even the divide instruction is not supported.
> 
> You're right that the hardware doesn't natively do most of the math
> operations on doubles (it even doesn't do floor/ceil/trunc!), but this
> BSpec page [0] does describe using features new to Broadwell to get
> IEEE-compliant fdiv and sqrt for both single-precision and
> double-precision.
> 
> It uses the new INVM and RSQRTM math operations, the new MADM
> instruction, and the additional accumulator registers.
> 
> It seems that INVM/RSQRTM always write the flag register (the
> math.eo.f0 apparently means "early out", it only seems to be
> documented in passing on that page) in order to skip some instructions
> when not necessary.
> 
> [0] 3D-Media-GPGPU Engine > EU Overview > ISA Introduction >
> Instruction Set Reference > EUISA Instructions > math – Extended Math
> Function [SNB+]

Thanks for pointing this out. We confirmed that implementing double
precision support on BDW+ is feasible. We will consider implementing
it after we get most of the OpenCL 2.0 features done.

Thanks,
Zhigang Gong. 

From zhigang.gong at linux.intel.com  Wed Mar 11 19:08:54 2015
From: zhigang.gong at linux.intel.com (Zhigang Gong)
Date: Thu, 12 Mar 2015 10:08:54 +0800
Subject: [Beignet] [PATCH 2/2 v2] Query the driver directly for
	compute	units and subslice
In-Reply-To: <1425944158-28223-1-git-send-email-jeff.mcgee@intel.com>
References: <1425339759-19027-2-git-send-email-jeff.mcgee@intel.com>
 <1425944158-28223-1-git-send-email-jeff.mcgee@intel.com>
Message-ID: <000501d05c69$7eb113c0$7c133b40$@linux.intel.com>

LGTM,

Reviewed-by: Zhigang Gong <zhigang.gong at linux.intel.com>

Thanks.

> -----Original Message-----
> From: Beignet [mailto:beignet-bounces at lists.freedesktop.org] On Behalf Of
> jeff.mcgee at intel.com
> Sent: Tuesday, March 10, 2015 7:36 AM
> To: beignet at lists.freedesktop.org
> Cc: intel-gfx at lists.freedesktop.org; dri-devel at lists.freedesktop.org
> Subject: [Beignet] [PATCH 2/2 v2] Query the driver directly for compute units
> and subslice
> 
> From: Jeff McGee <jeff.mcgee at intel.com>
> 
> Values of device max compute units and max subslice obtained directly from
> the driver should be more accurate than our own ID-based lookup values. This
> is particularly important when a single device ID may encompass more than
> one configuration. If the driver cannot provide a valid value for the given device,
> we fallback on the ID-based lookup value.
> 
> This query requires libdrm 2.4.60. For now we will consider the use of this query
> to be optional and exclude it from compilation when building against older
> libdrm. Later we may want to consider requiring the query or at least warning
> more strongly when it is not supported.
> 
> v2: Make feature use conditional on libdrm version (Zhigang).
> 
> Signed-off-by: Jeff McGee <jeff.mcgee at intel.com>
> ---
>  CMakeLists.txt           |  9 +++++++++
>  src/CMakeLists.txt       | 10 ++++++++++
>  src/intel/intel_driver.c | 25 +++++++++++++++++++++----
>  3 files changed, 40 insertions(+), 4 deletions(-)
> 
> diff --git a/CMakeLists.txt b/CMakeLists.txt index 65f2c70..bb03566 100644
> --- a/CMakeLists.txt
> +++ b/CMakeLists.txt
> @@ -131,6 +131,15 @@ IF(DRM_INTEL_FOUND)
>    ELSE(DRM_INTEL_VERSION VERSION_GREATER 2.4.57)
>      MESSAGE(STATUS "Disable userptr support")
>    ENDIF(DRM_INTEL_VERSION VERSION_GREATER 2.4.57)
> +  IF(DRM_INTEL_VERSION VERSION_GREATER 2.4.59)
> +    MESSAGE(STATUS "Enable EU total query support")
> +    SET(DRM_INTEL_EU_TOTAL "enable")
> +    MESSAGE(STATUS "Enable subslice total query support")
> +    SET(DRM_INTEL_SUBSLICE_TOTAL "enable")
> ELSE(DRM_INTEL_VERSION
> + VERSION_GREATER 2.4.59)
> +    MESSAGE(STATUS "Disable EU total query support")
> +    MESSAGE(STATUS "Disable subslice total query support")
> + ENDIF(DRM_INTEL_VERSION VERSION_GREATER 2.4.59)
>  ELSE(DRM_INTEL_FOUND)
>    MESSAGE(FATAL_ERROR "Looking for DRM Intel (>= 2.4.52) - not found")
>  ENDIF(DRM_INTEL_FOUND)
> diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index d4181d8..464765f
> 100644
> --- a/src/CMakeLists.txt
> +++ b/src/CMakeLists.txt
> @@ -118,6 +118,16 @@ SET(CMAKE_CXX_FLAGS "-DHAS_USERPTR
> ${CMAKE_CXX_FLAGS}")  SET(CMAKE_C_FLAGS "-DHAS_USERPTR
> ${CMAKE_C_FLAGS}")  endif (DRM_INTEL_USERPTR)
> 
> +if (DRM_INTEL_EU_TOTAL)
> +SET(CMAKE_CXX_FLAGS "-DHAS_EU_TOTAL ${CMAKE_CXX_FLAGS}")
> +SET(CMAKE_C_FLAGS "-DHAS_EU_TOTAL ${CMAKE_C_FLAGS}") endif
> +(DRM_INTEL_EU_TOTAL)
> +
> +if (DRM_INTEL_SUBSLICE_TOTAL)
> +SET(CMAKE_CXX_FLAGS "-DHAS_SUBSLICE_TOTAL ${CMAKE_CXX_FLAGS}")
> +SET(CMAKE_C_FLAGS "-DHAS_SUBSLICE_TOTAL ${CMAKE_C_FLAGS}") endif
> +(DRM_INTEL_SUBSLICE_TOTAL)
> +
>  set(GIT_SHA1 "git_sha1.h")
>  add_custom_target(${GIT_SHA1} ALL
>    COMMAND chmod +x ${CMAKE_CURRENT_SOURCE_DIR}/git_sha1.sh
> diff --git a/src/intel/intel_driver.c b/src/intel/intel_driver.c index
> d61988c..755ab6b 100644
> --- a/src/intel/intel_driver.c
> +++ b/src/intel/intel_driver.c
> @@ -757,10 +757,7 @@ static int intel_buffer_set_tiling(cl_buffer bo,  static
> void  intel_update_device_info(cl_device_id device)  { -#ifdef HAS_USERPTR
>    intel_driver_t *driver;
> -  const size_t sz = 4096;
> -  void *host_ptr;
> 
>    driver = intel_driver_new();
>    assert(driver != NULL);
> @@ -769,6 +766,10 @@ intel_update_device_info(cl_device_id device)
>      return;
>    }
> 
> +#ifdef HAS_USERPTR
> +  const size_t sz = 4096;
> +  void *host_ptr;
> +
>    host_ptr = cl_aligned_malloc(sz, 4096);
>    if (host_ptr != NULL) {
>      cl_buffer bo = intel_buffer_alloc_userptr((cl_buffer_mgr)driver->bufmgr,
> @@ -781,12 +782,28 @@ intel_update_device_info(cl_device_id device)
>    }
>    else
>      device->host_unified_memory = CL_FALSE;
> +#endif
> +
> +#ifdef HAS_EU_TOTAL
> +  unsigned int eu_total;
> +
> +  /* Prefer driver-queried max compute units if supported */
> +  if (!drm_intel_get_eu_total(driver->fd, &eu_total))
> +    device->max_compute_unit = eu_total; #endif
> +
> +#ifdef HAS_SUBSLICE_TOTAL
> +  unsigned int subslice_total;
> +
> +  /* Prefer driver-queried subslice count if supported */
> +  if (!drm_intel_get_subslice_total(driver->fd, &subslice_total))
> +    device->sub_slice_count = subslice_total; #endif
> 
>    intel_driver_context_destroy(driver);
>    intel_driver_close(driver);
>    intel_driver_terminate(driver);
>    intel_driver_delete(driver);
> -#endif
>  }
> 
>  LOCAL void
> --
> 2.3.0
> 
> _______________________________________________
> Beignet mailing list
> Beignet at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/beignet


From robdclark at gmail.com  Tue Mar 10 10:58:52 2015
From: robdclark at gmail.com (Rob Clark)
Date: Tue, 10 Mar 2015 13:58:52 -0400
Subject: [Beignet] [Intel-gfx] [PATCH i-g-t 2/2] configure: Bump
 required libdrm version to 2.4.60
In-Reply-To: <20150310165915.GG3263@jeffdesk>
References: <1425943172-28040-1-git-send-email-jeff.mcgee@intel.com>
 <1425944462-28293-1-git-send-email-jeff.mcgee@intel.com>
 <20150310073730.GA3800@phenom.ffwll.local>
 <20150310165915.GG3263@jeffdesk>
Message-ID: <CAF6AEGvzHihd72krmRbMR_dXHA3YdsmZyGu4+MSMvuu-52E4DQ@mail.gmail.com>

On Tue, Mar 10, 2015 at 12:59 PM, Jeff McGee <jeff.mcgee at intel.com> wrote:
> On Tue, Mar 10, 2015 at 08:37:30AM +0100, Daniel Vetter wrote:
>> On Mon, Mar 09, 2015 at 04:41:02PM -0700, jeff.mcgee at intel.com wrote:
>> > From: Jeff McGee <jeff.mcgee at intel.com>
>> >
>> > tests/core_getparams needs the new libdrm interfaces for
>> > querying subslice and EU counts.
>> >
>> > For: VIZ-4636
>> > Signed-off-by: Jeff McGee <jeff.mcgee at intel.com>
>> > ---
>> >  configure.ac | 2 +-
>> >  1 file changed, 1 insertion(+), 1 deletion(-)
>> >
>> > diff --git a/configure.ac b/configure.ac
>> > index 16d6a2e..88a1c3d 100644
>> > --- a/configure.ac
>> > +++ b/configure.ac
>> > @@ -82,7 +82,7 @@ if test "x$GCC" = "xyes"; then
>> >  fi
>> >  AC_SUBST(ASSEMBLER_WARN_CFLAGS)
>> >
>> > -PKG_CHECK_MODULES(DRM, [libdrm_intel >= 2.4.52 libdrm])
>> > +PKG_CHECK_MODULES(DRM, [libdrm_intel >= 2.4.60 libdrm])
>>
>> Please don't and instead copypaste the new structs/defines with a local_
>> prefix like we do it for all the other new igt testcases. Forcing libdrm
>> to get updated for igt all the time can get annoying fast.
>> -Daniel
>>
> In this case I'm trying to exercise new API functions in libdrm which
> wrap the GETPARAM ioctl. Would you rather me bypass the wrapper to
> avoid requiring updated libdrm? I can do that, but it fails to test the
> complete path that client would use.


Am I missing something, or does 2.4.60 not exist yet?

That said dependency bumps for igt seem like less of an issue than
dependency bumps for mesa..  I mean if you are using igt you are
probably on the latest anyways..  I'm not sure why Daniel is so
concerned about that..

(but dependency bumps to something that doesn't exist yet should
perhaps be avoided)

BR,
-R


> -Jeff
>
>> >  PKG_CHECK_MODULES(PCIACCESS, [pciaccess >= 0.10])
>> >  PKG_CHECK_MODULES(OVERLAY_XVLIB, [xv x11 xext dri2proto >= 2.6], enable_overlay_xvlib=yes, enable_overlay_xvlib=no)
>> >  PKG_CHECK_MODULES(OVERLAY_XLIB, [cairo-xlib dri2proto >= 2.6], enable_overlay_xlib=yes, enable_overlay_xlib=no)
>> > --
>> > 2.3.0
>> >
>> > _______________________________________________
>> > Intel-gfx mailing list
>> > Intel-gfx at lists.freedesktop.org
>> > http://lists.freedesktop.org/mailman/listinfo/intel-gfx
>>
>> --
>> Daniel Vetter
>> Software Engineer, Intel Corporation
>> +41 (0) 79 365 57 48 - http://blog.ffwll.ch
>> _______________________________________________
>> Beignet mailing list
>> Beignet at lists.freedesktop.org
>> http://lists.freedesktop.org/mailman/listinfo/beignet
> _______________________________________________
> dri-devel mailing list
> dri-devel at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/dri-devel

From robdclark at gmail.com  Tue Mar 10 11:24:30 2015
From: robdclark at gmail.com (Rob Clark)
Date: Tue, 10 Mar 2015 14:24:30 -0400
Subject: [Beignet] [Intel-gfx] [PATCH i-g-t 2/2] configure: Bump
 required libdrm version to 2.4.60
In-Reply-To: <20150310183458.GH3263@jeffdesk>
References: <1425943172-28040-1-git-send-email-jeff.mcgee@intel.com>
 <1425944462-28293-1-git-send-email-jeff.mcgee@intel.com>
 <20150310073730.GA3800@phenom.ffwll.local>
 <20150310165915.GG3263@jeffdesk>
 <CAF6AEGvzHihd72krmRbMR_dXHA3YdsmZyGu4+MSMvuu-52E4DQ@mail.gmail.com>
 <20150310183458.GH3263@jeffdesk>
Message-ID: <CAF6AEGvu1PPXQZbHXwMPPJK4ho_fTkYFC5kUYRmuZMhtJ4wFFw@mail.gmail.com>

On Tue, Mar 10, 2015 at 2:34 PM, Jeff McGee <jeff.mcgee at intel.com> wrote:
> On Tue, Mar 10, 2015 at 01:58:52PM -0400, Rob Clark wrote:
>> On Tue, Mar 10, 2015 at 12:59 PM, Jeff McGee <jeff.mcgee at intel.com> wrote:
>> > On Tue, Mar 10, 2015 at 08:37:30AM +0100, Daniel Vetter wrote:
>> >> On Mon, Mar 09, 2015 at 04:41:02PM -0700, jeff.mcgee at intel.com wrote:
>> >> > From: Jeff McGee <jeff.mcgee at intel.com>
>> >> >
>> >> > tests/core_getparams needs the new libdrm interfaces for
>> >> > querying subslice and EU counts.
>> >> >
>> >> > For: VIZ-4636
>> >> > Signed-off-by: Jeff McGee <jeff.mcgee at intel.com>
>> >> > ---
>> >> >  configure.ac | 2 +-
>> >> >  1 file changed, 1 insertion(+), 1 deletion(-)
>> >> >
>> >> > diff --git a/configure.ac b/configure.ac
>> >> > index 16d6a2e..88a1c3d 100644
>> >> > --- a/configure.ac
>> >> > +++ b/configure.ac
>> >> > @@ -82,7 +82,7 @@ if test "x$GCC" = "xyes"; then
>> >> >  fi
>> >> >  AC_SUBST(ASSEMBLER_WARN_CFLAGS)
>> >> >
>> >> > -PKG_CHECK_MODULES(DRM, [libdrm_intel >= 2.4.52 libdrm])
>> >> > +PKG_CHECK_MODULES(DRM, [libdrm_intel >= 2.4.60 libdrm])
>> >>
>> >> Please don't and instead copypaste the new structs/defines with a local_
>> >> prefix like we do it for all the other new igt testcases. Forcing libdrm
>> >> to get updated for igt all the time can get annoying fast.
>> >> -Daniel
>> >>
>> > In this case I'm trying to exercise new API functions in libdrm which
>> > wrap the GETPARAM ioctl. Would you rather me bypass the wrapper to
>> > avoid requiring updated libdrm? I can do that, but it fails to test the
>> > complete path that client would use.
>>
>>
>> Am I missing something, or does 2.4.60 not exist yet?
>>
>> That said dependency bumps for igt seem like less of an issue than
>> dependency bumps for mesa..  I mean if you are using igt you are
>> probably on the latest anyways..  I'm not sure why Daniel is so
>> concerned about that..
>>
>> (but dependency bumps to something that doesn't exist yet should
>> perhaps be avoided)
>>
>> BR,
>> -R
>>
>
> Hi Rob. This igt change is contingent upon my libdrm changes which
> would in fact bump that version to 2.4.60 after adding an API. That
> change is also posted and waiting review. I guess I should have stated
> that dependency here to begin with.
>
> http://lists.freedesktop.org/archives/intel-gfx/2015-March/061101.html
>

ahh, my bad.. I hadn't read all of the threads.. sorry for the noise ;-)

BR,
-R

> Jeff
>>
>> > -Jeff
>> >
>> >> >  PKG_CHECK_MODULES(PCIACCESS, [pciaccess >= 0.10])
>> >> >  PKG_CHECK_MODULES(OVERLAY_XVLIB, [xv x11 xext dri2proto >= 2.6], enable_overlay_xvlib=yes, enable_overlay_xvlib=no)
>> >> >  PKG_CHECK_MODULES(OVERLAY_XLIB, [cairo-xlib dri2proto >= 2.6], enable_overlay_xlib=yes, enable_overlay_xlib=no)
>> >> > --
>> >> > 2.3.0
>> >> >
>> >> > _______________________________________________
>> >> > Intel-gfx mailing list
>> >> > Intel-gfx at lists.freedesktop.org
>> >> > http://lists.freedesktop.org/mailman/listinfo/intel-gfx
>> >>
>> >> --
>> >> Daniel Vetter
>> >> Software Engineer, Intel Corporation
>> >> +41 (0) 79 365 57 48 - http://blog.ffwll.ch
>> >> _______________________________________________
>> >> Beignet mailing list
>> >> Beignet at lists.freedesktop.org
>> >> http://lists.freedesktop.org/mailman/listinfo/beignet
>> > _______________________________________________
>> > dri-devel mailing list
>> > dri-devel at lists.freedesktop.org
>> > http://lists.freedesktop.org/mailman/listinfo/dri-devel
>> _______________________________________________
>> dri-devel mailing list
>> dri-devel at lists.freedesktop.org
>> http://lists.freedesktop.org/mailman/listinfo/dri-devel

From zhigang.gong at linux.intel.com  Thu Mar 12 01:10:27 2015
From: zhigang.gong at linux.intel.com (Zhigang Gong)
Date: Thu, 12 Mar 2015 16:10:27 +0800
Subject: [Beignet] [PATCH] GBE: Only emit multiply when immediate is not
 one.
In-Reply-To: <1426055964-8851-1-git-send-email-ruiling.song@intel.com>
References: <1426055964-8851-1-git-send-email-ruiling.song@intel.com>
Message-ID: <20150312081026.GA21732@ivb-gt2-rev4>

Nice catch, ruiling. Just pushed.

Thanks,
Zhigang Gong.

On Wed, Mar 11, 2015 at 02:39:24PM +0800, Ruiling Song wrote:
> As constant propagation will introduce constantExpr and gep instruction,
> I choose not to run constant propagation pass after RemoveGep pass.
> So, here only generate Multiply as needed.
> We may do such kind of optimization in Gen IR level in the future.
> 
> This could fix the performance regression introduced by:
> "GBE: Import constantexpr lower pass from pNaCl"
> to the opencv case:
> opencv_perf_imgproc/OCL_BilateralFixture_Bilateral
> 
> Signed-off-by: Ruiling Song <ruiling.song at intel.com>
> ---
>  backend/src/llvm/llvm_passes.cpp |    8 +++++---
>  1 file changed, 5 insertions(+), 3 deletions(-)
> 
> diff --git a/backend/src/llvm/llvm_passes.cpp b/backend/src/llvm/llvm_passes.cpp
> index 1b40389..223f61b 100644
> --- a/backend/src/llvm/llvm_passes.cpp
> +++ b/backend/src/llvm/llvm_passes.cpp
> @@ -346,9 +346,11 @@ namespace gbe
>            }
>          }
>  
> -        BinaryOperator* tmpMul = 
> -          BinaryOperator::Create(Instruction::Mul, newConstSize, operand,
> -              "", GEPInst);
> +        Value* tmpMul = operand;
> +        if (size != 1) {
> +          tmpMul = BinaryOperator::Create(Instruction::Mul, newConstSize, operand,
> +                                         "", GEPInst);
> +        }
>          currentAddrInst = 
>            BinaryOperator::Create(Instruction::Add, currentAddrInst, tmpMul,
>                "", GEPInst);
> -- 
> 1.7.10.4
> 
> _______________________________________________
> Beignet mailing list
> Beignet at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/beignet

From zhigang.gong at linux.intel.com  Thu Mar 12 01:21:36 2015
From: zhigang.gong at linux.intel.com (Zhigang Gong)
Date: Thu, 12 Mar 2015 16:21:36 +0800
Subject: [Beignet] [PATCH 1/2] Backend: Add the logic to handle uniform
 src for BSwap Gen8.
In-Reply-To: <1425971077-5781-1-git-send-email-junyan.he@inbox.com>
References: <1425971077-5781-1-git-send-email-junyan.he@inbox.com>
Message-ID: <20150312082135.GB21732@ivb-gt2-rev4>

The patchset LGTM, will push later.

Thanks,
Zhigang Gong.

On Tue, Mar 10, 2015 at 03:04:37PM +0800, junyan.he at inbox.com wrote:
> From: Junyan He <junyan.he at linux.intel.com>
> 
> Signed-off-by: Junyan He <junyan.he at linux.intel.com>
> ---
>  backend/src/backend/gen8_context.cpp | 90 +++++++++++++++++++++++++-----------
>  1 file changed, 62 insertions(+), 28 deletions(-)
> 
> diff --git a/backend/src/backend/gen8_context.cpp b/backend/src/backend/gen8_context.cpp
> index 0d4a40e..3f57cf6 100644
> --- a/backend/src/backend/gen8_context.cpp
> +++ b/backend/src/backend/gen8_context.cpp
> @@ -117,7 +117,8 @@ namespace gbe
>              }
>            } else {
>              if (src.type == GEN_TYPE_UD || src.type == GEN_TYPE_D) {
> -              GBE_ASSERT(src.subnr == 0);
> +              bool uniform_src = (src.hstride == GEN_HORIZONTAL_STRIDE_0);
> +              GBE_ASSERT(uniform_src || src.subnr == 0);
>                GBE_ASSERT(dst.subnr == 0);
>                GBE_ASSERT(tmp.subnr == 0);
>                GBE_ASSERT(start_addr >= 0);
> @@ -125,18 +126,33 @@ namespace gbe
>                new_a0[1] = start_addr + 2;
>                new_a0[2] = start_addr + 1;
>                new_a0[3] = start_addr;
> -              new_a0[4] = start_addr + 7;
> -              new_a0[5] = start_addr + 6;
> -              new_a0[6] = start_addr + 5;
> -              new_a0[7] = start_addr + 4;
> -              new_a0[8] = start_addr + 11;
> -              new_a0[9] = start_addr + 10;
> -              new_a0[10] = start_addr + 9;
> -              new_a0[11] = start_addr + 8;
> -              new_a0[12] = start_addr + 15;
> -              new_a0[13] = start_addr + 14;
> -              new_a0[14] = start_addr + 13;
> -              new_a0[15] = start_addr + 12;
> +              if (!uniform_src) {
> +                new_a0[4] = start_addr + 7;
> +                new_a0[5] = start_addr + 6;
> +                new_a0[6] = start_addr + 5;
> +                new_a0[7] = start_addr + 4;
> +                new_a0[8] = start_addr + 11;
> +                new_a0[9] = start_addr + 10;
> +                new_a0[10] = start_addr + 9;
> +                new_a0[11] = start_addr + 8;
> +                new_a0[12] = start_addr + 15;
> +                new_a0[13] = start_addr + 14;
> +                new_a0[14] = start_addr + 13;
> +                new_a0[15] = start_addr + 12;
> +              } else {
> +                new_a0[4] = start_addr + 3;
> +                new_a0[5] = start_addr + 2;
> +                new_a0[6] = start_addr + 1;
> +                new_a0[7] = start_addr;
> +                new_a0[8] = start_addr + 3;
> +                new_a0[9] = start_addr + 2;
> +                new_a0[10] = start_addr + 1;
> +                new_a0[11] = start_addr;
> +                new_a0[12] = start_addr + 3;
> +                new_a0[13] = start_addr + 2;
> +                new_a0[14] = start_addr + 1;
> +                new_a0[15] = start_addr;
> +              }
>                this->setA0Content(new_a0, 48);
>  
>                p->push();
> @@ -158,26 +174,44 @@ namespace gbe
>  
>                p->MOV(dst, tmp);
>              } else if (src.type == GEN_TYPE_UW || src.type == GEN_TYPE_W) {
> -              GBE_ASSERT(src.subnr == 0 || src.subnr == 16);
> +              bool uniform_src = (src.hstride == GEN_HORIZONTAL_STRIDE_0);
> +              GBE_ASSERT(uniform_src || src.subnr == 0 || src.subnr == 16);
>                GBE_ASSERT(dst.subnr == 0 || dst.subnr == 16);
>                GBE_ASSERT(tmp.subnr == 0 || tmp.subnr == 16);
>                GBE_ASSERT(start_addr >= 0);
>                new_a0[0] = start_addr + 1;
>                new_a0[1] = start_addr;
> -              new_a0[2] = start_addr + 3;
> -              new_a0[3] = start_addr + 2;
> -              new_a0[4] = start_addr + 5;
> -              new_a0[5] = start_addr + 4;
> -              new_a0[6] = start_addr + 7;
> -              new_a0[7] = start_addr + 6;
> -              new_a0[8] = start_addr + 9;
> -              new_a0[9] = start_addr + 8;
> -              new_a0[10] = start_addr + 11;
> -              new_a0[11] = start_addr + 10;
> -              new_a0[12] = start_addr + 13;
> -              new_a0[13] = start_addr + 12;
> -              new_a0[14] = start_addr + 15;
> -              new_a0[15] = start_addr + 14;
> +              if (!uniform_src) {
> +                new_a0[2] = start_addr + 3;
> +                new_a0[3] = start_addr + 2;
> +                new_a0[4] = start_addr + 5;
> +                new_a0[5] = start_addr + 4;
> +                new_a0[6] = start_addr + 7;
> +                new_a0[7] = start_addr + 6;
> +                new_a0[8] = start_addr + 9;
> +                new_a0[9] = start_addr + 8;
> +                new_a0[10] = start_addr + 11;
> +                new_a0[11] = start_addr + 10;
> +                new_a0[12] = start_addr + 13;
> +                new_a0[13] = start_addr + 12;
> +                new_a0[14] = start_addr + 15;
> +                new_a0[15] = start_addr + 14;
> +              } else {
> +                new_a0[2] = start_addr + 1;
> +                new_a0[3] = start_addr;
> +                new_a0[4] = start_addr + 1;
> +                new_a0[5] = start_addr;
> +                new_a0[6] = start_addr + 1;
> +                new_a0[7] = start_addr;
> +                new_a0[8] = start_addr + 1;
> +                new_a0[9] = start_addr;
> +                new_a0[10] = start_addr + 1;
> +                new_a0[11] = start_addr;
> +                new_a0[12] = start_addr + 1;
> +                new_a0[13] = start_addr;
> +                new_a0[14] = start_addr + 1;
> +                new_a0[15] = start_addr;
> +              }
>                this->setA0Content(new_a0, 48);
>  
>                p->push();
> -- 
> 2.1.0
> 
> _______________________________________________
> Beignet mailing list
> Beignet at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/beignet

From zhigang.gong at linux.intel.com  Thu Mar 12 01:22:34 2015
From: zhigang.gong at linux.intel.com (Zhigang Gong)
Date: Thu, 12 Mar 2015 16:22:34 +0800
Subject: [Beignet] [PATCH 1/7] replace fabs with llvm intrinsic.
In-Reply-To: <1425967188-22075-1-git-send-email-xionghu.luo@intel.com>
References: <1425967188-22075-1-git-send-email-xionghu.luo@intel.com>
Message-ID: <20150312082234.GC21732@ivb-gt2-rev4>

The whole patchset LGTM, will push later, thanks.

On Tue, Mar 10, 2015 at 01:59:42PM +0800, xionghu.luo at intel.com wrote:
> From: Luo Xionghu <xionghu.luo at intel.com>
> 
> translate native fabs to llvm.fabs for fast path.
> 
> Signed-off-by: Luo Xionghu <xionghu.luo at intel.com>
> ---
>  backend/src/libocl/tmpl/ocl_math.tmpl.cl   | 2 +-
>  backend/src/llvm/llvm_gen_backend.cpp      | 2 --
>  backend/src/llvm/llvm_gen_ocl_function.hxx | 1 -
>  3 files changed, 1 insertion(+), 4 deletions(-)
> 
> diff --git a/backend/src/libocl/tmpl/ocl_math.tmpl.cl b/backend/src/libocl/tmpl/ocl_math.tmpl.cl
> index 2ed7b31..681e70c 100644
> --- a/backend/src/libocl/tmpl/ocl_math.tmpl.cl
> +++ b/backend/src/libocl/tmpl/ocl_math.tmpl.cl
> @@ -23,7 +23,7 @@
>  
>  extern constant int __ocl_math_fastpath_flag;
>  
> -PURE CONST float __gen_ocl_fabs(float x);
> +CONST float __gen_ocl_fabs(float x) __asm("llvm.fabs" ".f32");
>  CONST float __gen_ocl_sin(float x) __asm("llvm.sin" ".f32");
>  CONST float __gen_ocl_cos(float x) __asm("llvm.cos" ".f32");
>  CONST float __gen_ocl_sqrt(float x) __asm("llvm.sqrt" ".f32");
> diff --git a/backend/src/llvm/llvm_gen_backend.cpp b/backend/src/llvm/llvm_gen_backend.cpp
> index c0ff1d1..a42ee40 100644
> --- a/backend/src/llvm/llvm_gen_backend.cpp
> +++ b/backend/src/llvm/llvm_gen_backend.cpp
> @@ -2673,7 +2673,6 @@ namespace gbe
>        case GEN_OCL_POW:
>        case GEN_OCL_RCP:
>        case GEN_OCL_ABS:
> -      case GEN_OCL_FABS:
>        case GEN_OCL_RNDZ:
>        case GEN_OCL_RNDE:
>        case GEN_OCL_RNDU:
> @@ -3077,7 +3076,6 @@ namespace gbe
>            }
>            case GEN_OCL_RSQ: this->emitUnaryCallInst(I,CS,ir::OP_RSQ); break;
>            case GEN_OCL_RCP: this->emitUnaryCallInst(I,CS,ir::OP_RCP); break;
> -          case GEN_OCL_FABS: this->emitUnaryCallInst(I,CS,ir::OP_ABS); break;
>            case GEN_OCL_RNDZ: this->emitUnaryCallInst(I,CS,ir::OP_RNDZ); break;
>            case GEN_OCL_RNDE: this->emitUnaryCallInst(I,CS,ir::OP_RNDE); break;
>            case GEN_OCL_RNDU: this->emitUnaryCallInst(I,CS,ir::OP_RNDU); break;
> diff --git a/backend/src/llvm/llvm_gen_ocl_function.hxx b/backend/src/llvm/llvm_gen_ocl_function.hxx
> index 45358d0..8e37df9 100644
> --- a/backend/src/llvm/llvm_gen_ocl_function.hxx
> +++ b/backend/src/llvm/llvm_gen_ocl_function.hxx
> @@ -19,7 +19,6 @@ DECL_LLVM_GEN_FUNCTION(GET_GLOBAL_OFFSET2, __gen_ocl_get_global_offset2)
>  DECL_LLVM_GEN_FUNCTION(GET_WORK_DIM, __gen_ocl_get_work_dim)
>  
>  // Math function
> -DECL_LLVM_GEN_FUNCTION(FABS, __gen_ocl_fabs)
>  DECL_LLVM_GEN_FUNCTION(RSQ, __gen_ocl_rsqrt)
>  DECL_LLVM_GEN_FUNCTION(POW, __gen_ocl_pow)
>  DECL_LLVM_GEN_FUNCTION(RCP, __gen_ocl_rcp)
> -- 
> 1.9.1
> 
> _______________________________________________
> Beignet mailing list
> Beignet at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/beignet

From jeff.mcgee at intel.com  Thu Mar 12 13:38:55 2015
From: jeff.mcgee at intel.com (jeff.mcgee at intel.com)
Date: Thu, 12 Mar 2015 13:38:55 -0700
Subject: [Beignet] [PATCH i-g-t v2] tests/core_getparams: Create new test
	core_getparams
In-Reply-To: <1425943172-28040-1-git-send-email-jeff.mcgee@intel.com>
References: <1425943172-28040-1-git-send-email-jeff.mcgee@intel.com>
Message-ID: <1426192735-3306-1-git-send-email-jeff.mcgee@intel.com>

From: Jeff McGee <jeff.mcgee at intel.com>

New test core_getparams consists of 2 subtests, each one testing
the ability of userspace to query the correct value of a GT config
attribute: subslice total or EU total. drm/i915 implementation of
these queries is required for Cherryview and Gen9+ devices (non-
simulated).

v2: Duplicate small amount of new libdrm functionality to avoid
    bumping libdrm version requirement (Daniel). Convert some
    igt_asserts to the appropriate comparison variants. Add a
    test description.

For: VIZ-4636
Signed-off-by: Jeff McGee <jeff.mcgee at intel.com>
---
 tests/.gitignore       |   1 +
 tests/Makefile.sources |   1 +
 tests/core_getparams.c | 167 +++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 169 insertions(+)
 create mode 100644 tests/core_getparams.c

diff --git a/tests/.gitignore b/tests/.gitignore
index 426cc67..c742308 100644
--- a/tests/.gitignore
+++ b/tests/.gitignore
@@ -1,6 +1,7 @@
 # Please keep sorted alphabetically
 core_get_client_auth
 core_getclient
+core_getparams
 core_getstats
 core_getversion
 drm_import_export
diff --git a/tests/Makefile.sources b/tests/Makefile.sources
index 51e8376..999c8f8 100644
--- a/tests/Makefile.sources
+++ b/tests/Makefile.sources
@@ -15,6 +15,7 @@ NOUVEAU_TESTS_M = \
 
 TESTS_progs_M = \
 	core_get_client_auth \
+	core_getparams \
 	drv_suspend \
 	drv_hangman \
 	gem_bad_reloc \
diff --git a/tests/core_getparams.c b/tests/core_getparams.c
new file mode 100644
index 0000000..2855d06
--- /dev/null
+++ b/tests/core_getparams.c
@@ -0,0 +1,167 @@
+/*
+ * Copyright © 2014 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Jeff McGee <jeff.mcgee at intel.com>
+ *
+ */
+
+#include <unistd.h>
+#include <errno.h>
+#include <xf86drm.h>
+#include <i915_drm.h>
+#include "drmtest.h"
+#include "intel_chipset.h"
+#include "intel_bufmgr.h"
+
+IGT_TEST_DESCRIPTION("Tests the export of parameters via DRM_IOCTL_I915_GETPARAM\n");
+
+int drm_fd;
+int devid;
+
+static void
+init(void)
+{
+	drm_fd = drm_open_any();
+	devid = intel_get_drm_devid(drm_fd);
+}
+
+static void
+deinit(void)
+{
+	close(drm_fd);
+}
+
+#define LOCAL_I915_PARAM_SUBSLICE_TOTAL	33
+#define LOCAL_I915_PARAM_EU_TOTAL	34
+
+static int
+getparam(int param, int *value)
+{
+	drm_i915_getparam_t gp;
+	int ret;
+
+	memset(&gp, 0, sizeof(gp));
+	gp.value = value;
+	gp.param = param;
+	ret = drmIoctl(drm_fd, DRM_IOCTL_I915_GETPARAM, &gp);
+	if (ret)
+		return -errno;
+
+	return 0;
+}
+
+static void
+subslice_total(void)
+{
+	unsigned int subslice_total = 0;
+	int ret;
+
+	ret = getparam(I915_PARAM_SUBSLICE_TOTAL, (int*)&subslice_total);
+
+	if (ret) {
+		/*
+		 * These devices are not required to implement the
+		 * interface. If they do not, -ENODEV must be returned.
+		*/
+		if ((intel_gen(devid) < 8) ||
+		    IS_BROADWELL(devid) ||
+		    igt_run_in_simulation()) {
+			igt_assert_eq(ret, -ENODEV);
+			igt_info("subslice total: unknown\n");
+		/*
+		 * All other devices must implement the interface, so
+		 * fail them if we are here.
+		*/
+		} else {
+			igt_assert_neq(ret, EINVAL); /* request not recognized? */
+			igt_assert_neq(ret, ENODEV); /* device not supported? */
+			igt_assert_eq(ret, 0); /* other error? */
+		}
+	} else {
+		/*
+		 * On success, just make sure the returned count value is
+		 * non-zero. The validity of the count value for the given
+		 * device is not checked.
+		*/
+		igt_assert_neq(subslice_total, 0);
+		igt_info("subslice total: %u\n", subslice_total);
+	}
+}
+
+static void
+eu_total(void)
+{
+	unsigned int eu_total = 0;
+	int ret;
+
+	ret = getparam(I915_PARAM_EU_TOTAL, (int*)&eu_total);
+
+	if (ret) {
+		/*
+		 * These devices are not required to implement the
+		 * interface. If they do not, -ENODEV must be returned.
+		*/
+		if ((intel_gen(devid) < 8) ||
+		    IS_BROADWELL(devid) ||
+		    igt_run_in_simulation()) {
+			igt_assert_eq(ret, -ENODEV);
+			igt_info("EU total: unknown\n");
+		/*
+		 * All other devices must implement the interface, so
+		 * fail them if we are here.
+		*/
+		} else {
+			igt_assert_neq(ret, EINVAL); /* request not recognized? */
+			igt_assert_neq(ret, ENODEV); /* device not supported? */
+			igt_assert_eq(ret, 0); /* other error? */
+		}
+	} else {
+		/*
+		 * On success, just make sure the returned count value is
+		 * non-zero. The validity of the count value for the given
+		 * device is not checked.
+		*/
+		igt_assert_neq(eu_total, 0);
+		igt_info("EU total: %u\n", eu_total);
+	}
+}
+
+static void
+exit_handler(int sig)
+{
+	deinit();
+}
+
+igt_main
+{
+	igt_fixture {
+		igt_install_exit_handler(exit_handler);
+		init();
+	}
+
+	igt_subtest("subslice-total")
+		subslice_total();
+
+	igt_subtest("eu-total")
+		eu_total();
+}
-- 
2.3.0


From jeff.mcgee at intel.com  Thu Mar 12 13:42:45 2015
From: jeff.mcgee at intel.com (Jeff McGee)
Date: Thu, 12 Mar 2015 13:42:45 -0700
Subject: [Beignet] [Intel-gfx] [PATCH i-g-t 2/2] configure: Bump
 required libdrm version to 2.4.60
In-Reply-To: <20150311072136.GW3800@phenom.ffwll.local>
References: <1425943172-28040-1-git-send-email-jeff.mcgee@intel.com>
 <1425944462-28293-1-git-send-email-jeff.mcgee@intel.com>
 <20150310073730.GA3800@phenom.ffwll.local>
 <20150310165915.GG3263@jeffdesk>
 <CAF6AEGvzHihd72krmRbMR_dXHA3YdsmZyGu4+MSMvuu-52E4DQ@mail.gmail.com>
 <20150310184703.GS3800@phenom.ffwll.local>
 <20150310200644.GI3263@jeffdesk>
 <20150311072136.GW3800@phenom.ffwll.local>
Message-ID: <20150312204245.GJ3263@jeffdesk>

On Wed, Mar 11, 2015 at 08:21:36AM +0100, Daniel Vetter wrote:
> On Tue, Mar 10, 2015 at 01:06:44PM -0700, Jeff McGee wrote:
> > On Tue, Mar 10, 2015 at 07:47:03PM +0100, Daniel Vetter wrote:
> > > On Tue, Mar 10, 2015 at 01:58:52PM -0400, Rob Clark wrote:
> > > > On Tue, Mar 10, 2015 at 12:59 PM, Jeff McGee <jeff.mcgee at intel.com> wrote:
> > > > > On Tue, Mar 10, 2015 at 08:37:30AM +0100, Daniel Vetter wrote:
> > > > >> On Mon, Mar 09, 2015 at 04:41:02PM -0700, jeff.mcgee at intel.com wrote:
> > > > >> > From: Jeff McGee <jeff.mcgee at intel.com>
> > > > >> >
> > > > >> > tests/core_getparams needs the new libdrm interfaces for
> > > > >> > querying subslice and EU counts.
> > > > >> >
> > > > >> > For: VIZ-4636
> > > > >> > Signed-off-by: Jeff McGee <jeff.mcgee at intel.com>
> > > > >> > ---
> > > > >> >  configure.ac | 2 +-
> > > > >> >  1 file changed, 1 insertion(+), 1 deletion(-)
> > > > >> >
> > > > >> > diff --git a/configure.ac b/configure.ac
> > > > >> > index 16d6a2e..88a1c3d 100644
> > > > >> > --- a/configure.ac
> > > > >> > +++ b/configure.ac
> > > > >> > @@ -82,7 +82,7 @@ if test "x$GCC" = "xyes"; then
> > > > >> >  fi
> > > > >> >  AC_SUBST(ASSEMBLER_WARN_CFLAGS)
> > > > >> >
> > > > >> > -PKG_CHECK_MODULES(DRM, [libdrm_intel >= 2.4.52 libdrm])
> > > > >> > +PKG_CHECK_MODULES(DRM, [libdrm_intel >= 2.4.60 libdrm])
> > > > >>
> > > > >> Please don't and instead copypaste the new structs/defines with a local_
> > > > >> prefix like we do it for all the other new igt testcases. Forcing libdrm
> > > > >> to get updated for igt all the time can get annoying fast.
> > > > >> -Daniel
> > > > >>
> > > > > In this case I'm trying to exercise new API functions in libdrm which
> > > > > wrap the GETPARAM ioctl. Would you rather me bypass the wrapper to
> > > > > avoid requiring updated libdrm? I can do that, but it fails to test the
> > > > > complete path that client would use.
> > > > 
> > > > 
> > > > Am I missing something, or does 2.4.60 not exist yet?
> > > > 
> > > > That said dependency bumps for igt seem like less of an issue than
> > > > dependency bumps for mesa..  I mean if you are using igt you are
> > > > probably on the latest anyways..  I'm not sure why Daniel is so
> > > > concerned about that..
> > > > 
> > > > (but dependency bumps to something that doesn't exist yet should
> > > > perhaps be avoided)
> > > 
> > > I'd like to avoid massive depency loops for igt tests so that I can merge
> > > the testcase right when the patches land in -nightly. Otherwise there's
> > > always a small delay involved where regression can creep in. Also if I
> > > have to update libdrm every time I update igt that's annoying since
> > > without that I don't have to install/update anything at all - I run igt
> > > in-place. And we've used the LOCAL_ prefixes for pretty much every abi
> > > addition in igt tests thus far.
> > > -Daniel
> > 
> > I understand that and it certainly makes sense when libdrm is only
> > providing defines or structs. But as I said, in this case there is
> > code in libdrm (the wrapper) that we could test as part of the
> > complete path. Are you recommending that I implement duplicate
> > wrapper functions in igt with the local prefix?
> 
> Sorry I totally didn't realize that. Generally we don't have a lot of igt
> testcase for libdrm really, imo it's usually simpler to just add the
> interface to each part. Since this is such a simple one there's no need to
> have a low-level test and the libdrm test on top, but that's what I'd do
> if there's something worth testing in libdrm. Because for complex
> functionality I really want to get the bare-metal tests in together with
> the kernel part. Stalling for libdrm release could take longer.
> 
> And yes, personally I'd just have open-coded the getparam call here in
> igt, but that's just a bikeshed.
> -Daniel
> -- 

Scratch this patch. I just sent v2 of the previous patch that removes
the dependency on libdrm update.
-Jeff

From jeff.mcgee at intel.com  Thu Mar 12 13:52:40 2015
From: jeff.mcgee at intel.com (Jeff McGee)
Date: Thu, 12 Mar 2015 13:52:40 -0700
Subject: [Beignet] [PATCH 2/2 v2] Query the driver directly for
	compute	units and subslice
In-Reply-To: <000501d05c69$7eb113c0$7c133b40$@linux.intel.com>
References: <1425339759-19027-2-git-send-email-jeff.mcgee@intel.com>
 <1425944158-28223-1-git-send-email-jeff.mcgee@intel.com>
 <000501d05c69$7eb113c0$7c133b40$@linux.intel.com>
Message-ID: <20150312205240.GK3263@jeffdesk>

On Thu, Mar 12, 2015 at 10:08:54AM +0800, Zhigang Gong wrote:
> LGTM,
> 
> Reviewed-by: Zhigang Gong <zhigang.gong at linux.intel.com>
> 
> Thanks.
> 

Thanks for the review, Zhigang.

With beignet portion reviewed, review should be able to proceed for
the i915, libdrm, and igt parts. These are all quite simple. Can someone(s)
please review?

-Jeff

From jeff.mcgee at intel.com  Thu Mar 12 17:26:25 2015
From: jeff.mcgee at intel.com (jeff.mcgee at intel.com)
Date: Thu, 12 Mar 2015 17:26:25 -0700
Subject: [Beignet] [PATCH i-g-t v3] tests/core_getparams: Create new test
	core_getparams
In-Reply-To: <1426192735-3306-1-git-send-email-jeff.mcgee@intel.com>
References: <1426192735-3306-1-git-send-email-jeff.mcgee@intel.com>
Message-ID: <1426206385-4251-1-git-send-email-jeff.mcgee@intel.com>

From: Jeff McGee <jeff.mcgee at intel.com>

New test core_getparams consists of 2 subtests, each one testing
the ability of userspace to query the correct value of a GT config
attribute: subslice total or EU total. drm/i915 implementation of
these queries is required for Cherryview and Gen9+ devices (non-
simulated).

v2: Duplicate small amount of new libdrm functionality to avoid
    bumping libdrm version requirement (Daniel). Convert some
    igt_asserts to the appropriate comparison variants. Add a
    test description.
v3: Actually use the LOCAL GETPARAM defines. Otherwise can't build
    against older libdrm as intended by v2.

For: VIZ-4636
Signed-off-by: Jeff McGee <jeff.mcgee at intel.com>
---
 tests/.gitignore       |   1 +
 tests/Makefile.sources |   1 +
 tests/core_getparams.c | 167 +++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 169 insertions(+)
 create mode 100644 tests/core_getparams.c

diff --git a/tests/.gitignore b/tests/.gitignore
index 426cc67..c742308 100644
--- a/tests/.gitignore
+++ b/tests/.gitignore
@@ -1,6 +1,7 @@
 # Please keep sorted alphabetically
 core_get_client_auth
 core_getclient
+core_getparams
 core_getstats
 core_getversion
 drm_import_export
diff --git a/tests/Makefile.sources b/tests/Makefile.sources
index 51e8376..999c8f8 100644
--- a/tests/Makefile.sources
+++ b/tests/Makefile.sources
@@ -15,6 +15,7 @@ NOUVEAU_TESTS_M = \
 
 TESTS_progs_M = \
 	core_get_client_auth \
+	core_getparams \
 	drv_suspend \
 	drv_hangman \
 	gem_bad_reloc \
diff --git a/tests/core_getparams.c b/tests/core_getparams.c
new file mode 100644
index 0000000..2855d06
--- /dev/null
+++ b/tests/core_getparams.c
@@ -0,0 +1,167 @@
+/*
+ * Copyright © 2014 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Jeff McGee <jeff.mcgee at intel.com>
+ *
+ */
+
+#include <unistd.h>
+#include <errno.h>
+#include <xf86drm.h>
+#include <i915_drm.h>
+#include "drmtest.h"
+#include "intel_chipset.h"
+#include "intel_bufmgr.h"
+
+IGT_TEST_DESCRIPTION("Tests the export of parameters via DRM_IOCTL_I915_GETPARAM\n");
+
+int drm_fd;
+int devid;
+
+static void
+init(void)
+{
+	drm_fd = drm_open_any();
+	devid = intel_get_drm_devid(drm_fd);
+}
+
+static void
+deinit(void)
+{
+	close(drm_fd);
+}
+
+#define LOCAL_I915_PARAM_SUBSLICE_TOTAL	33
+#define LOCAL_I915_PARAM_EU_TOTAL	34
+
+static int
+getparam(int param, int *value)
+{
+	drm_i915_getparam_t gp;
+	int ret;
+
+	memset(&gp, 0, sizeof(gp));
+	gp.value = value;
+	gp.param = param;
+	ret = drmIoctl(drm_fd, DRM_IOCTL_I915_GETPARAM, &gp);
+	if (ret)
+		return -errno;
+
+	return 0;
+}
+
+static void
+subslice_total(void)
+{
+	unsigned int subslice_total = 0;
+	int ret;
+
+	ret = getparam(LOCAL_I915_PARAM_SUBSLICE_TOTAL, (int*)&subslice_total);
+
+	if (ret) {
+		/*
+		 * These devices are not required to implement the
+		 * interface. If they do not, -ENODEV must be returned.
+		*/
+		if ((intel_gen(devid) < 8) ||
+		    IS_BROADWELL(devid) ||
+		    igt_run_in_simulation()) {
+			igt_assert_eq(ret, -ENODEV);
+			igt_info("subslice total: unknown\n");
+		/*
+		 * All other devices must implement the interface, so
+		 * fail them if we are here.
+		*/
+		} else {
+			igt_assert_neq(ret, EINVAL); /* request not recognized? */
+			igt_assert_neq(ret, ENODEV); /* device not supported? */
+			igt_assert_eq(ret, 0); /* other error? */
+		}
+	} else {
+		/*
+		 * On success, just make sure the returned count value is
+		 * non-zero. The validity of the count value for the given
+		 * device is not checked.
+		*/
+		igt_assert_neq(subslice_total, 0);
+		igt_info("subslice total: %u\n", subslice_total);
+	}
+}
+
+static void
+eu_total(void)
+{
+	unsigned int eu_total = 0;
+	int ret;
+
+	ret = getparam(LOCAL_I915_PARAM_EU_TOTAL, (int*)&eu_total);
+
+	if (ret) {
+		/*
+		 * These devices are not required to implement the
+		 * interface. If they do not, -ENODEV must be returned.
+		*/
+		if ((intel_gen(devid) < 8) ||
+		    IS_BROADWELL(devid) ||
+		    igt_run_in_simulation()) {
+			igt_assert_eq(ret, -ENODEV);
+			igt_info("EU total: unknown\n");
+		/*
+		 * All other devices must implement the interface, so
+		 * fail them if we are here.
+		*/
+		} else {
+			igt_assert_neq(ret, EINVAL); /* request not recognized? */
+			igt_assert_neq(ret, ENODEV); /* device not supported? */
+			igt_assert_eq(ret, 0); /* other error? */
+		}
+	} else {
+		/*
+		 * On success, just make sure the returned count value is
+		 * non-zero. The validity of the count value for the given
+		 * device is not checked.
+		*/
+		igt_assert_neq(eu_total, 0);
+		igt_info("EU total: %u\n", eu_total);
+	}
+}
+
+static void
+exit_handler(int sig)
+{
+	deinit();
+}
+
+igt_main
+{
+	igt_fixture {
+		igt_install_exit_handler(exit_handler);
+		init();
+	}
+
+	igt_subtest("subslice-total")
+		subslice_total();
+
+	igt_subtest("eu-total")
+		eu_total();
+}
-- 
2.3.0


From xionghu.luo at intel.com  Thu Mar 12 23:03:26 2015
From: xionghu.luo at intel.com (xionghu.luo at intel.com)
Date: Fri, 13 Mar 2015 14:03:26 +0800
Subject: [Beignet] [PATCH] diasble the SPIR case for llvm before than 3.3.
Message-ID: <1426226606-20855-1-git-send-email-xionghu.luo@intel.com>

From: Luo Xionghu <xionghu.luo at intel.com>

clang 3.5 calls CallGraphSCCPass to add the attribute
"Attribute::ReadOnly" to parameters that only read memory, but this
attribute is not supported by the VerifierPass of llvm 3.3. This is a bug
in llvm 3.3.

Signed-off-by: Luo Xionghu <xionghu.luo at intel.com>
---
 backend/src/llvm/llvm_bitcode_link.cpp | 3 +++
 utests/CMakeLists.txt                  | 2 +-
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/backend/src/llvm/llvm_bitcode_link.cpp b/backend/src/llvm/llvm_bitcode_link.cpp
index 17248c0..4a8985f 100644
--- a/backend/src/llvm/llvm_bitcode_link.cpp
+++ b/backend/src/llvm/llvm_bitcode_link.cpp
@@ -237,6 +237,9 @@ namespace gbe
       kernels.push_back(f);
     }
 
+    if(clonedLib->getDataLayout() != mod->getDataLayout())
+      mod->setDataLayout(clonedLib->getDataLayout());
+
     /* We use beignet's bitcode as dst because it will have a lot of
        lazy functions which will not be loaded. */
     char* errorMsg;
diff --git a/utests/CMakeLists.txt b/utests/CMakeLists.txt
index 06baa68..317e27a 100644
--- a/utests/CMakeLists.txt
+++ b/utests/CMakeLists.txt
@@ -192,7 +192,6 @@ set (utests_sources
   compiler_time_stamp.cpp
   compiler_double_precision.cpp
   load_program_from_gen_bin.cpp
-  load_program_from_spir.cpp
   get_arg_info.cpp
   profiling_exec.cpp
   enqueue_copy_buf.cpp
@@ -213,6 +212,7 @@ set (utests_sources
 if (LLVM_VERSION_NODOT VERSION_GREATER 34)
   SET(utests_sources
       ${utests_sources}
+      load_program_from_spir.cpp
       compiler_overflow.cpp)
 endif (LLVM_VERSION_NODOT VERSION_GREATER 34)
 
-- 
1.9.1


From zhigang.gong at linux.intel.com  Thu Mar 12 23:48:56 2015
From: zhigang.gong at linux.intel.com (Zhigang Gong)
Date: Fri, 13 Mar 2015 14:48:56 +0800
Subject: [Beignet] [PATCH] diasble the SPIR case for llvm before than
	3.3.
In-Reply-To: <1426226606-20855-1-git-send-email-xionghu.luo@intel.com>
References: <1426226606-20855-1-git-send-email-xionghu.luo@intel.com>
Message-ID: <00e101d05d59$c7bed7f0$573c87d0$@linux.intel.com>

> -----Original Message-----
> From: Beignet [mailto:beignet-bounces at lists.freedesktop.org] On Behalf Of
> xionghu.luo at intel.com
> Sent: Friday, March 13, 2015 2:03 PM
> To: beignet at lists.freedesktop.org
> Cc: xionghu.luo at intel.com
> Subject: [Beignet] [PATCH] diasble the SPIR case for llvm before than 3.3.
> 
> From: Luo Xionghu <xionghu.luo at intel.com>
> 
> the clang 3.5 will call CallGraphSCCPass to add attribute "Attribute::ReadOnly"
> for these parameters only reads memeory, but this attribute is not supported
> in the VerifierPass of llvm 3.3. This is a bug of llvm 3.3.


> 
> Signed-off-by: Luo Xionghu <xionghu.luo at intel.com>
> ---
>  backend/src/llvm/llvm_bitcode_link.cpp | 3 +++
>  utests/CMakeLists.txt                  | 2 +-
>  2 files changed, 4 insertions(+), 1 deletion(-)
> 
> diff --git a/backend/src/llvm/llvm_bitcode_link.cpp
> b/backend/src/llvm/llvm_bitcode_link.cpp
> index 17248c0..4a8985f 100644
> --- a/backend/src/llvm/llvm_bitcode_link.cpp
> +++ b/backend/src/llvm/llvm_bitcode_link.cpp
> @@ -237,6 +237,9 @@ namespace gbe
>        kernels.push_back(f);
>      }
> 
> +    if(clonedLib->getDataLayout() != mod->getDataLayout())
> +      mod->setDataLayout(clonedLib->getDataLayout());
> +

The above diff should be a separate patch and needs a commit log.

>      /* We use beignet's bitcode as dst because it will have a lot of
>         lazy functions which will not be loaded. */
>      char* errorMsg;
> diff --git a/utests/CMakeLists.txt b/utests/CMakeLists.txt index
> 06baa68..317e27a 100644
> --- a/utests/CMakeLists.txt
> +++ b/utests/CMakeLists.txt
> @@ -192,7 +192,6 @@ set (utests_sources
>    compiler_time_stamp.cpp
>    compiler_double_precision.cpp
>    load_program_from_gen_bin.cpp
> -  load_program_from_spir.cpp
>    get_arg_info.cpp
>    profiling_exec.cpp
>    enqueue_copy_buf.cpp
> @@ -213,6 +212,7 @@ set (utests_sources
>  if (LLVM_VERSION_NODOT VERSION_GREATER 34)
>    SET(utests_sources
>        ${utests_sources}
> +      load_program_from_spir.cpp
A better way is to disable the spir extension for llvm versions before 3.5.
Then in the load_program_from_spir case, you could query the extension first;
if there is no spir extension, then just do nothing.

Thanks,
Zhigang Gong.

>        compiler_overflow.cpp)
>  endif (LLVM_VERSION_NODOT VERSION_GREATER 34)
> 
> --
> 1.9.1
> 
> _______________________________________________
> Beignet mailing list
> Beignet at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/beignet


From xionghu.luo at intel.com  Fri Mar 13 01:29:45 2015
From: xionghu.luo at intel.com (xionghu.luo at intel.com)
Date: Fri, 13 Mar 2015 16:29:45 +0800
Subject: [Beignet] [PATCH 1/3] reset the SPIR target datalayout.
Message-ID: <1426235387-21346-1-git-send-email-xionghu.luo@intel.com>

From: Luo Xionghu <xionghu.luo at intel.com>

Signed-off-by: Luo Xionghu <xionghu.luo at intel.com>
---
 backend/src/llvm/llvm_bitcode_link.cpp | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/backend/src/llvm/llvm_bitcode_link.cpp b/backend/src/llvm/llvm_bitcode_link.cpp
index 17248c0..3bf9613 100644
--- a/backend/src/llvm/llvm_bitcode_link.cpp
+++ b/backend/src/llvm/llvm_bitcode_link.cpp
@@ -237,6 +237,10 @@ namespace gbe
       kernels.push_back(f);
     }
 
+    /* the SPIR binary datalayout maybe different with beignet's bitcode */
+    if(clonedLib->getDataLayout() != mod->getDataLayout())
+      mod->setDataLayout(clonedLib->getDataLayout());
+
     /* We use beignet's bitcode as dst because it will have a lot of
        lazy functions which will not be loaded. */
     char* errorMsg;
-- 
1.9.1


From xionghu.luo at intel.com  Fri Mar 13 01:32:39 2015
From: xionghu.luo at intel.com (xionghu.luo at intel.com)
Date: Fri, 13 Mar 2015 16:32:39 +0800
Subject: [Beignet] [PATCH V2 1/3] reset the SPIR target datalayout.
Message-ID: <1426235561-21847-1-git-send-email-xionghu.luo@intel.com>

From: Luo Xionghu <xionghu.luo at intel.com>

v2: split into a separate patch.
Signed-off-by: Luo Xionghu <xionghu.luo at intel.com>
---
 backend/src/llvm/llvm_bitcode_link.cpp | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/backend/src/llvm/llvm_bitcode_link.cpp b/backend/src/llvm/llvm_bitcode_link.cpp
index 17248c0..3bf9613 100644
--- a/backend/src/llvm/llvm_bitcode_link.cpp
+++ b/backend/src/llvm/llvm_bitcode_link.cpp
@@ -237,6 +237,10 @@ namespace gbe
       kernels.push_back(f);
     }
 
+    /* the SPIR binary datalayout maybe different with beignet's bitcode */
+    if(clonedLib->getDataLayout() != mod->getDataLayout())
+      mod->setDataLayout(clonedLib->getDataLayout());
+
     /* We use beignet's bitcode as dst because it will have a lot of
        lazy functions which will not be loaded. */
     char* errorMsg;
-- 
1.9.1


From xionghu.luo at intel.com  Fri Mar 13 01:32:40 2015
From: xionghu.luo at intel.com (xionghu.luo at intel.com)
Date: Fri, 13 Mar 2015 16:32:40 +0800
Subject: [Beignet] [PATCH V2 2/3] only support spir extension for beignet
	build with llvm 3.5 or later.
In-Reply-To: <1426235561-21847-1-git-send-email-xionghu.luo@intel.com>
References: <1426235561-21847-1-git-send-email-xionghu.luo@intel.com>
Message-ID: <1426235561-21847-2-git-send-email-xionghu.luo@intel.com>

From: Luo Xionghu <xionghu.luo at intel.com>

clang 3.5 calls CallGraphSCCPass to add the "Attribute::ReadOnly" attribute
to parameters that only read memory, but this attribute is not
supported by the VerifierPass of llvm 3.3. This is a bug in llvm 3.3.

v2: disable this extension in runtime for old llvm.

Signed-off-by: Luo Xionghu <xionghu.luo at intel.com>
---
 src/cl_extensions.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/cl_extensions.c b/src/cl_extensions.c
index cea2dd8..adcf82e 100644
--- a/src/cl_extensions.c
+++ b/src/cl_extensions.c
@@ -1,3 +1,4 @@
+#include "llvm/Config/llvm-config.h"
 #ifdef HAS_EGL
 #include "EGL/egl.h"
 #include "EGL/eglext.h"
@@ -37,8 +38,10 @@ void check_opt1_extension(cl_extensions_t *extensions)
   {
     if (id == EXT_ID(khr_icd))
       extensions->extensions[id].base.ext_enabled = 1;
+#if  LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 5
     if (id == EXT_ID(khr_spir))
       extensions->extensions[id].base.ext_enabled = 1;
+#endif
   }
 }
 
-- 
1.9.1


From xionghu.luo at intel.com  Fri Mar 13 01:32:41 2015
From: xionghu.luo at intel.com (xionghu.luo at intel.com)
Date: Fri, 13 Mar 2015 16:32:41 +0800
Subject: [Beignet] [PATCH V2 3/3] simple return if spir extension not
	supported.
In-Reply-To: <1426235561-21847-1-git-send-email-xionghu.luo@intel.com>
References: <1426235561-21847-1-git-send-email-xionghu.luo@intel.com>
Message-ID: <1426235561-21847-3-git-send-email-xionghu.luo@intel.com>

From: Luo Xionghu <xionghu.luo at intel.com>

Signed-off-by: Luo Xionghu <xionghu.luo at intel.com>
---
 utests/load_program_from_spir.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/utests/load_program_from_spir.cpp b/utests/load_program_from_spir.cpp
index 3e4534c..f03e7c2 100644
--- a/utests/load_program_from_spir.cpp
+++ b/utests/load_program_from_spir.cpp
@@ -21,7 +21,7 @@ static void test_load_program_from_spir(void)
       extensionStr = std::string(&param_value.front(), param_value_size-1);
 
     if (!std::strstr(extensionStr.c_str(), "cl_khr_spir")) {
-      OCL_ASSERT(0);
+      return;
     }
 
     const size_t n = 16;
-- 
1.9.1


From daniel at ffwll.ch  Fri Mar 13 01:58:37 2015
From: daniel at ffwll.ch (Daniel Vetter)
Date: Fri, 13 Mar 2015 09:58:37 +0100
Subject: [Beignet] [PATCH i-g-t v3] tests/core_getparams: Create new
 test core_getparams
In-Reply-To: <1426206385-4251-1-git-send-email-jeff.mcgee@intel.com>
References: <1426192735-3306-1-git-send-email-jeff.mcgee@intel.com>
 <1426206385-4251-1-git-send-email-jeff.mcgee@intel.com>
Message-ID: <20150313085837.GY3800@phenom.ffwll.local>

On Thu, Mar 12, 2015 at 05:26:25PM -0700, jeff.mcgee at intel.com wrote:
> From: Jeff McGee <jeff.mcgee at intel.com>
> 
> New test core_getparams consists of 2 subtests, each one testing
> the ability of userspace to query the correct value of a GT config
> attribute: subslice total or EU total. drm/i915 implementation of
> these queries is required for Cherryview and Gen9+ devices (non-
> simulated).
> 
> v2: Duplicate small amount of new libdrm functionality to avoid
>     bumping libdrm version requirement (Daniel). Convert some
>     igt_asserts to the appropriate comparison variants. Add a
>     test description.
> v3: Actually use the LOCAL GETPARAM defines. Otherwise can't build
>     against older libdrm as intended by v2.
> 
> For: VIZ-4636
> Signed-off-by: Jeff McGee <jeff.mcgee at intel.com>
> ---
>  tests/.gitignore       |   1 +
>  tests/Makefile.sources |   1 +
>  tests/core_getparams.c | 167 +++++++++++++++++++++++++++++++++++++++++++++++++
>  3 files changed, 169 insertions(+)
>  create mode 100644 tests/core_getparams.c
> 
> diff --git a/tests/.gitignore b/tests/.gitignore
> index 426cc67..c742308 100644
> --- a/tests/.gitignore
> +++ b/tests/.gitignore
> @@ -1,6 +1,7 @@
>  # Please keep sorted alphabetically
>  core_get_client_auth
>  core_getclient
> +core_getparams
>  core_getstats
>  core_getversion
>  drm_import_export
> diff --git a/tests/Makefile.sources b/tests/Makefile.sources
> index 51e8376..999c8f8 100644
> --- a/tests/Makefile.sources
> +++ b/tests/Makefile.sources
> @@ -15,6 +15,7 @@ NOUVEAU_TESTS_M = \
>  
>  TESTS_progs_M = \
>  	core_get_client_auth \
> +	core_getparams \

Sorry I missed this little one: core_ is for drm core stuff shared by all
drivers (i.e. even those not supporting gem or kms). getparam is an i915
specific thing, so imo better to have drv_ as a prefix.

For our naming conventions and igt test sections see the docs at

http://people.freedesktop.org/~danvet/igt/

For review I think signing up someone from the beignet team might be best.
Cheers, Daniel

>  	drv_suspend \
>  	drv_hangman \
>  	gem_bad_reloc \
> diff --git a/tests/core_getparams.c b/tests/core_getparams.c
> new file mode 100644
> index 0000000..2855d06
> --- /dev/null
> +++ b/tests/core_getparams.c
> @@ -0,0 +1,167 @@
> +/*
> + * Copyright © 2014 Intel Corporation
> + *
> + * Permission is hereby granted, free of charge, to any person obtaining a
> + * copy of this software and associated documentation files (the "Software"),
> + * to deal in the Software without restriction, including without limitation
> + * the rights to use, copy, modify, merge, publish, distribute, sublicense,
> + * and/or sell copies of the Software, and to permit persons to whom the
> + * Software is furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice (including the next
> + * paragraph) shall be included in all copies or substantial portions of the
> + * Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
> + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
> + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
> + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
> + * IN THE SOFTWARE.
> + *
> + * Authors:
> + *    Jeff McGee <jeff.mcgee at intel.com>
> + *
> + */
> +
> +#include <unistd.h>
> +#include <errno.h>
> +#include <xf86drm.h>
> +#include <i915_drm.h>
> +#include "drmtest.h"
> +#include "intel_chipset.h"
> +#include "intel_bufmgr.h"
> +
> +IGT_TEST_DESCRIPTION("Tests the export of parameters via DRM_IOCTL_I915_GETPARAM\n");
> +
> +int drm_fd;
> +int devid;
> +
> +static void
> +init(void)
> +{
> +	drm_fd = drm_open_any();
> +	devid = intel_get_drm_devid(drm_fd);
> +}
> +
> +static void
> +deinit(void)
> +{
> +	close(drm_fd);
> +}
> +
> +#define LOCAL_I915_PARAM_SUBSLICE_TOTAL	33
> +#define LOCAL_I915_PARAM_EU_TOTAL	34
> +
> +static int
> +getparam(int param, int *value)
> +{
> +	drm_i915_getparam_t gp;
> +	int ret;
> +
> +	memset(&gp, 0, sizeof(gp));
> +	gp.value = value;
> +	gp.param = param;
> +	ret = drmIoctl(drm_fd, DRM_IOCTL_I915_GETPARAM, &gp);
> +	if (ret)
> +		return -errno;
> +
> +	return 0;
> +}
> +
> +static void
> +subslice_total(void)
> +{
> +	unsigned int subslice_total = 0;
> +	int ret;
> +
> +	ret = getparam(LOCAL_I915_PARAM_SUBSLICE_TOTAL, (int*)&subslice_total);
> +
> +	if (ret) {
> +		/*
> +		 * These devices are not required to implement the
> +		 * interface. If they do not, -ENODEV must be returned.
> +		*/
> +		if ((intel_gen(devid) < 8) ||
> +		    IS_BROADWELL(devid) ||
> +		    igt_run_in_simulation()) {
> +			igt_assert_eq(ret, -ENODEV);
> +			igt_info("subslice total: unknown\n");
> +		/*
> +		 * All other devices must implement the interface, so
> +		 * fail them if we are here.
> +		*/
> +		} else {
> +			igt_assert_neq(ret, EINVAL); /* request not recognized? */
> +			igt_assert_neq(ret, ENODEV); /* device not supported? */
> +			igt_assert_eq(ret, 0); /* other error? */
> +		}
> +	} else {
> +		/*
> +		 * On success, just make sure the returned count value is
> +		 * non-zero. The validity of the count value for the given
> +		 * device is not checked.
> +		*/
> +		igt_assert_neq(subslice_total, 0);
> +		igt_info("subslice total: %u\n", subslice_total);
> +	}
> +}
> +
> +static void
> +eu_total(void)
> +{
> +	unsigned int eu_total = 0;
> +	int ret;
> +
> +	ret = getparam(LOCAL_I915_PARAM_EU_TOTAL, (int*)&eu_total);
> +
> +	if (ret) {
> +		/*
> +		 * These devices are not required to implement the
> +		 * interface. If they do not, -ENODEV must be returned.
> +		*/
> +		if ((intel_gen(devid) < 8) ||
> +		    IS_BROADWELL(devid) ||
> +		    igt_run_in_simulation()) {
> +			igt_assert_eq(ret, -ENODEV);
> +			igt_info("EU total: unknown\n");
> +		/*
> +		 * All other devices must implement the interface, so
> +		 * fail them if we are here.
> +		*/
> +		} else {
> +			igt_assert_neq(ret, EINVAL); /* request not recognized? */
> +			igt_assert_neq(ret, ENODEV); /* device not supported? */
> +			igt_assert_eq(ret, 0); /* other error? */
> +		}
> +	} else {
> +		/*
> +		 * On success, just make sure the returned count value is
> +		 * non-zero. The validity of the count value for the given
> +		 * device is not checked.
> +		*/
> +		igt_assert_neq(eu_total, 0);
> +		igt_info("EU total: %u\n", eu_total);
> +	}
> +}
> +
> +static void
> +exit_handler(int sig)
> +{
> +	deinit();
> +}
> +
> +igt_main
> +{
> +	igt_fixture {
> +		igt_install_exit_handler(exit_handler);
> +		init();
> +	}
> +
> +	igt_subtest("subslice-total")
> +		subslice_total();
> +
> +	igt_subtest("eu-total")
> +		eu_total();
> +}
> -- 
> 2.3.0
> 
> _______________________________________________
> dri-devel mailing list
> dri-devel at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/dri-devel

-- 
Daniel Vetter
Software Engineer, Intel Corporation
+41 (0) 79 365 57 48 - http://blog.ffwll.ch

From chris at chris-wilson.co.uk  Fri Mar 13 02:18:31 2015
From: chris at chris-wilson.co.uk (Chris Wilson)
Date: Fri, 13 Mar 2015 09:18:31 +0000
Subject: [Beignet] [Intel-gfx] Preventing zero GPU virtual address
 allocation
In-Reply-To: <5502A9A0.1000209@intel.com>
References: <148B1B7A67D1C24B9EF0BE42EA4977062B7DCBFB@SHSMSX101.ccr.corp.intel.com>
 <20150305125251.GA18775@phenom.ffwll.local>
 <20150305130121.GA18784@nuc-i3427.alporthouse.com>
 <20150305152758.GI18775@phenom.ffwll.local>
 <DF876E69000F0E4DB19B760E3EBA5C7501C5F9C1@SHSMSX103.ccr.corp.intel.com>
 <20150306083919.GH18784@nuc-i3427.alporthouse.com>
 <DF876E69000F0E4DB19B760E3EBA5C7501C607BC@SHSMSX103.ccr.corp.intel.com>
 <20150309120218.GD23680@nuc-i3427.alporthouse.com>
 <5502A9A0.1000209@intel.com>
Message-ID: <20150313091831.GD24908@nuc-i3427.alporthouse.com>

On Fri, Mar 13, 2015 at 11:10:56AM +0200, David Weinehall wrote:
> On 2015-03-09 14:02, Chris Wilson wrote:
> >On Mon, Mar 09, 2015 at 02:34:46AM +0000, Zou, Nanhai wrote:
> >>We don't need MAP_FIXED, we just want to avoid address 0 to be allocated.
> >>
> >>Though I think using MAP_FIXED is overkill, will bring much unnecessary complexity on both kernel and beignet side.
> >>I don't mind if people can provide stable MAP_FIXED patches to resolve this problem a few months or years later.
> >>
> >>At that time, kernel driver can revert the reserve page 0 patch.
> >>Before that reserve page 0 can benefit all the Beignet user without breaking anything.
> >
> >The point is that is becomes ABI. So no the kernel can't just revert it.
> >There is nothing special about address 0 in ether GTT or virtual memory.
> >If you require a special object allocated at address 0, allocate a
> >special object at address 0.
> 
> I've explained the ABI issue in a separate e-mail discussion, and I
> believe that they now fully understand what you meant.
> 
> That said, their main chain of reasoning makes some sense -- there is a
> race condition if we rely on using MAP_FIXED, at least on systems that
> do not support ppgtt. Ending up in a situation where opencl
> applications work on other hw, but fails when run on an i915-system
> would, at least in my opinion, not be ideal, no matter if it's due
> to an unfortunate design.
> 
> *If* a MAP_FIXED solution is decided upon, how can userland be sure
> that the GTT page mapped to 0 is indeed usable as the NULL pointer?
> ON a PPGTT system that would be easy enough -- it's per process, so
> we'll be the only process allocating a page at 0, but if allocations
> use a global address space that won't be possible to guarantee.

It's simple, userspace has no control over allocation of the GGTT.
Especially something like address 0. They can make a request to have
their object at a certain location, but the kernel is entirely likely to
reject their request because it is pinned to hardware.

> I realise that the first submitted patch didn't cover the GTT case,
> since the first indication I got was that not only was there a
> special case for GTT to need page 0 for other things, but also that
> this was "good enough" for opencl, but it seems that a full solution
> would be needed.

Exactly.
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre

From daniel at ffwll.ch  Fri Mar 13 02:27:38 2015
From: daniel at ffwll.ch (Daniel Vetter)
Date: Fri, 13 Mar 2015 10:27:38 +0100
Subject: [Beignet] [Intel-gfx] Preventing zero GPU virtual address
 allocation
In-Reply-To: <5502A9A0.1000209@intel.com>
References: <148B1B7A67D1C24B9EF0BE42EA4977062B7DCBFB@SHSMSX101.ccr.corp.intel.com>
 <20150305125251.GA18775@phenom.ffwll.local>
 <20150305130121.GA18784@nuc-i3427.alporthouse.com>
 <20150305152758.GI18775@phenom.ffwll.local>
 <DF876E69000F0E4DB19B760E3EBA5C7501C5F9C1@SHSMSX103.ccr.corp.intel.com>
 <20150306083919.GH18784@nuc-i3427.alporthouse.com>
 <DF876E69000F0E4DB19B760E3EBA5C7501C607BC@SHSMSX103.ccr.corp.intel.com>
 <20150309120218.GD23680@nuc-i3427.alporthouse.com>
 <5502A9A0.1000209@intel.com>
Message-ID: <20150313092738.GD3800@phenom.ffwll.local>

On Fri, Mar 13, 2015 at 11:10:56AM +0200, David Weinehall wrote:
> On 2015-03-09 14:02, Chris Wilson wrote:
> >On Mon, Mar 09, 2015 at 02:34:46AM +0000, Zou, Nanhai wrote:
> >>We don't need MAP_FIXED, we just want to avoid address 0 to be allocated.
> >>
> >>Though I think using MAP_FIXED is overkill, will bring much unnecessary complexity on both kernel and beignet side.
> >>I don't mind if people can provide stable MAP_FIXED patches to resolve this problem a few months or years later.
> >>
> >>At that time, kernel driver can revert the reserve page 0 patch.
> >>Before that reserve page 0 can benefit all the Beignet user without breaking anything.
> >
> >The point is that is becomes ABI. So no the kernel can't just revert it.
> >There is nothing special about address 0 in ether GTT or virtual memory.
> >If you require a special object allocated at address 0, allocate a
> >special object at address 0.
> 
> I've explained the ABI issue in a separate e-mail discussion, and I believe
> that they now fully understand what you meant.
> 
> That said, their main chain of reasoning makes some sense -- there is a
> race condition if we rely on using MAP_FIXED, at least on systems that
> do not support ppgtt. Ending up in a situation where opencl applications
> work on other hw, but fails when run on an i915-system would, at least in my
> opinion, not be ideal, no matter if it's due to an unfortunate design.
> 
> *If* a MAP_FIXED solution is decided upon, how can userland be sure that the
> GTT page mapped to 0 is indeed usable as the NULL pointer?
> ON a PPGTT system that would be easy enough -- it's per process, so we'll be
> the only process allocating a page at 0, but if allocations
> use a global address space that won't be possible to guarantee.
> 
> I realise that the first submitted patch didn't cover the GTT case, since
> the first indication I got was that not only was there a special case for
> GTT to need page 0 for other things, but also that this was "good enough"
> for opencl, but it seems that a full solution would be needed.
> 
> Since this is a memory area we're talking about it's not uncommon to have
> the 0th page represent the NULL pointer and impossible for applications to
> reserve, so it would hardly be an unusual and inexplicable solution.
> 
> All this said, how do "the other two" (NVidia, ATI) deal with this?
> Implicit NULL-page, explicit MAP_FIXED allocation, or something else?

Afaik there's no opensource ocl implementation yet that needs this. So no
idea what they do.

If supporting systems without full ppgtt is a requirement for you (still
wonky on gen8 a bit, so might be a good strategy) then imo it's the
PIN_BIAS idea I've laid out earlier in this thread. That one will work
everywhere. softpin can unexpectedly fail without full ppgtt if the kernel
decides to put something at a given spot, which imo means we should only
expose it on full ppgtt systems.

And PIN_BIAS should be fairly easy to wire up since the internal logic is
all there already. So "just" needs an execbuf flag, igt test and
appropriate userspace to set that new bit.

> Kind regards, David
> ---------------------------------------------------------------------
> Intel Finland Oy
> Registered Address: PL 281, 00181 Helsinki Business Identity Code: 0357606 -
> 4 Domiciled in Helsinki
> 
> This e-mail and any attachments may contain confidential material for
> the sole use of the intended recipient(s). Any review or distribution
> by others is strictly prohibited. If you are not the intended
> recipient, please contact the sender and delete all copies.

You want to remove this when submitting to public mailing lists. Either
apply for an exception or grab a 2nd mail address for mailing list
communication.
-Daniel
-- 
Daniel Vetter
Software Engineer, Intel Corporation
+41 (0) 79 365 57 48 - http://blog.ffwll.ch

From zhigang.gong at linux.intel.com  Fri Mar 13 02:09:46 2015
From: zhigang.gong at linux.intel.com (Zhigang Gong)
Date: Fri, 13 Mar 2015 17:09:46 +0800
Subject: [Beignet] [Intel-gfx] [PATCH i-g-t v3] tests/core_getparams:
 Create new test core_getparams
In-Reply-To: <1426206385-4251-1-git-send-email-jeff.mcgee@intel.com>
References: <1426192735-3306-1-git-send-email-jeff.mcgee@intel.com>
 <1426206385-4251-1-git-send-email-jeff.mcgee@intel.com>
Message-ID: <20150313090945.GD21732@ivb-gt2-rev4>

My only concern is about the following macros:

> +#define LOCAL_I915_PARAM_SUBSLICE_TOTAL      33
> +#define LOCAL_I915_PARAM_EU_TOTAL    34

How about just using the definitions in the kernel header file?
For example:

  #include <drm/i915_drm.h>

  #ifdef LOCAL_I915_PARAM_SUBSLICE_TOTAL
  //Put all the code into this block.
  #endif

Then we can avoid putting the same definitions in different files,
and we can avoid unnecessary testing on an old kernel which doesn't
have this kernel interface.

For all the other part, it LGTM.

Reviewed-by: Zhigang Gong <zhigang.gong at linux.intel.com>

Thanks,
Zhigang Gong.

On Thu, Mar 12, 2015 at 05:26:25PM -0700, jeff.mcgee at intel.com wrote:
> From: Jeff McGee <jeff.mcgee at intel.com>
> 
> New test core_getparams consists of 2 subtests, each one testing
> the ability of userspace to query the correct value of a GT config
> attribute: subslice total or EU total. drm/i915 implementation of
> these queries is required for Cherryview and Gen9+ devices (non-
> simulated).
> 
> v2: Duplicate small amount of new libdrm functionality to avoid
>     bumping libdrm version requirement (Daniel). Convert some
>     igt_asserts to the appropriate comparison variants. Add a
>     test description.
> v3: Actually use the LOCAL GETPARAM defines. Otherwise can't build
>     against older libdrm as intended by v2.
> 
> For: VIZ-4636
> Signed-off-by: Jeff McGee <jeff.mcgee at intel.com>
> ---
>  tests/.gitignore       |   1 +
>  tests/Makefile.sources |   1 +
>  tests/core_getparams.c | 167 +++++++++++++++++++++++++++++++++++++++++++++++++
>  3 files changed, 169 insertions(+)
>  create mode 100644 tests/core_getparams.c
> 
> diff --git a/tests/.gitignore b/tests/.gitignore
> index 426cc67..c742308 100644
> --- a/tests/.gitignore
> +++ b/tests/.gitignore
> @@ -1,6 +1,7 @@
>  # Please keep sorted alphabetically
>  core_get_client_auth
>  core_getclient
> +core_getparams
>  core_getstats
>  core_getversion
>  drm_import_export
> diff --git a/tests/Makefile.sources b/tests/Makefile.sources
> index 51e8376..999c8f8 100644
> --- a/tests/Makefile.sources
> +++ b/tests/Makefile.sources
> @@ -15,6 +15,7 @@ NOUVEAU_TESTS_M = \
>  
>  TESTS_progs_M = \
>  	core_get_client_auth \
> +	core_getparams \
>  	drv_suspend \
>  	drv_hangman \
>  	gem_bad_reloc \
> diff --git a/tests/core_getparams.c b/tests/core_getparams.c
> new file mode 100644
> index 0000000..2855d06
> --- /dev/null
> +++ b/tests/core_getparams.c
> @@ -0,0 +1,167 @@
> +/*
> + * Copyright © 2014 Intel Corporation
> + *
> + * Permission is hereby granted, free of charge, to any person obtaining a
> + * copy of this software and associated documentation files (the "Software"),
> + * to deal in the Software without restriction, including without limitation
> + * the rights to use, copy, modify, merge, publish, distribute, sublicense,
> + * and/or sell copies of the Software, and to permit persons to whom the
> + * Software is furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice (including the next
> + * paragraph) shall be included in all copies or substantial portions of the
> + * Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
> + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
> + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
> + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
> + * IN THE SOFTWARE.
> + *
> + * Authors:
> + *    Jeff McGee <jeff.mcgee at intel.com>
> + *
> + */
> +
> +#include <unistd.h>
> +#include <errno.h>
> +#include <xf86drm.h>
> +#include <i915_drm.h>
> +#include "drmtest.h"
> +#include "intel_chipset.h"
> +#include "intel_bufmgr.h"
> +
> +IGT_TEST_DESCRIPTION("Tests the export of parameters via DRM_IOCTL_I915_GETPARAM\n");
> +
> +int drm_fd;
> +int devid;
> +
> +static void
> +init(void)
> +{
> +	drm_fd = drm_open_any();
> +	devid = intel_get_drm_devid(drm_fd);
> +}
> +
> +static void
> +deinit(void)
> +{
> +	close(drm_fd);
> +}
> +
> +#define LOCAL_I915_PARAM_SUBSLICE_TOTAL	33
> +#define LOCAL_I915_PARAM_EU_TOTAL	34
> +
> +static int
> +getparam(int param, int *value)
> +{
> +	drm_i915_getparam_t gp;
> +	int ret;
> +
> +	memset(&gp, 0, sizeof(gp));
> +	gp.value = value;
> +	gp.param = param;
> +	ret = drmIoctl(drm_fd, DRM_IOCTL_I915_GETPARAM, &gp);
> +	if (ret)
> +		return -errno;
> +
> +	return 0;
> +}
> +
> +static void
> +subslice_total(void)
> +{
> +	unsigned int subslice_total = 0;
> +	int ret;
> +
> +	ret = getparam(LOCAL_I915_PARAM_SUBSLICE_TOTAL, (int*)&subslice_total);
> +
> +	if (ret) {
> +		/*
> +		 * These devices are not required to implement the
> +		 * interface. If they do not, -ENODEV must be returned.
> +		*/
> +		if ((intel_gen(devid) < 8) ||
> +		    IS_BROADWELL(devid) ||
> +		    igt_run_in_simulation()) {
> +			igt_assert_eq(ret, -ENODEV);
> +			igt_info("subslice total: unknown\n");
> +		/*
> +		 * All other devices must implement the interface, so
> +		 * fail them if we are here.
> +		*/
> +		} else {
> +			igt_assert_neq(ret, EINVAL); /* request not recognized? */
> +			igt_assert_neq(ret, ENODEV); /* device not supported? */
> +			igt_assert_eq(ret, 0); /* other error? */
> +		}
> +	} else {
> +		/*
> +		 * On success, just make sure the returned count value is
> +		 * non-zero. The validity of the count value for the given
> +		 * device is not checked.
> +		*/
> +		igt_assert_neq(subslice_total, 0);
> +		igt_info("subslice total: %u\n", subslice_total);
> +	}
> +}
> +
> +static void
> +eu_total(void)
> +{
> +	unsigned int eu_total = 0;
> +	int ret;
> +
> +	ret = getparam(LOCAL_I915_PARAM_EU_TOTAL, (int*)&eu_total);
> +
> +	if (ret) {
> +		/*
> +		 * These devices are not required to implement the
> +		 * interface. If they do not, -ENODEV must be returned.
> +		*/
> +		if ((intel_gen(devid) < 8) ||
> +		    IS_BROADWELL(devid) ||
> +		    igt_run_in_simulation()) {
> +			igt_assert_eq(ret, -ENODEV);
> +			igt_info("EU total: unknown\n");
> +		/*
> +		 * All other devices must implement the interface, so
> +		 * fail them if we are here.
> +		*/
> +		} else {
> +			igt_assert_neq(ret, EINVAL); /* request not recognized? */
> +			igt_assert_neq(ret, ENODEV); /* device not supported? */
> +			igt_assert_eq(ret, 0); /* other error? */
> +		}
> +	} else {
> +		/*
> +		 * On success, just make sure the returned count value is
> +		 * non-zero. The validity of the count value for the given
> +		 * device is not checked.
> +		*/
> +		igt_assert_neq(eu_total, 0);
> +		igt_info("EU total: %u\n", eu_total);
> +	}
> +}
> +
> +static void
> +exit_handler(int sig)
> +{
> +	deinit();
> +}
> +
> +igt_main
> +{
> +	igt_fixture {
> +		igt_install_exit_handler(exit_handler);
> +		init();
> +	}
> +
> +	igt_subtest("subslice-total")
> +		subslice_total();
> +
> +	igt_subtest("eu-total")
> +		eu_total();
> +}
> -- 
> 2.3.0
> 
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/intel-gfx

From daniel at ffwll.ch  Fri Mar 13 09:32:41 2015
From: daniel at ffwll.ch (Daniel Vetter)
Date: Fri, 13 Mar 2015 17:32:41 +0100
Subject: [Beignet] [Intel-gfx] [PATCH i-g-t v3] tests/core_getparams:
 Create new test core_getparams
In-Reply-To: <20150313090945.GD21732@ivb-gt2-rev4>
References: <1426192735-3306-1-git-send-email-jeff.mcgee@intel.com>
 <1426206385-4251-1-git-send-email-jeff.mcgee@intel.com>
 <20150313090945.GD21732@ivb-gt2-rev4>
Message-ID: <20150313163241.GF3800@phenom.ffwll.local>

On Fri, Mar 13, 2015 at 05:09:46PM +0800, Zhigang Gong wrote:
> My only concern is about the following macros:
> 
> > +#define LOCAL_I915_PARAM_SUBSLICE_TOTAL      33
> > +#define LOCAL_I915_PARAM_EU_TOTAL    34
> 
> How about to just use the definitons in the kernel header file?
> For an example:
> 
>   #include <drm/i915_drm.h>
> 
>   #ifdef LOCAL_I915_PARAM_SUBSLICE_TOTAL
>   //Put all the code into this block.
>   #endif
> 
> Then we can avoid put the same definitions in different files,
> and we can avoid unecessary testing on an old kernel which doesn't
> have this kernel interface.
> 
> For all the other part, it LGTM.
> 
> Reviewed-by: Zhigang Gong <zhigang.gong at linux.intel.com>

Once we update the libdrm requirements in igt we tend to go around and
replace all the now obsolete LOCAL_ defines. Imo not worth doing extra
work until then.

Patch applied, thanks.
-Daniel

> 
> Thanks,
> Zhigang Gong.
> 
> On Thu, Mar 12, 2015 at 05:26:25PM -0700, jeff.mcgee at intel.com wrote:
> > From: Jeff McGee <jeff.mcgee at intel.com>
> > 
> > New test core_getparams consists of 2 subtests, each one testing
> > the ability of userspace to query the correct value of a GT config
> > attribute: subslice total or EU total. drm/i915 implementation of
> > these queries is required for Cherryview and Gen9+ devices (non-
> > simulated).
> > 
> > v2: Duplicate small amount of new libdrm functionality to avoid
> >     bumping libdrm version requirement (Daniel). Convert some
> >     igt_asserts to the appropriate comparison variants. Add a
> >     test description.
> > v3: Actually use the LOCAL GETPARAM defines. Otherwise can't build
> >     against older libdrm as intended by v2.
> > 
> > For: VIZ-4636
> > Signed-off-by: Jeff McGee <jeff.mcgee at intel.com>
> > ---
> >  tests/.gitignore       |   1 +
> >  tests/Makefile.sources |   1 +
> >  tests/core_getparams.c | 167 +++++++++++++++++++++++++++++++++++++++++++++++++
> >  3 files changed, 169 insertions(+)
> >  create mode 100644 tests/core_getparams.c
> > 
> > diff --git a/tests/.gitignore b/tests/.gitignore
> > index 426cc67..c742308 100644
> > --- a/tests/.gitignore
> > +++ b/tests/.gitignore
> > @@ -1,6 +1,7 @@
> >  # Please keep sorted alphabetically
> >  core_get_client_auth
> >  core_getclient
> > +core_getparams
> >  core_getstats
> >  core_getversion
> >  drm_import_export
> > diff --git a/tests/Makefile.sources b/tests/Makefile.sources
> > index 51e8376..999c8f8 100644
> > --- a/tests/Makefile.sources
> > +++ b/tests/Makefile.sources
> > @@ -15,6 +15,7 @@ NOUVEAU_TESTS_M = \
> >  
> >  TESTS_progs_M = \
> >  	core_get_client_auth \
> > +	core_getparams \
> >  	drv_suspend \
> >  	drv_hangman \
> >  	gem_bad_reloc \
> > diff --git a/tests/core_getparams.c b/tests/core_getparams.c
> > new file mode 100644
> > index 0000000..2855d06
> > --- /dev/null
> > +++ b/tests/core_getparams.c
> > @@ -0,0 +1,167 @@
> > +/*
> > + * Copyright © 2014 Intel Corporation
> > + *
> > + * Permission is hereby granted, free of charge, to any person obtaining a
> > + * copy of this software and associated documentation files (the "Software"),
> > + * to deal in the Software without restriction, including without limitation
> > + * the rights to use, copy, modify, merge, publish, distribute, sublicense,
> > + * and/or sell copies of the Software, and to permit persons to whom the
> > + * Software is furnished to do so, subject to the following conditions:
> > + *
> > + * The above copyright notice and this permission notice (including the next
> > + * paragraph) shall be included in all copies or substantial portions of the
> > + * Software.
> > + *
> > + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> > + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> > + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
> > + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
> > + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
> > + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
> > + * IN THE SOFTWARE.
> > + *
> > + * Authors:
> > + *    Jeff McGee <jeff.mcgee at intel.com>
> > + *
> > + */
> > +
> > +#include <unistd.h>
> > +#include <errno.h>
> > +#include <xf86drm.h>
> > +#include <i915_drm.h>
> > +#include "drmtest.h"
> > +#include "intel_chipset.h"
> > +#include "intel_bufmgr.h"
> > +
> > +IGT_TEST_DESCRIPTION("Tests the export of parameters via DRM_IOCTL_I915_GETPARAM\n");
> > +
> > +int drm_fd;
> > +int devid;
> > +
> > +static void
> > +init(void)
> > +{
> > +	drm_fd = drm_open_any();
> > +	devid = intel_get_drm_devid(drm_fd);
> > +}
> > +
> > +static void
> > +deinit(void)
> > +{
> > +	close(drm_fd);
> > +}
> > +
> > +#define LOCAL_I915_PARAM_SUBSLICE_TOTAL	33
> > +#define LOCAL_I915_PARAM_EU_TOTAL	34
> > +
> > +static int
> > +getparam(int param, int *value)
> > +{
> > +	drm_i915_getparam_t gp;
> > +	int ret;
> > +
> > +	memset(&gp, 0, sizeof(gp));
> > +	gp.value = value;
> > +	gp.param = param;
> > +	ret = drmIoctl(drm_fd, DRM_IOCTL_I915_GETPARAM, &gp);
> > +	if (ret)
> > +		return -errno;
> > +
> > +	return 0;
> > +}
> > +
> > +static void
> > +subslice_total(void)
> > +{
> > +	unsigned int subslice_total = 0;
> > +	int ret;
> > +
> > +	ret = getparam(LOCAL_I915_PARAM_SUBSLICE_TOTAL, (int*)&subslice_total);
> > +
> > +	if (ret) {
> > +		/*
> > +		 * These devices are not required to implement the
> > +		 * interface. If they do not, -ENODEV must be returned.
> > +		*/
> > +		if ((intel_gen(devid) < 8) ||
> > +		    IS_BROADWELL(devid) ||
> > +		    igt_run_in_simulation()) {
> > +			igt_assert_eq(ret, -ENODEV);
> > +			igt_info("subslice total: unknown\n");
> > +		/*
> > +		 * All other devices must implement the interface, so
> > +		 * fail them if we are here.
> > +		*/
> > +		} else {
> > +			igt_assert_neq(ret, EINVAL); /* request not recognized? */
> > +			igt_assert_neq(ret, ENODEV); /* device not supported? */
> > +			igt_assert_eq(ret, 0); /* other error? */
> > +		}
> > +	} else {
> > +		/*
> > +		 * On success, just make sure the returned count value is
> > +		 * non-zero. The validity of the count value for the given
> > +		 * device is not checked.
> > +		*/
> > +		igt_assert_neq(subslice_total, 0);
> > +		igt_info("subslice total: %u\n", subslice_total);
> > +	}
> > +}
> > +
> > +static void
> > +eu_total(void)
> > +{
> > +	unsigned int eu_total = 0;
> > +	int ret;
> > +
> > +	ret = getparam(LOCAL_I915_PARAM_EU_TOTAL, (int*)&eu_total);
> > +
> > +	if (ret) {
> > +		/*
> > +		 * These devices are not required to implement the
> > +		 * interface. If they do not, -ENODEV must be returned.
> > +		*/
> > +		if ((intel_gen(devid) < 8) ||
> > +		    IS_BROADWELL(devid) ||
> > +		    igt_run_in_simulation()) {
> > +			igt_assert_eq(ret, -ENODEV);
> > +			igt_info("EU total: unknown\n");
> > +		/*
> > +		 * All other devices must implement the interface, so
> > +		 * fail them if we are here.
> > +		*/
> > +		} else {
> > +			igt_assert_neq(ret, EINVAL); /* request not recognized? */
> > +			igt_assert_neq(ret, ENODEV); /* device not supported? */
> > +			igt_assert_eq(ret, 0); /* other error? */
> > +		}
> > +	} else {
> > +		/*
> > +		 * On success, just make sure the returned count value is
> > +		 * non-zero. The validity of the count value for the given
> > +		 * device is not checked.
> > +		*/
> > +		igt_assert_neq(eu_total, 0);
> > +		igt_info("EU total: %u\n", eu_total);
> > +	}
> > +}
> > +
> > +static void
> > +exit_handler(int sig)
> > +{
> > +	deinit();
> > +}
> > +
> > +igt_main
> > +{
> > +	igt_fixture {
> > +		igt_install_exit_handler(exit_handler);
> > +		init();
> > +	}
> > +
> > +	igt_subtest("subslice-total")
> > +		subslice_total();
> > +
> > +	igt_subtest("eu-total")
> > +		eu_total();
> > +}
> > -- 
> > 2.3.0
> > 
> > _______________________________________________
> > Intel-gfx mailing list
> > Intel-gfx at lists.freedesktop.org
> > http://lists.freedesktop.org/mailman/listinfo/intel-gfx
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/intel-gfx

-- 
Daniel Vetter
Software Engineer, Intel Corporation
+41 (0) 79 365 57 48 - http://blog.ffwll.ch

From david.weinehall at intel.com  Fri Mar 13 02:10:56 2015
From: david.weinehall at intel.com (David Weinehall)
Date: Fri, 13 Mar 2015 11:10:56 +0200
Subject: [Beignet] [Intel-gfx] Preventing zero GPU virtual address
	allocation
In-Reply-To: <20150309120218.GD23680@nuc-i3427.alporthouse.com>
References: <148B1B7A67D1C24B9EF0BE42EA4977062B7DCBFB@SHSMSX101.ccr.corp.intel.com>
 <20150305125251.GA18775@phenom.ffwll.local>
 <20150305130121.GA18784@nuc-i3427.alporthouse.com>
 <20150305152758.GI18775@phenom.ffwll.local>
 <DF876E69000F0E4DB19B760E3EBA5C7501C5F9C1@SHSMSX103.ccr.corp.intel.com>
 <20150306083919.GH18784@nuc-i3427.alporthouse.com>
 <DF876E69000F0E4DB19B760E3EBA5C7501C607BC@SHSMSX103.ccr.corp.intel.com>
 <20150309120218.GD23680@nuc-i3427.alporthouse.com>
Message-ID: <5502A9A0.1000209@intel.com>

On 2015-03-09 14:02, Chris Wilson wrote:
> On Mon, Mar 09, 2015 at 02:34:46AM +0000, Zou, Nanhai wrote:
>> We don't need MAP_FIXED, we just want to avoid address 0 to be allocated.
>>
>> Though I think using MAP_FIXED is overkill, will bring much unnecessary complexity on both kernel and beignet side.
>> I don't mind if people can provide stable MAP_FIXED patches to resolve this problem a few months or years later.
>>
>> At that time, kernel driver can revert the reserve page 0 patch.
>> Before that reserve page 0 can benefit all the Beignet user without breaking anything.
>
> The point is that it becomes ABI. So no, the kernel can't just revert it.
> There is nothing special about address 0 in either GTT or virtual memory.
> If you require a special object allocated at address 0, allocate a
> special object at address 0.

I've explained the ABI issue in a separate e-mail discussion, and I 
believe that they now fully understand what you meant.

That said, their main chain of reasoning makes some sense -- there is a
race condition if we rely on using MAP_FIXED, at least on systems that
do not support ppgtt. Ending up in a situation where opencl applications 
work on other hw, but fail when run on an i915-system would, at least 
in my opinion, not be ideal, no matter if it's due to an unfortunate design.

*If* a MAP_FIXED solution is decided upon, how can userland be sure that 
the GTT page mapped to 0 is indeed usable as the NULL pointer?
On a PPGTT system that would be easy enough -- it's per process, so 
we'll be the only process allocating a page at 0, but if allocations
use a global address space that won't be possible to guarantee.

I realise that the first submitted patch didn't cover the GTT case, 
since the first indication I got was that not only was there a special 
case for GTT to need page 0 for other things, but also that this was 
"good enough" for opencl, but it seems that a full solution would be needed.

Since this is a memory area we're talking about it's not uncommon to 
have the 0th page represent the NULL pointer and impossible for 
applications to reserve, so it would hardly be an unusual and 
inexplicable solution.

All this said, how do "the other two" (NVidia, ATI) deal with this?
Implicit NULL-page, explicit MAP_FIXED allocation, or something else?


Kind regards, David
---------------------------------------------------------------------
Intel Finland Oy
Registered Address: PL 281, 00181 Helsinki 
Business Identity Code: 0357606 - 4 
Domiciled in Helsinki 

This e-mail and any attachments may contain confidential material for
the sole use of the intended recipient(s). Any review or distribution
by others is strictly prohibited. If you are not the intended
recipient, please contact the sender and delete all copies.


From jeff.mcgee at intel.com  Fri Mar 13 09:51:57 2015
From: jeff.mcgee at intel.com (Jeff McGee)
Date: Fri, 13 Mar 2015 09:51:57 -0700
Subject: [Beignet] [Intel-gfx] [PATCH i-g-t v3] tests/core_getparams:
 Create new test core_getparams
In-Reply-To: <20150313163241.GF3800@phenom.ffwll.local>
References: <1426192735-3306-1-git-send-email-jeff.mcgee@intel.com>
 <1426206385-4251-1-git-send-email-jeff.mcgee@intel.com>
 <20150313090945.GD21732@ivb-gt2-rev4>
 <20150313163241.GF3800@phenom.ffwll.local>
Message-ID: <20150313165156.GL3263@jeffdesk>

On Fri, Mar 13, 2015 at 05:32:41PM +0100, Daniel Vetter wrote:
> On Fri, Mar 13, 2015 at 05:09:46PM +0800, Zhigang Gong wrote:
> > My only concern is about the following macros:
> > 
> > > +#define LOCAL_I915_PARAM_SUBSLICE_TOTAL      33
> > > +#define LOCAL_I915_PARAM_EU_TOTAL    34
> > 
> > How about to just use the definitons in the kernel header file?
> > For an example:
> > 
> >   #include <drm/i915_drm.h>
> > 
> >   #ifdef LOCAL_I915_PARAM_SUBSLICE_TOTAL
> >   //Put all the code into this block.
> >   #endif
> > 
> > Then we can avoid put the same definitions in different files,
> > and we can avoid unecessary testing on an old kernel which doesn't
> > have this kernel interface.
> > 
> > For all the other part, it LGTM.
> > 
> > Reviewed-by: Zhigang Gong <zhigang.gong at linux.intel.com>
> 
> Once we update the libdrm requirements in igt we tend to go around and
> replace all the now obsolete LOCAL_ defines. Imo not worth doing extra
> work until then.
> 
> Patch applied, thanks.
> -Daniel
> 

Patch applied? Do you want me to make the name change first? Should the
kernel part be reviewed and merged first?
-Jeff

> > 
> > Thanks,
> > Zhigang Gong.
> > 
> > On Thu, Mar 12, 2015 at 05:26:25PM -0700, jeff.mcgee at intel.com wrote:
> > > From: Jeff McGee <jeff.mcgee at intel.com>
> > > 
> > > New test core_getparams consists of 2 subtests, each one testing
> > > the ability of userspace to query the correct value of a GT config
> > > attribute: subslice total or EU total. drm/i915 implementation of
> > > these queries is required for Cherryview and Gen9+ devices (non-
> > > simulated).
> > > 
> > > v2: Duplicate small amount of new libdrm functionality to avoid
> > >     bumping libdrm version requirement (Daniel). Convert some
> > >     igt_asserts to the appropriate comparison variants. Add a
> > >     test description.
> > > v3: Actually use the LOCAL GETPARAM defines. Otherwise can't build
> > >     against older libdrm as intended by v2.
> > > 
> > > For: VIZ-4636
> > > Signed-off-by: Jeff McGee <jeff.mcgee at intel.com>
> > > ---
> > >  tests/.gitignore       |   1 +
> > >  tests/Makefile.sources |   1 +
> > >  tests/core_getparams.c | 167 +++++++++++++++++++++++++++++++++++++++++++++++++
> > >  3 files changed, 169 insertions(+)
> > >  create mode 100644 tests/core_getparams.c
> > > 
> > > diff --git a/tests/.gitignore b/tests/.gitignore
> > > index 426cc67..c742308 100644
> > > --- a/tests/.gitignore
> > > +++ b/tests/.gitignore
> > > @@ -1,6 +1,7 @@
> > >  # Please keep sorted alphabetically
> > >  core_get_client_auth
> > >  core_getclient
> > > +core_getparams
> > >  core_getstats
> > >  core_getversion
> > >  drm_import_export
> > > diff --git a/tests/Makefile.sources b/tests/Makefile.sources
> > > index 51e8376..999c8f8 100644
> > > --- a/tests/Makefile.sources
> > > +++ b/tests/Makefile.sources
> > > @@ -15,6 +15,7 @@ NOUVEAU_TESTS_M = \
> > >  
> > >  TESTS_progs_M = \
> > >  	core_get_client_auth \
> > > +	core_getparams \
> > >  	drv_suspend \
> > >  	drv_hangman \
> > >  	gem_bad_reloc \
> > > diff --git a/tests/core_getparams.c b/tests/core_getparams.c
> > > new file mode 100644
> > > index 0000000..2855d06
> > > --- /dev/null
> > > +++ b/tests/core_getparams.c
> > > @@ -0,0 +1,167 @@
> > > +/*
> > > + * Copyright © 2014 Intel Corporation
> > > + *
> > > + * Permission is hereby granted, free of charge, to any person obtaining a
> > > + * copy of this software and associated documentation files (the "Software"),
> > > + * to deal in the Software without restriction, including without limitation
> > > + * the rights to use, copy, modify, merge, publish, distribute, sublicense,
> > > + * and/or sell copies of the Software, and to permit persons to whom the
> > > + * Software is furnished to do so, subject to the following conditions:
> > > + *
> > > + * The above copyright notice and this permission notice (including the next
> > > + * paragraph) shall be included in all copies or substantial portions of the
> > > + * Software.
> > > + *
> > > + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> > > + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> > > + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
> > > + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
> > > + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
> > > + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
> > > + * IN THE SOFTWARE.
> > > + *
> > > + * Authors:
> > > + *    Jeff McGee <jeff.mcgee at intel.com>
> > > + *
> > > + */
> > > +
> > > +#include <unistd.h>
> > > +#include <errno.h>
> > > +#include <xf86drm.h>
> > > +#include <i915_drm.h>
> > > +#include "drmtest.h"
> > > +#include "intel_chipset.h"
> > > +#include "intel_bufmgr.h"
> > > +
> > > +IGT_TEST_DESCRIPTION("Tests the export of parameters via DRM_IOCTL_I915_GETPARAM\n");
> > > +
> > > +int drm_fd;
> > > +int devid;
> > > +
> > > +static void
> > > +init(void)
> > > +{
> > > +	drm_fd = drm_open_any();
> > > +	devid = intel_get_drm_devid(drm_fd);
> > > +}
> > > +
> > > +static void
> > > +deinit(void)
> > > +{
> > > +	close(drm_fd);
> > > +}
> > > +
> > > +#define LOCAL_I915_PARAM_SUBSLICE_TOTAL	33
> > > +#define LOCAL_I915_PARAM_EU_TOTAL	34
> > > +
> > > +static int
> > > +getparam(int param, int *value)
> > > +{
> > > +	drm_i915_getparam_t gp;
> > > +	int ret;
> > > +
> > > +	memset(&gp, 0, sizeof(gp));
> > > +	gp.value = value;
> > > +	gp.param = param;
> > > +	ret = drmIoctl(drm_fd, DRM_IOCTL_I915_GETPARAM, &gp);
> > > +	if (ret)
> > > +		return -errno;
> > > +
> > > +	return 0;
> > > +}
> > > +
> > > +static void
> > > +subslice_total(void)
> > > +{
> > > +	unsigned int subslice_total = 0;
> > > +	int ret;
> > > +
> > > +	ret = getparam(LOCAL_I915_PARAM_SUBSLICE_TOTAL, (int*)&subslice_total);
> > > +
> > > +	if (ret) {
> > > +		/*
> > > +		 * These devices are not required to implement the
> > > +		 * interface. If they do not, -ENODEV must be returned.
> > > +		*/
> > > +		if ((intel_gen(devid) < 8) ||
> > > +		    IS_BROADWELL(devid) ||
> > > +		    igt_run_in_simulation()) {
> > > +			igt_assert_eq(ret, -ENODEV);
> > > +			igt_info("subslice total: unknown\n");
> > > +		/*
> > > +		 * All other devices must implement the interface, so
> > > +		 * fail them if we are here.
> > > +		*/
> > > +		} else {
> > > +			igt_assert_neq(ret, EINVAL); /* request not recognized? */
> > > +			igt_assert_neq(ret, ENODEV); /* device not supported? */
> > > +			igt_assert_eq(ret, 0); /* other error? */
> > > +		}
> > > +	} else {
> > > +		/*
> > > +		 * On success, just make sure the returned count value is
> > > +		 * non-zero. The validity of the count value for the given
> > > +		 * device is not checked.
> > > +		*/
> > > +		igt_assert_neq(subslice_total, 0);
> > > +		igt_info("subslice total: %u\n", subslice_total);
> > > +	}
> > > +}
> > > +
> > > +static void
> > > +eu_total(void)
> > > +{
> > > +	unsigned int eu_total = 0;
> > > +	int ret;
> > > +
> > > +	ret = getparam(LOCAL_I915_PARAM_EU_TOTAL, (int*)&eu_total);
> > > +
> > > +	if (ret) {
> > > +		/*
> > > +		 * These devices are not required to implement the
> > > +		 * interface. If they do not, -ENODEV must be returned.
> > > +		*/
> > > +		if ((intel_gen(devid) < 8) ||
> > > +		    IS_BROADWELL(devid) ||
> > > +		    igt_run_in_simulation()) {
> > > +			igt_assert_eq(ret, -ENODEV);
> > > +			igt_info("EU total: unknown\n");
> > > +		/*
> > > +		 * All other devices must implement the interface, so
> > > +		 * fail them if we are here.
> > > +		*/
> > > +		} else {
> > > +			igt_assert_neq(ret, EINVAL); /* request not recognized? */
> > > +			igt_assert_neq(ret, ENODEV); /* device not supported? */
> > > +			igt_assert_eq(ret, 0); /* other error? */
> > > +		}
> > > +	} else {
> > > +		/*
> > > +		 * On success, just make sure the returned count value is
> > > +		 * non-zero. The validity of the count value for the given
> > > +		 * device is not checked.
> > > +		*/
> > > +		igt_assert_neq(eu_total, 0);
> > > +		igt_info("EU total: %u\n", eu_total);
> > > +	}
> > > +}
> > > +
> > > +static void
> > > +exit_handler(int sig)
> > > +{
> > > +	deinit();
> > > +}
> > > +
> > > +igt_main
> > > +{
> > > +	igt_fixture {
> > > +		igt_install_exit_handler(exit_handler);
> > > +		init();
> > > +	}
> > > +
> > > +	igt_subtest("subslice-total")
> > > +		subslice_total();
> > > +
> > > +	igt_subtest("eu-total")
> > > +		eu_total();
> > > +}
> > > -- 
> > > 2.3.0
> > > 
> > > _______________________________________________
> > > Intel-gfx mailing list
> > > Intel-gfx at lists.freedesktop.org
> > > http://lists.freedesktop.org/mailman/listinfo/intel-gfx
> > _______________________________________________
> > Intel-gfx mailing list
> > Intel-gfx at lists.freedesktop.org
> > http://lists.freedesktop.org/mailman/listinfo/intel-gfx
> 
> -- 
> Daniel Vetter
> Software Engineer, Intel Corporation
> +41 (0) 79 365 57 48 - http://blog.ffwll.ch
> _______________________________________________
> Beignet mailing list
> Beignet at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/beignet

From daniel at ffwll.ch  Fri Mar 13 09:59:13 2015
From: daniel at ffwll.ch (Daniel Vetter)
Date: Fri, 13 Mar 2015 17:59:13 +0100
Subject: [Beignet] [Intel-gfx] [PATCH i-g-t v3] tests/core_getparams:
 Create new test core_getparams
In-Reply-To: <20150313165156.GL3263@jeffdesk>
References: <1426192735-3306-1-git-send-email-jeff.mcgee@intel.com>
 <1426206385-4251-1-git-send-email-jeff.mcgee@intel.com>
 <20150313090945.GD21732@ivb-gt2-rev4>
 <20150313163241.GF3800@phenom.ffwll.local>
 <20150313165156.GL3263@jeffdesk>
Message-ID: <20150313165913.GI3800@phenom.ffwll.local>

On Fri, Mar 13, 2015 at 09:51:57AM -0700, Jeff McGee wrote:
> On Fri, Mar 13, 2015 at 05:32:41PM +0100, Daniel Vetter wrote:
> > On Fri, Mar 13, 2015 at 05:09:46PM +0800, Zhigang Gong wrote:
> > > My only concern is about the following macros:
> > > 
> > > > +#define LOCAL_I915_PARAM_SUBSLICE_TOTAL      33
> > > > +#define LOCAL_I915_PARAM_EU_TOTAL    34
> > > 
> > > How about to just use the definitons in the kernel header file?
> > > For an example:
> > > 
> > >   #include <drm/i915_drm.h>
> > > 
> > >   #ifdef LOCAL_I915_PARAM_SUBSLICE_TOTAL
> > >   //Put all the code into this block.
> > >   #endif
> > > 
> > > Then we can avoid put the same definitions in different files,
> > > and we can avoid unecessary testing on an old kernel which doesn't
> > > have this kernel interface.
> > > 
> > > For all the other part, it LGTM.
> > > 
> > > Reviewed-by: Zhigang Gong <zhigang.gong at linux.intel.com>
> > 
> > Once we update the libdrm requirements in igt we tend to go around and
> > replace all the now obsolete LOCAL_ defines. Imo not worth doing extra
> > work until then.
> > 
> > Patch applied, thanks.
> > -Daniel
> > 
> 
> Patch applied? Do you want me to make the name change first? Should the
> kernel part be reviewed and merged first?

Forgot my own review comment already ;-) Fixed up with a follow-up patch.
And I'll pull the kernel part in now too.
-Daniel
-- 
Daniel Vetter
Software Engineer, Intel Corporation
+41 (0) 79 365 57 48 - http://blog.ffwll.ch

From chris at chris-wilson.co.uk  Fri Mar 13 09:58:47 2015
From: chris at chris-wilson.co.uk (Chris Wilson)
Date: Fri, 13 Mar 2015 16:58:47 +0000
Subject: [Beignet] [Intel-gfx] Preventing zero GPU virtual address
 allocation
In-Reply-To: <20150313092738.GD3800@phenom.ffwll.local>
References: <148B1B7A67D1C24B9EF0BE42EA4977062B7DCBFB@SHSMSX101.ccr.corp.intel.com>
 <20150305125251.GA18775@phenom.ffwll.local>
 <20150305130121.GA18784@nuc-i3427.alporthouse.com>
 <20150305152758.GI18775@phenom.ffwll.local>
 <DF876E69000F0E4DB19B760E3EBA5C7501C5F9C1@SHSMSX103.ccr.corp.intel.com>
 <20150306083919.GH18784@nuc-i3427.alporthouse.com>
 <DF876E69000F0E4DB19B760E3EBA5C7501C607BC@SHSMSX103.ccr.corp.intel.com>
 <20150309120218.GD23680@nuc-i3427.alporthouse.com>
 <5502A9A0.1000209@intel.com>
 <20150313092738.GD3800@phenom.ffwll.local>
Message-ID: <20150313165847.GA31491@nuc-i3427.alporthouse.com>

On Fri, Mar 13, 2015 at 10:27:38AM +0100, Daniel Vetter wrote:
> If supporting systems without full ppgtt is a requirement for you (still
> wonky on gen8 a bit, so might be a good strategy) then imo it's the
> PIN_BIAS idea I've laid out earlier in this thread. That one will work
> everywhere. softpin can unexpectedly fail without full ppgtt if the kernel
> decides to put something at a given spot, which imo means we should only
> expose it on full ppgtt systems.
> 
> And PIN_BIAS should be fairly easy to wire up since the internal logic is
> all there already. So "just" needs an execbuf flag, igt test and
> appropriate userspace to set that new bit.

It doesn't though. To provide the guarantee userspace is asking for
(which is that address 0 goes to a special, preferably inaccessible,
page), you have to evict the first N pages in the GGTT. That is just as
likely to fail with an execbuffer flag as it would with an execobject flag.
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre

From daniel at ffwll.ch  Fri Mar 13 10:03:02 2015
From: daniel at ffwll.ch (Daniel Vetter)
Date: Fri, 13 Mar 2015 18:03:02 +0100
Subject: [Beignet] [PATCH] drm/i915: Export total subslice and EU counts
In-Reply-To: <016001d059fd$66ef9680$34cec380$@linux.intel.com>
References: <1425339452-18875-1-git-send-email-jeff.mcgee@intel.com>
 <20150305043555.GA20578@ivb-gt2-rev4>
 <20150306184418.GC3263@jeffdesk>
 <016001d059fd$66ef9680$34cec380$@linux.intel.com>
Message-ID: <20150313170302.GJ3800@phenom.ffwll.local>

On Mon, Mar 09, 2015 at 08:10:06AM +0800, Zhigang Gong wrote:
> > -----Original Message-----
> > From: Beignet [mailto:beignet-bounces at lists.freedesktop.org] On Behalf Of
> > Jeff McGee
> > Sent: Saturday, March 7, 2015 2:44 AM
> > To: Zhigang Gong
> > Cc: daniel at ffwll.ch; intel-gfx at lists.freedesktop.org;
> > beignet at lists.freedesktop.org; dri-devel at lists.freedesktop.org
> > Subject: Re: [Beignet] [PATCH] drm/i915: Export total subslice and EU counts
> > 
> > On Thu, Mar 05, 2015 at 12:35:55PM +0800, Zhigang Gong wrote:
> > > There is one minor conflict when apply the KMD patch to latest
> > > drm-intel-nightly branch. It should be easy to fix.
> > >
> > > Another issue is that IMO, we should bump libdrm's version number when
> > > increase these new APIs. Then in Beignet, we can check the libdrm
> > > version at build time and determine whether we will use these new
> > > interfaces. Thus, we can avoid breaking beignet on those systems which
> > > have previous libdrm/kernel installed.
> > >
> > Right. I can append a libdrm patch to bump the version. And then I suppose I
> > will follow the process to make a new release. Not sure right now how that
> > works. First time going through it.
> > 
> > Also, how should we test for the libdrm version and conditionally use the API?
> We can check the libdrm version at configuration time and define a macro to
> indicate whether we can use these new APIs in beignet.
> > Is there a previous example of this in Beignet that I could follow?
> Yes, one example is userptr. You can check the usage of DRM_INTEL_USERPTR and HAS_USERPTR
> In beignet.

Ok, applied the kernel patch. Please go ahead with libdrm&beignet parts.

Thanks, Daniel
-- 
Daniel Vetter
Software Engineer, Intel Corporation
+41 (0) 79 365 57 48 - http://blog.ffwll.ch

From jeff.mcgee at intel.com  Fri Mar 13 10:03:44 2015
From: jeff.mcgee at intel.com (Jeff McGee)
Date: Fri, 13 Mar 2015 10:03:44 -0700
Subject: [Beignet] [Intel-gfx] [PATCH i-g-t v3] tests/core_getparams:
 Create new test core_getparams
In-Reply-To: <20150313165913.GI3800@phenom.ffwll.local>
References: <1426192735-3306-1-git-send-email-jeff.mcgee@intel.com>
 <1426206385-4251-1-git-send-email-jeff.mcgee@intel.com>
 <20150313090945.GD21732@ivb-gt2-rev4>
 <20150313163241.GF3800@phenom.ffwll.local>
 <20150313165156.GL3263@jeffdesk>
 <20150313165913.GI3800@phenom.ffwll.local>
Message-ID: <20150313170344.GM3263@jeffdesk>

On Fri, Mar 13, 2015 at 05:59:13PM +0100, Daniel Vetter wrote:
> On Fri, Mar 13, 2015 at 09:51:57AM -0700, Jeff McGee wrote:
> > On Fri, Mar 13, 2015 at 05:32:41PM +0100, Daniel Vetter wrote:
> > > On Fri, Mar 13, 2015 at 05:09:46PM +0800, Zhigang Gong wrote:
> > > > My only concern is about the following macros:
> > > > 
> > > > > +#define LOCAL_I915_PARAM_SUBSLICE_TOTAL      33
> > > > > +#define LOCAL_I915_PARAM_EU_TOTAL    34
> > > > 
> > > > How about to just use the definitons in the kernel header file?
> > > > For an example:
> > > > 
> > > >   #include <drm/i915_drm.h>
> > > > 
> > > >   #ifdef LOCAL_I915_PARAM_SUBSLICE_TOTAL
> > > >   //Put all the code into this block.
> > > >   #endif
> > > > 
> > > > Then we can avoid put the same definitions in different files,
> > > > and we can avoid unecessary testing on an old kernel which doesn't
> > > > have this kernel interface.
> > > > 
> > > > For all the other part, it LGTM.
> > > > 
> > > > Reviewed-by: Zhigang Gong <zhigang.gong at linux.intel.com>
> > > 
> > > Once we update the libdrm requirements in igt we tend to go around and
> > > replace all the now obsolete LOCAL_ defines. Imo not worth doing extra
> > > work until then.
> > > 
> > > Patch applied, thanks.
> > > -Daniel
> > > 
> > 
> > Patch applied? Do you want me to make the name change first? Should the
> > kernel part be reviewed and merged first?
> 
> Forgot my own review comment already ;-) Fixed up with a follow-up patch.
> And I'll pull the kernel part in now too.
> -Daniel

Thanks. That leaves just the libdrm patch and version bump before Beignet
can merge their patches.
-Jeff

From daniel at ffwll.ch  Fri Mar 13 10:13:39 2015
From: daniel at ffwll.ch (Daniel Vetter)
Date: Fri, 13 Mar 2015 18:13:39 +0100
Subject: [Beignet] [Intel-gfx] Preventing zero GPU virtual address
 allocation
In-Reply-To: <20150313165847.GA31491@nuc-i3427.alporthouse.com>
References: <20150305125251.GA18775@phenom.ffwll.local>
 <20150305130121.GA18784@nuc-i3427.alporthouse.com>
 <20150305152758.GI18775@phenom.ffwll.local>
 <DF876E69000F0E4DB19B760E3EBA5C7501C5F9C1@SHSMSX103.ccr.corp.intel.com>
 <20150306083919.GH18784@nuc-i3427.alporthouse.com>
 <DF876E69000F0E4DB19B760E3EBA5C7501C607BC@SHSMSX103.ccr.corp.intel.com>
 <20150309120218.GD23680@nuc-i3427.alporthouse.com>
 <5502A9A0.1000209@intel.com>
 <20150313092738.GD3800@phenom.ffwll.local>
 <20150313165847.GA31491@nuc-i3427.alporthouse.com>
Message-ID: <20150313171339.GL3800@phenom.ffwll.local>

On Fri, Mar 13, 2015 at 04:58:47PM +0000, Chris Wilson wrote:
> On Fri, Mar 13, 2015 at 10:27:38AM +0100, Daniel Vetter wrote:
> > If supporting systems without full ppgtt is a requirement for you (still
> > wonky on gen8 a bit, so might be a good strategy) then imo it's the
> > PIN_BIAS idea I've laid out earlier in this thread. That one will work
> > everywhere. softpin can unexpectedly fail without full ppgtt if the kernel
> > decides to put something at a given spot, which imo means we should only
> > expose it on full ppgtt systems.
> > 
> > And PIN_BIAS should be fairly easy to wire up since the internal logic is
> > all there already. So "just" needs an execbuf flag, igt test and
> > appropriate userspace to set that new bit.
> 
> It doesn't though. To provide the guarantee userspace is asking for
> (which is that address 0 goes to a special, preferrably inaccessible,
> page), you have to evict the first N pages in the GGTT. That is just as
> likely to fail with an execbuffer flag as it would with an execobject flag.

Afaiui userspace only needs the guarantee that NULL is never a valid
address. Which means it's never a valid address for its own buffer
objects. I don't think it cares one bit what's actually there, it's not
mandatory to fault apparently. And faulting is what's not possible.

I guess the standard is like normal C: If you access a NULL pointer,
anything can happen (including garbage on the frontbuffer), the only
guarantee you need to make is that NULL is never a valid address. At least
if no one plays tricks ;-)
-Daniel
-- 
Daniel Vetter
Software Engineer, Intel Corporation
+41 (0) 79 365 57 48 - http://blog.ffwll.ch

From chris at chris-wilson.co.uk  Fri Mar 13 10:34:22 2015
From: chris at chris-wilson.co.uk (Chris Wilson)
Date: Fri, 13 Mar 2015 17:34:22 +0000
Subject: [Beignet] [Intel-gfx] Preventing zero GPU virtual address
 allocation
In-Reply-To: <20150313171339.GL3800@phenom.ffwll.local>
References: <20150305130121.GA18784@nuc-i3427.alporthouse.com>
 <20150305152758.GI18775@phenom.ffwll.local>
 <DF876E69000F0E4DB19B760E3EBA5C7501C5F9C1@SHSMSX103.ccr.corp.intel.com>
 <20150306083919.GH18784@nuc-i3427.alporthouse.com>
 <DF876E69000F0E4DB19B760E3EBA5C7501C607BC@SHSMSX103.ccr.corp.intel.com>
 <20150309120218.GD23680@nuc-i3427.alporthouse.com>
 <5502A9A0.1000209@intel.com>
 <20150313092738.GD3800@phenom.ffwll.local>
 <20150313165847.GA31491@nuc-i3427.alporthouse.com>
 <20150313171339.GL3800@phenom.ffwll.local>
Message-ID: <20150313173422.GA19856@nuc-i3427.alporthouse.com>

On Fri, Mar 13, 2015 at 06:13:39PM +0100, Daniel Vetter wrote:
> On Fri, Mar 13, 2015 at 04:58:47PM +0000, Chris Wilson wrote:
> > On Fri, Mar 13, 2015 at 10:27:38AM +0100, Daniel Vetter wrote:
> > > If supporting systems without full ppgtt is a requirement for you (still
> > > wonky on gen8 a bit, so might be a good strategy) then imo it's the
> > > PIN_BIAS idea I've laid out earlier in this thread. That one will work
> > > everywhere. softpin can unexpectedly fail without full ppgtt if the kernel
> > > decides to put something at a given spot, which imo means we should only
> > > expose it on full ppgtt systems.
> > > 
> > > And PIN_BIAS should be fairly easy to wire up since the internal logic is
> > > all there already. So "just" needs an execbuf flag, igt test and
> > > appropriate userspace to set that new bit.
> > 
> > It doesn't though. To provide the guarantee userspace is asking for
> > (which is that address 0 goes to a special, preferrably inaccessible,
> > page), you have to evict the first N pages in the GGTT. That is just as
> > likely to fail with an execbuffer flag as it would with an execobject flag.
> 
> Afaiui userspace only needs the guarantee that NULL is never a valid
> address. Which means it's never a valid address for its own buffer
> objects. I don't think it cares one bit what's actually there, it's not
> mandatory to fault apparently. And faulting is what's not possible.

You are bending ABI to allow userspace to use absolute addressing (no
relocations). The kernel has to make sure that nothing else is at that
address.
 
> I guess the standard is like normal C: If you access a NULL pointer,
> anything can happen (including garbage on the frontbuffer), the only
> guarantee you need to make is that NULL is never a valid address. At least
> if no one plays tricks ;-)

No. The kernel is quite strict about *NULL. It certainly doesn't allow
trivial information leakage.
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre

From daniel at ffwll.ch  Fri Mar 13 10:49:50 2015
From: daniel at ffwll.ch (Daniel Vetter)
Date: Fri, 13 Mar 2015 18:49:50 +0100
Subject: [Beignet] [Intel-gfx] Preventing zero GPU virtual address
 allocation
In-Reply-To: <20150313173422.GA19856@nuc-i3427.alporthouse.com>
References: <20150305152758.GI18775@phenom.ffwll.local>
 <DF876E69000F0E4DB19B760E3EBA5C7501C5F9C1@SHSMSX103.ccr.corp.intel.com>
 <20150306083919.GH18784@nuc-i3427.alporthouse.com>
 <DF876E69000F0E4DB19B760E3EBA5C7501C607BC@SHSMSX103.ccr.corp.intel.com>
 <20150309120218.GD23680@nuc-i3427.alporthouse.com>
 <5502A9A0.1000209@intel.com>
 <20150313092738.GD3800@phenom.ffwll.local>
 <20150313165847.GA31491@nuc-i3427.alporthouse.com>
 <20150313171339.GL3800@phenom.ffwll.local>
 <20150313173422.GA19856@nuc-i3427.alporthouse.com>
Message-ID: <20150313174950.GR3800@phenom.ffwll.local>

On Fri, Mar 13, 2015 at 05:34:22PM +0000, Chris Wilson wrote:
> On Fri, Mar 13, 2015 at 06:13:39PM +0100, Daniel Vetter wrote:
> > On Fri, Mar 13, 2015 at 04:58:47PM +0000, Chris Wilson wrote:
> > > On Fri, Mar 13, 2015 at 10:27:38AM +0100, Daniel Vetter wrote:
> > > > If supporting systems without full ppgtt is a requirement for you (still
> > > > wonky on gen8 a bit, so might be a good strategy) then imo it's the
> > > > PIN_BIAS idea I've laid out earlier in this thread. That one will work
> > > > everywhere. softpin can unexpectedly fail without full ppgtt if the kernel
> > > > decides to put something at a given spot, which imo means we should only
> > > > expose it on full ppgtt systems.
> > > > 
> > > > And PIN_BIAS should be fairly easy to wire up since the internal logic is
> > > > all there already. So "just" needs an execbuf flag, igt test and
> > > > appropriate userspace to set that new bit.
> > > 
> > > It doesn't though. To provide the guarantee userspace is asking for
> > > (which is that address 0 goes to a special, preferrably inaccessible,
> > > page), you have to evict the first N pages in the GGTT. That is just as
> > > likely to fail with an execbuffer flag as it would with an execobject flag.
> > 
> > Afaiui userspace only needs the guarantee that NULL is never a valid
> > address. Which means it's never a valid address for its own buffer
> > objects. I don't think it cares one bit what's actually there, it's not
> > mandatory to fault apparently. And faulting is what's not possible.
> 
> You are bending ABI to allow userspace to use absolute addressing (no
> relocations). The kernel has to make sure that nothing else is at that
> address.
>  
> > I guess the standard is like normal C: If you access a NULL pointer,
> > anything can happen (including garbage on the frontbuffer), the only
> > guarantee you need to make is that NULL is never a valid address. At least
> > if no one plays tricks ;-)
> 
> No. The kernel is quite strict about *NULL. It certainly doesn't allow
> trivial information leakage.

We already have that trivial information leakage anyway. Making it a notch
more likely that one special address will leak information doesn't change
the fact that without full ppgtt you can easily read any other GPU
client's stuff. And also write to it if you feel like it.
-Daniel
-- 
Daniel Vetter
Software Engineer, Intel Corporation
+41 (0) 79 365 57 48 - http://blog.ffwll.ch

From ruiling.song at intel.com  Sun Mar 15 19:29:24 2015
From: ruiling.song at intel.com (Song, Ruiling)
Date: Mon, 16 Mar 2015 02:29:24 +0000
Subject: [Beignet] [Intel-gfx] Preventing zero GPU virtual address
 allocation
In-Reply-To: <20150313171339.GL3800@phenom.ffwll.local>
References: <20150305125251.GA18775@phenom.ffwll.local>
 <20150305130121.GA18784@nuc-i3427.alporthouse.com>
 <20150305152758.GI18775@phenom.ffwll.local>
 <DF876E69000F0E4DB19B760E3EBA5C7501C5F9C1@SHSMSX103.ccr.corp.intel.com>
 <20150306083919.GH18784@nuc-i3427.alporthouse.com>
 <DF876E69000F0E4DB19B760E3EBA5C7501C607BC@SHSMSX103.ccr.corp.intel.com>
 <20150309120218.GD23680@nuc-i3427.alporthouse.com>
 <5502A9A0.1000209@intel.com> <20150313092738.GD3800@phenom.ffwll.local>
 <20150313165847.GA31491@nuc-i3427.alporthouse.com>
 <20150313171339.GL3800@phenom.ffwll.local>
Message-ID: <148B1B7A67D1C24B9EF0BE42EA4977062B7F5DFE@SHSMSX101.ccr.corp.intel.com>


> -----Original Message-----
> From: Daniel Vetter [mailto:daniel.vetter at ffwll.ch] On Behalf Of Daniel
> Vetter
> Sent: Saturday, March 14, 2015 1:14 AM
> To: Chris Wilson; Daniel Vetter; Weinehall, David; Zou, Nanhai; Song, Ruiling;
> Vetter, Daniel; intel-gfx at lists.freedesktop.org; Yang, Rong R;
> beignet at lists.freedesktop.org
> Subject: Re: [Beignet] [Intel-gfx] Preventing zero GPU virtual address
> allocation
> 
> On Fri, Mar 13, 2015 at 04:58:47PM +0000, Chris Wilson wrote:
> > On Fri, Mar 13, 2015 at 10:27:38AM +0100, Daniel Vetter wrote:
> > > If supporting systems without full ppgtt is a requirement for you
> > > (still wonky on gen8 a bit, so might be a good strategy) then imo
> > > it's the PIN_BIAS idea I've laid out earlier in this thread. That
> > > one will work everywhere. softpin can unexpectedly fail without full
> > > ppgtt if the kernel decides to put something at a given spot, which
> > > imo means we should only expose it on full ppgtt systems.
> > >
> > > And PIN_BIAS should be fairly easy to wire up since the internal
> > > logic is all there already. So "just" needs an execbuf flag, igt
> > > test and appropriate userspace to set that new bit.
> >
> > It doesn't though. To provide the guarantee userspace is asking for
> > (which is that address 0 goes to a special, preferrably inaccessible,
> > page), you have to evict the first N pages in the GGTT. That is just
> > as likely to fail with an execbuffer flag as it would with an execobject flag.
> 
> Afaiui userspace only needs the guarantee that NULL is never a valid address.
> Which means it's never a valid address for its own buffer objects. I don't
> think it cares one bit what's actually there, it's not mandatory to fault
> apparently. And faulting is what's not possible.
Yes, this is exactly what we need: to make NULL an invalid address. It's just like the C language.
But I have one more comment. The buffer object used in OpenCL may be allocated in libva/OpenGL and shared for OpenCL usage through some OpenCL extension.
Afaiui, this would implicitly require libva/mesa to also avoid zero-address buffer object allocation.
Will libdrm re-bind this kind of shared buffer object to a new graphics virtual address?
So that PIN_BIAS is also effective on the shared buffer, right?

Thanks!
Ruiling
> I guess the standard is like normal C: If you access a NULL pointer, anything
> can happen (including garbage on the frontbuffer), the only guarantee you
> need to make is that NULL is never a valid address. At least if no one plays
> tricks ;-) -Daniel
> --
> Daniel Vetter
> Software Engineer, Intel Corporation
> +41 (0) 79 365 57 48 - http://blog.ffwll.ch

From zhigang.gong at intel.com  Sun Mar 15 18:42:25 2015
From: zhigang.gong at intel.com (Zhigang Gong)
Date: Mon, 16 Mar 2015 09:42:25 +0800
Subject: [Beignet] [PATCH] GBE: fix an image related bugs.
Message-ID: <1426470145-4399-1-git-send-email-zhigang.gong@intel.com>

The bug was introduced when we removed the hacky invalid
register. Now we will not pass in a fixed count of coordinates
for the typed_write instruction.

Signed-off-by: Zhigang Gong <zhigang.gong at intel.com>
---
 backend/src/backend/gen_insn_selection.cpp | 13 +++++++------
 backend/src/ir/instruction.cpp             | 28 +++++++++++++++++-----------
 2 files changed, 24 insertions(+), 17 deletions(-)

diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp
index 2b166b1..c240261 100644
--- a/backend/src/backend/gen_insn_selection.cpp
+++ b/backend/src/backend/gen_insn_selection.cpp
@@ -4411,19 +4411,20 @@ namespace gbe
           msgs[i] = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32);
       } else {
         uint32_t valueID = 0;
-        msgs[0] = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32);
-        for(uint32_t msgID = 1; msgID < 1 + dim; msgID++, valueID++)
+        uint32_t msgID = 0;
+        msgs[msgID++] = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32);
+        for(; msgID < 1 + dim; msgID++, valueID++)
           msgs[msgID] = sel.selReg(insn.getSrc(msgID - 1), insn.getCoordType());
 
         // fake v.
         if (dim < 2)
-          msgs[2] = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32);
+          msgs[msgID++] = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32);
         // fake w.
         if (dim < 3)
-          msgs[3] = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32);
+          msgs[msgID++] = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32);
         // LOD.
-        msgs[4] = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32);
-        for(uint32_t msgID = dim + 2; valueID < insn.getSrcNum(); msgID++, valueID++)
+        msgs[msgID++] = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32);
+        for(; valueID < insn.getSrcNum(); msgID++, valueID++)
           msgs[msgID] = sel.selReg(insn.getSrc(valueID), insn.getSrcType());
       }
 
diff --git a/backend/src/ir/instruction.cpp b/backend/src/ir/instruction.cpp
index a2bc875..797552f 100644
--- a/backend/src/ir/instruction.cpp
+++ b/backend/src/ir/instruction.cpp
@@ -522,10 +522,13 @@ namespace ir {
         this->outOpcode(out);
         out << "." << this->getDstType()
             << "." << this->getSrcType()
-            << " surface id " << (int)this->getImageIndex()
-            << " coord u %" << this->getSrc(fn, 0)
-            << " coord v %" << this->getSrc(fn, 1)
-            << " coord w %" << this->getSrc(fn, 2)
+            << " surface id " << (int)this->getImageIndex();
+        out << " coord u %" << this->getSrc(fn, 0);
+        if (srcNum >= 2)
+          out << " coord v %" << this->getSrc(fn, 1);
+        if (srcNum >= 3)
+          out << " coord w %" << this->getSrc(fn, 2);
+        out
             << " %" << this->getDst(fn, 0)
             << " %" << this->getDst(fn, 1)
             << " %" << this->getDst(fn, 2)
@@ -567,15 +570,18 @@ namespace ir {
       INLINE bool wellFormed(const Function &fn, std::string &why) const;
       INLINE void out(std::ostream &out, const Function &fn) const {
         this->outOpcode(out);
+        uint32_t srcID = 0;
         out << "." << this->getSrcType()
             << " surface id " << (int)this->getImageIndex()
-            << " coord u %" << this->getSrc(fn, 0)
-            << " coord v %" << this->getSrc(fn, 1)
-            << " coord w %" << this->getSrc(fn, 2)
-            << " %" << this->getSrc(fn, 3)
-            << " %" << this->getSrc(fn, 4)
-            << " %" << this->getSrc(fn, 5)
-            << " %" << this->getSrc(fn, 6);
+            << " coord u %" << this->getSrc(fn, srcID++);
+        if (srcNum >= 6)
+          out << " coord v %" << this->getSrc(fn, srcID++);
+        if (srcNum >= 7)
+          out << " coord w %" << this->getSrc(fn, srcID++);
+        out   << " %" << this->getSrc(fn, srcID++);
+        out   << " %" << this->getSrc(fn, srcID++);
+        out   << " %" << this->getSrc(fn, srcID++);
+        out   << " %" << this->getSrc(fn, srcID++);
       }
 
       Tuple src;
-- 
1.9.1


From rong.r.yang at intel.com  Sun Mar 15 20:16:03 2015
From: rong.r.yang at intel.com (Yang, Rong R)
Date: Mon, 16 Mar 2015 03:16:03 +0000
Subject: [Beignet] [PATCH] GBE: fix an image related bugs.
In-Reply-To: <1426470145-4399-1-git-send-email-zhigang.gong@intel.com>
References: <1426470145-4399-1-git-send-email-zhigang.gong@intel.com>
Message-ID: <7597C9376C272A4AB2D29E91550B7B090141BECD@shsmsx102.ccr.corp.intel.com>

LGTM, thanks.

> -----Original Message-----
> From: Beignet [mailto:beignet-bounces at lists.freedesktop.org] On Behalf Of
> Zhigang Gong
> Sent: Monday, March 16, 2015 09:42
> To: beignet at lists.freedesktop.org
> Cc: Gong, Zhigang
> Subject: [Beignet] [PATCH] GBE: fix an image related bugs.
> 
> The bug was introduces when we removed the hacky invalid register. Now
> we will not pass in a fixed count of coordinates for the typed_write
> instruction.
> 
> Signed-off-by: Zhigang Gong <zhigang.gong at intel.com>
> ---
>  backend/src/backend/gen_insn_selection.cpp | 13 +++++++------
>  backend/src/ir/instruction.cpp             | 28 +++++++++++++++++-----------
>  2 files changed, 24 insertions(+), 17 deletions(-)
> 
> diff --git a/backend/src/backend/gen_insn_selection.cpp
> b/backend/src/backend/gen_insn_selection.cpp
> index 2b166b1..c240261 100644
> --- a/backend/src/backend/gen_insn_selection.cpp
> +++ b/backend/src/backend/gen_insn_selection.cpp
> @@ -4411,19 +4411,20 @@ namespace gbe
>            msgs[i] = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32);
>        } else {
>          uint32_t valueID = 0;
> -        msgs[0] = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32);
> -        for(uint32_t msgID = 1; msgID < 1 + dim; msgID++, valueID++)
> +        uint32_t msgID = 0;
> +        msgs[msgID++] = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32);
> +        for(; msgID < 1 + dim; msgID++, valueID++)
>            msgs[msgID] = sel.selReg(insn.getSrc(msgID - 1), insn.getCoordType());
> 
>          // fake v.
>          if (dim < 2)
> -          msgs[2] = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32);
> +          msgs[msgID++] = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32);
>          // fake w.
>          if (dim < 3)
> -          msgs[3] = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32);
> +          msgs[msgID++] = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32);
>          // LOD.
> -        msgs[4] = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32);
> -        for(uint32_t msgID = dim + 2; valueID < insn.getSrcNum(); msgID++,
> valueID++)
> +        msgs[msgID++] = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32);
> +        for(; valueID < insn.getSrcNum(); msgID++, valueID++)
>            msgs[msgID] = sel.selReg(insn.getSrc(valueID), insn.getSrcType());
>        }
> 
> diff --git a/backend/src/ir/instruction.cpp b/backend/src/ir/instruction.cpp
> index a2bc875..797552f 100644
> --- a/backend/src/ir/instruction.cpp
> +++ b/backend/src/ir/instruction.cpp
> @@ -522,10 +522,13 @@ namespace ir {
>          this->outOpcode(out);
>          out << "." << this->getDstType()
>              << "." << this->getSrcType()
> -            << " surface id " << (int)this->getImageIndex()
> -            << " coord u %" << this->getSrc(fn, 0)
> -            << " coord v %" << this->getSrc(fn, 1)
> -            << " coord w %" << this->getSrc(fn, 2)
> +            << " surface id " << (int)this->getImageIndex();
> +        out << " coord u %" << this->getSrc(fn, 0);
> +        if (srcNum >= 2)
> +          out << " coord v %" << this->getSrc(fn, 1);
> +        if (srcNum >= 3)
> +          out << " coord w %" << this->getSrc(fn, 2);
> +        out
>              << " %" << this->getDst(fn, 0)
>              << " %" << this->getDst(fn, 1)
>              << " %" << this->getDst(fn, 2) @@ -567,15 +570,18 @@ namespace ir {
>        INLINE bool wellFormed(const Function &fn, std::string &why) const;
>        INLINE void out(std::ostream &out, const Function &fn) const {
>          this->outOpcode(out);
> +        uint32_t srcID = 0;
>          out << "." << this->getSrcType()
>              << " surface id " << (int)this->getImageIndex()
> -            << " coord u %" << this->getSrc(fn, 0)
> -            << " coord v %" << this->getSrc(fn, 1)
> -            << " coord w %" << this->getSrc(fn, 2)
> -            << " %" << this->getSrc(fn, 3)
> -            << " %" << this->getSrc(fn, 4)
> -            << " %" << this->getSrc(fn, 5)
> -            << " %" << this->getSrc(fn, 6);
> +            << " coord u %" << this->getSrc(fn, srcID++);
> +        if (srcNum >= 6)
> +          out << " coord v %" << this->getSrc(fn, srcID++);
> +        if (srcNum >= 7)
> +          out << " coord w %" << this->getSrc(fn, srcID++);
> +        out   << " %" << this->getSrc(fn, srcID++);
> +        out   << " %" << this->getSrc(fn, srcID++);
> +        out   << " %" << this->getSrc(fn, srcID++);
> +        out   << " %" << this->getSrc(fn, srcID++);
>        }
> 
>        Tuple src;
> --
> 1.9.1
> 
> _______________________________________________
> Beignet mailing list
> Beignet at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/beignet

From zhigang.gong at linux.intel.com  Sun Mar 15 19:26:12 2015
From: zhigang.gong at linux.intel.com (Zhigang Gong)
Date: Mon, 16 Mar 2015 10:26:12 +0800
Subject: [Beignet] [PATCH V2 1/3] reset the SPIR target datalayout.
In-Reply-To: <1426235561-21847-1-git-send-email-xionghu.luo@intel.com>
References: <1426235561-21847-1-git-send-email-xionghu.luo@intel.com>
Message-ID: <20150316022611.GE21732@ivb-gt2-rev4>

LGTM, pushed. Thanks.

On Fri, Mar 13, 2015 at 04:32:39PM +0800, xionghu.luo at intel.com wrote:
> From: Luo Xionghu <xionghu.luo at intel.com>
> 
> v2: split to a seperate patch.
> Signed-off-by: Luo Xionghu <xionghu.luo at intel.com>
> ---
>  backend/src/llvm/llvm_bitcode_link.cpp | 4 ++++
>  1 file changed, 4 insertions(+)
> 
> diff --git a/backend/src/llvm/llvm_bitcode_link.cpp b/backend/src/llvm/llvm_bitcode_link.cpp
> index 17248c0..3bf9613 100644
> --- a/backend/src/llvm/llvm_bitcode_link.cpp
> +++ b/backend/src/llvm/llvm_bitcode_link.cpp
> @@ -237,6 +237,10 @@ namespace gbe
>        kernels.push_back(f);
>      }
>  
> +    /* the SPIR binary datalayout maybe different with beignet's bitcode */
> +    if(clonedLib->getDataLayout() != mod->getDataLayout())
> +      mod->setDataLayout(clonedLib->getDataLayout());
> +
>      /* We use beignet's bitcode as dst because it will have a lot of
>         lazy functions which will not be loaded. */
>      char* errorMsg;
> -- 
> 1.9.1
> 
> _______________________________________________
> Beignet mailing list
> Beignet at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/beignet

From daniel at ffwll.ch  Mon Mar 16 01:52:14 2015
From: daniel at ffwll.ch (Daniel Vetter)
Date: Mon, 16 Mar 2015 09:52:14 +0100
Subject: [Beignet] [Intel-gfx] Preventing zero GPU virtual address
 allocation
In-Reply-To: <148B1B7A67D1C24B9EF0BE42EA4977062B7F5DFE@SHSMSX101.ccr.corp.intel.com>
References: <20150305152758.GI18775@phenom.ffwll.local>
 <DF876E69000F0E4DB19B760E3EBA5C7501C5F9C1@SHSMSX103.ccr.corp.intel.com>
 <20150306083919.GH18784@nuc-i3427.alporthouse.com>
 <DF876E69000F0E4DB19B760E3EBA5C7501C607BC@SHSMSX103.ccr.corp.intel.com>
 <20150309120218.GD23680@nuc-i3427.alporthouse.com>
 <5502A9A0.1000209@intel.com>
 <20150313092738.GD3800@phenom.ffwll.local>
 <20150313165847.GA31491@nuc-i3427.alporthouse.com>
 <20150313171339.GL3800@phenom.ffwll.local>
 <148B1B7A67D1C24B9EF0BE42EA4977062B7F5DFE@SHSMSX101.ccr.corp.intel.com>
Message-ID: <20150316085214.GF21993@phenom.ffwll.local>

On Mon, Mar 16, 2015 at 02:29:24AM +0000, Song, Ruiling wrote:
> 
> 
> > -----Original Message-----
> > From: Daniel Vetter [mailto:daniel.vetter at ffwll.ch] On Behalf Of Daniel
> > Vetter
> > Sent: Saturday, March 14, 2015 1:14 AM
> > To: Chris Wilson; Daniel Vetter; Weinehall, David; Zou, Nanhai; Song, Ruiling;
> > Vetter, Daniel; intel-gfx at lists.freedesktop.org; Yang, Rong R;
> > beignet at lists.freedesktop.org
> > Subject: Re: [Beignet] [Intel-gfx] Preventing zero GPU virtual address
> > allocation
> > 
> > On Fri, Mar 13, 2015 at 04:58:47PM +0000, Chris Wilson wrote:
> > > On Fri, Mar 13, 2015 at 10:27:38AM +0100, Daniel Vetter wrote:
> > > > If supporting systems without full ppgtt is a requirement for you
> > > > (still wonky on gen8 a bit, so might be a good strategy) then imo
> > > > it's the PIN_BIAS idea I've laid out earlier in this thread. That
> > > > one will work everywhere. softpin can unexpectedly fail without full
> > > > ppgtt if the kernel decides to put something at a given spot, which
> > > > imo means we should only expose it on full ppgtt systems.
> > > >
> > > > And PIN_BIAS should be fairly easy to wire up since the internal
> > > > logic is all there already. So "just" needs an execbuf flag, igt
> > > > test and appropriate userspace to set that new bit.
> > >
> > > It doesn't though. To provide the guarantee userspace is asking for
> > > (which is that address 0 goes to a special, preferrably inaccessible,
> > > page), you have to evict the first N pages in the GGTT. That is just
> > > as likely to fail with an execbuffer flag as it would with an execobject flag.
> > 
> > Afaiui userspace only needs the guarantee that NULL is never a valid address.
> > Which means it's never a valid address for its own buffer objects. I don't
> > think it cares one bit what's actually there, it's not mandatory to fault
> > apparently. And faulting is what's not possible.
> Yes, This is what exactly what we need, that is make NULL as an invalid address. It's just like C language.
> But I have some more comment. The buffer object used in opencl may be allocated in libva/opengl and shared for opencl usage through some opencl extension.
> Afaiui, this would implicitly require libva/mesa also avoid zero-address buffer object allocation.
> Will libdrm re-bind such kind of shared buffer object to a new graphics virtual address?
> So that PIN_BIAS is also effective on the shared buffer, right?

Yeah we'll rebind if needed. We can make this an execbuf or context flag,
in either case anything that gets executed by ocl will be moved around if
it accidentally ended up at the wrong place. The only exception is if a
buffer is pinned already, i.e. if you're doing direct rendering to the
frontbuffer. That will give you an EBUSY, but otoh that also shouldn't
ever happen really.
-Daniel
-- 
Daniel Vetter
Software Engineer, Intel Corporation
+41 (0) 79 365 57 48 - http://blog.ffwll.ch

From zhigang.gong at linux.intel.com  Mon Mar 16 01:12:59 2015
From: zhigang.gong at linux.intel.com (Zhigang Gong)
Date: Mon, 16 Mar 2015 16:12:59 +0800
Subject: [Beignet] [ANNOUNCE] Beignet 1.0.2 (2015-03-16)
Message-ID: <20150316081258.GG21732@ivb-gt2-rev4>

Beignet 1.0.2 (2015-03-16)
==========================

Beignet 1.0.2 has been released. It is a maintenance release that includes bug fixes, minor new
features and some performance improvements. The highlights are listed below:

1. Added LLVM 3.6 support.
2. Fixed a bug in popcount().
3. Refactor the legalize pass and const expression handling.
4. Fixed a bug when hardware is inaccessible.
5. Enabled multiarch 32bit and 64bit coexisting.
6. Implemented some new device side clCopyImage kernels.
7. Optimize clEnqueueCopyImageToBuffer for 16 aligned buffer.

Git tag: Release_v1.0.2
Gitweb URL: http://cgit.freedesktop.org/beignet
Download: https://01.org/sites/default/files/beignet-1.0.2-source.tar.gz
Official release notes: https://01.org/beignet/downloads/beignet-1.0.2-2015-03-16

md5sum: 437b5d26a947002c9200a9571355b1c6  Beignet-1.0.2-Source.tar.gz
sha1sum: f8377bf02b18af9b51114ea89e7a1460079f10a0  Beignet-1.0.2-Source.tar.gz
sha256sum: 2521fa523123b25c05e42e1c11c89d4ed7f28b335552c82a8883c61c235c8729  Beignet-1.0.2-Source.tar.gz

-----------------------------------------------------------------

Andreas Beckmann (2):
      prefer newer llvm versions over 3.3
      remove unsafe define -D__$(USER)__

Chuanbo Weng (4):
      Add example to show libva buffer sharing with extension clCreateImageFromLibvaIntel.
      Add document to describe the detials of libva buffer sharing.
      Implement 1D/2D image array related cl_mem_kernel_copy_image in cl way instead of cpu way.
      Optimization of clEnqueueCopyImageToBuffer for 16 aligned case.

Guo Yejun (3):
      correct the cache line size to be 64
      loose the alignment limitation for host_ptr of CL_MEM_USE_HOST_PTR
      update utest to loose userptr limitation

Junyan He (6):
      Add test case for long bitcast.
      Add the check for src and dst span different registers.
      Fix bug for bitcast test case because of long type.
      Fix a bug of 1d image array test case.
      Backend: Fix one bug of printf because of ir reorder.
      Correct the bit fields error for indirect address of Gen8

Luo (1):
      check the predication in case of endless loop.

Rebecca N. Palmer (3):
      Return error, don't crash, on allocation failure
      Crash when hardware inaccessible
      Enable multiarch (32/64-bit co-installation)

Ruiling Song (14):
      GBE: Import constantexpr lower pass from pNaCl
      GBE: expand large integer instructions
      GBE: Fix a build error against llvm release version
      GBE: Fix a bug in legalize pass.
      GBE: Load/store should use same address space as before.
      GBE: Import PromoteIntegers pass from pNaCl
      GBE: We need use exiting block here.
      libocl: define NULL to zero
      libocl: Directly scalarize built-in with vector input.
      GBE: unify element type before insertelement in legalize pass.
      GBE: Support unaligned load/store of dword/qword in GenIR.
      GBE: Fix fast-math issue under llvm 3.6.
      GBE: Only add non-zero offset in gep lowering pass.
      GBE: Only emit multiply when immediate is not one.

Yang Rong (6):
      Change the KB and MB define to enum.
      Use llvm-c's LLVMLinkModules instead of llvm::Linker::LinkModules.
      Add llvm3.6 build support.
      Remove useless llvm head file FindUsedTypes.h.
      Correct the error llvm link msg copy in function genProgramLinkFromLLVM.
      Fix llvm3.6 build error.

Zhigang Gong (19):
      GBE: fix an ACC register related instruction scheduling bug
      GBE: fix popcount bugs.
      GBE: add GEN_TYPE_HF to getTypeSize.
      Add submodule libva for examples.
      update document.
      runtime: fix a potential null pointer dereference.
      runtime: don't free the host_ptr for a subbuffer.
      GBE: fix build error for llvm 3.6.
      GBE: fix build error for LLVM 3.4/3.3.
      build: use @BEIGNET_INSTALL_DIR@ for the icd file.
      GBE: expand constant expressions in constant vector
      GBE: remove constant expression handling code in gen writer pass.
      GBE: remove the unecessary type check for SEL instructio.
      GBE: support compare two bool variables.
      GBE: add fastcall support.
      Build: use -Bsymbolic to fix conflicts with other LLVM users.
      GBE: add a new incompatible compile option -cl-finite-math-only.
      Build: set 3.5 as the stable LLVM version for beignet.
      Bump version to 1.0.2

--
Thanks,
Zhigang Gong.

From i.gnatenko.brain at gmail.com  Mon Mar 16 05:39:58 2015
From: i.gnatenko.brain at gmail.com (Igor Gnatenko)
Date: Mon, 16 Mar 2015 15:39:58 +0300
Subject: [Beignet] wrong CMAKE_LIBRARY_ARCHITECTURE on x86_64
Message-ID: <CAFMg4WDj_phWh5JuOk+rGqcvipzzXUa-JgRViXscmR=+BGLJMA@mail.gmail.com>

Hi,

I'm updating beignet to 1.0.2 in Fedora as its packager, and I see that on
x86_64 CMAKE_LIBRARY_ARCHITECTURE is empty, so the ICD file ends up named
intel-beignet-.icd.

P.S. don't have time to file a bug.
-- 
-Igor Gnatenko

From junyan.he at inbox.com  Mon Mar 16 08:27:20 2015
From: junyan.he at inbox.com (junyan.he at inbox.com)
Date: Mon, 16 Mar 2015 23:27:20 +0800
Subject: [Beignet] [PATCH] Add LLVM_INCLUDE_DIR to CMakeList of src.
Message-ID: <1426519640-17377-1-git-send-email-junyan.he@inbox.com>

From: Junyan He <junyan.he at linux.intel.com>

The LLVM include directory should be specified when LLVM is
not installed in a standard directory.

Signed-off-by: Junyan He <junyan.he at linux.intel.com>
---
 src/CMakeLists.txt |    3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index d4181d8..88b2792 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -3,7 +3,8 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR}
                     ${DRM_INCLUDE_DIRS}/../
                     ${CMAKE_CURRENT_SOURCE_DIR}/../backend/src/backend/
                     ${CMAKE_CURRENT_SOURCE_DIR}/../include
-                    ${MESA_SOURCE_INCLUDES})
+                    ${MESA_SOURCE_INCLUDES}
+                    ${LLVM_INCLUDE_DIR})
 
 macro (MakeKernelBinStr KERNEL_PATH KERNEL_FILES)
 foreach (KF ${KERNEL_FILES})
-- 
1.7.9.5


From jbarnes at virtuousgeek.org  Mon Mar 16 13:10:28 2015
From: jbarnes at virtuousgeek.org (Jesse Barnes)
Date: Mon, 16 Mar 2015 13:10:28 -0700
Subject: [Beignet] [Intel-gfx] Preventing zero GPU virtual address
	allocation
In-Reply-To: <20150316085214.GF21993@phenom.ffwll.local>
References: <20150305152758.GI18775@phenom.ffwll.local>
 <DF876E69000F0E4DB19B760E3EBA5C7501C5F9C1@SHSMSX103.ccr.corp.intel.com>
 <20150306083919.GH18784@nuc-i3427.alporthouse.com>
 <DF876E69000F0E4DB19B760E3EBA5C7501C607BC@SHSMSX103.ccr.corp.intel.com>
 <20150309120218.GD23680@nuc-i3427.alporthouse.com>
 <5502A9A0.1000209@intel.com> <20150313092738.GD3800@phenom.ffwll.local>
 <20150313165847.GA31491@nuc-i3427.alporthouse.com>
 <20150313171339.GL3800@phenom.ffwll.local>
 <148B1B7A67D1C24B9EF0BE42EA4977062B7F5DFE@SHSMSX101.ccr.corp.intel.com>
 <20150316085214.GF21993@phenom.ffwll.local>
Message-ID: <550738B4.8060405@virtuousgeek.org>

On 03/16/2015 01:52 AM, Daniel Vetter wrote:
> On Mon, Mar 16, 2015 at 02:29:24AM +0000, Song, Ruiling wrote:
>>
>>
>>> -----Original Message-----
>>> From: Daniel Vetter [mailto:daniel.vetter at ffwll.ch] On Behalf Of Daniel
>>> Vetter
>>> Sent: Saturday, March 14, 2015 1:14 AM
>>> To: Chris Wilson; Daniel Vetter; Weinehall, David; Zou, Nanhai; Song, Ruiling;
>>> Vetter, Daniel; intel-gfx at lists.freedesktop.org; Yang, Rong R;
>>> beignet at lists.freedesktop.org
>>> Subject: Re: [Beignet] [Intel-gfx] Preventing zero GPU virtual address
>>> allocation
>>>
>>> On Fri, Mar 13, 2015 at 04:58:47PM +0000, Chris Wilson wrote:
>>>> On Fri, Mar 13, 2015 at 10:27:38AM +0100, Daniel Vetter wrote:
>>>>> If supporting systems without full ppgtt is a requirement for you
>>>>> (still wonky on gen8 a bit, so might be a good strategy) then imo
>>>>> it's the PIN_BIAS idea I've laid out earlier in this thread. That
>>>>> one will work everywhere. softpin can unexpectedly fail without full
>>>>> ppgtt if the kernel decides to put something at a given spot, which
>>>>> imo means we should only expose it on full ppgtt systems.
>>>>>
>>>>> And PIN_BIAS should be fairly easy to wire up since the internal
>>>>> logic is all there already. So "just" needs an execbuf flag, igt
>>>>> test and appropriate userspace to set that new bit.
>>>>
>>>> It doesn't though. To provide the guarantee userspace is asking for
>>>> (which is that address 0 goes to a special, preferrably inaccessible,
>>>> page), you have to evict the first N pages in the GGTT. That is just
>>>> as likely to fail with an execbuffer flag as it would with an execobject flag.
>>>
>>> Afaiui userspace only needs the guarantee that NULL is never a valid address.
>>> Which means it's never a valid address for its own buffer objects. I don't
>>> think it cares one bit what's actually there, it's not mandatory to fault
>>> apparently. And faulting is what's not possible.
>> Yes, This is what exactly what we need, that is make NULL as an invalid address. It's just like C language.
>> But I have some more comment. The buffer object used in opencl may be allocated in libva/opengl and shared for opencl usage through some opencl extension.
>> Afaiui, this would implicitly require libva/mesa also avoid zero-address buffer object allocation.
>> Will libdrm re-bind such kind of shared buffer object to a new graphics virtual address?
>> So that PIN_BIAS is also effective on the shared buffer, right?
> 
> Yeah we'll rebind if needed. We can make this an execbuf or context flag,
> in either case anything that gets executed by ocl will be moved around if
> it accidentally ended up at the wrong place. The only exception is if a
> buffer is pinned already, i.e. if you're doing direct rendering to the
> frontbuffer. That will give you an EBUSY, but otoh that also shouldn't
> ever happen really.

Ruiling, are you working on this or someone from your team, presumably
based on the patch Chris posted earlier?  The zero page reservation
certainly seems simpler to me, but the MAP_FIXED approach is a lot more
flexible, and can be used for other types of debug and usages as well
(we'll need something like it for OCL pointer sharing for example), so
seems like a good thing to pursue regardless.

Thanks,
Jesse


From jbarnes at virtuousgeek.org  Mon Mar 16 13:11:00 2015
From: jbarnes at virtuousgeek.org (Jesse Barnes)
Date: Mon, 16 Mar 2015 13:11:00 -0700
Subject: [Beignet] [Intel-gfx] Preventing zero GPU virtual address
	allocation
In-Reply-To: <20150316085214.GF21993@phenom.ffwll.local>
References: <20150305152758.GI18775@phenom.ffwll.local>
 <DF876E69000F0E4DB19B760E3EBA5C7501C5F9C1@SHSMSX103.ccr.corp.intel.com>
 <20150306083919.GH18784@nuc-i3427.alporthouse.com>
 <DF876E69000F0E4DB19B760E3EBA5C7501C607BC@SHSMSX103.ccr.corp.intel.com>
 <20150309120218.GD23680@nuc-i3427.alporthouse.com>
 <5502A9A0.1000209@intel.com> <20150313092738.GD3800@phenom.ffwll.local>
 <20150313165847.GA31491@nuc-i3427.alporthouse.com>
 <20150313171339.GL3800@phenom.ffwll.local>
 <148B1B7A67D1C24B9EF0BE42EA4977062B7F5DFE@SHSMSX101.ccr.corp.intel.com>
 <20150316085214.GF21993@phenom.ffwll.local>
Message-ID: <550738D4.6070907@virtuousgeek.org>

On 03/16/2015 01:52 AM, Daniel Vetter wrote:
> On Mon, Mar 16, 2015 at 02:29:24AM +0000, Song, Ruiling wrote:
>>
>>
>>> -----Original Message-----
>>> From: Daniel Vetter [mailto:daniel.vetter at ffwll.ch] On Behalf Of Daniel
>>> Vetter
>>> Sent: Saturday, March 14, 2015 1:14 AM
>>> To: Chris Wilson; Daniel Vetter; Weinehall, David; Zou, Nanhai; Song, Ruiling;
>>> Vetter, Daniel; intel-gfx at lists.freedesktop.org; Yang, Rong R;
>>> beignet at lists.freedesktop.org
>>> Subject: Re: [Beignet] [Intel-gfx] Preventing zero GPU virtual address
>>> allocation
>>>
>>> On Fri, Mar 13, 2015 at 04:58:47PM +0000, Chris Wilson wrote:
>>>> On Fri, Mar 13, 2015 at 10:27:38AM +0100, Daniel Vetter wrote:
>>>>> If supporting systems without full ppgtt is a requirement for you
>>>>> (still wonky on gen8 a bit, so might be a good strategy) then imo
>>>>> it's the PIN_BIAS idea I've laid out earlier in this thread. That
>>>>> one will work everywhere. softpin can unexpectedly fail without full
>>>>> ppgtt if the kernel decides to put something at a given spot, which
>>>>> imo means we should only expose it on full ppgtt systems.
>>>>>
>>>>> And PIN_BIAS should be fairly easy to wire up since the internal
>>>>> logic is all there already. So "just" needs an execbuf flag, igt
>>>>> test and appropriate userspace to set that new bit.
>>>>
>>>> It doesn't though. To provide the guarantee userspace is asking for
>>>> (which is that address 0 goes to a special, preferrably inaccessible,
>>>> page), you have to evict the first N pages in the GGTT. That is just
>>>> as likely to fail with an execbuffer flag as it would with an execobject flag.
>>>
>>> Afaiui userspace only needs the guarantee that NULL is never a valid address.
>>> Which means it's never a valid address for its own buffer objects. I don't
>>> think it cares one bit what's actually there, it's not mandatory to fault
>>> apparently. And faulting is what's not possible.
>> Yes, This is what exactly what we need, that is make NULL as an invalid address. It's just like C language.
>> But I have some more comment. The buffer object used in opencl may be allocated in libva/opengl and shared for opencl usage through some opencl extension.
>> Afaiui, this would implicitly require libva/mesa also avoid zero-address buffer object allocation.
>> Will libdrm re-bind such kind of shared buffer object to a new graphics virtual address?
>> So that PIN_BIAS is also effective on the shared buffer, right?
> 
> Yeah we'll rebind if needed. We can make this an execbuf or context flag,
> in either case anything that gets executed by ocl will be moved around if
> it accidentally ended up at the wrong place. The only exception is if a
> buffer is pinned already, i.e. if you're doing direct rendering to the
> frontbuffer. That will give you an EBUSY, but otoh that also shouldn't
> ever happen really.

Oh and we may need to re-introduce the ppgtt zero page reservation
anyway due to bugs, but that's another topic...


From zhigang.gong at intel.com  Mon Mar 16 17:28:04 2015
From: zhigang.gong at intel.com (Zhigang Gong)
Date: Tue, 17 Mar 2015 08:28:04 +0800
Subject: [Beignet] [PATCH] Build: fix the beignet icd name when
	CMAKE_INSTALL_FULL_LIBDIR is undefined.
Message-ID: <1426552084-16012-1-git-send-email-zhigang.gong@intel.com>

On some distributions, CMAKE_INSTALL_FULL_LIBDIR or CMAKE_LIBRARY_ARCHITECTURE
may be undefined. To avoid generating an intel-beignet-.icd file name, we need to
get rid of the extra "-" in such cases.

Reported by Igor Gnatenko.

Signed-off-by: Zhigang Gong <zhigang.gong at intel.com>
---
 CMakeLists.txt | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 9aa838a..ae3b313 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -43,7 +43,9 @@ INCLUDE (GNUInstallDirs OPTIONAL)
 # support old CMake without GNUInstallDirs
 if (NOT CMAKE_INSTALL_FULL_LIBDIR)
   set (CMAKE_INSTALL_FULL_LIBDIR "${CMAKE_INSTALL_PREFIX}/lib")
-  set (CMAKE_LIBRARY_ARCHITECTURE "")
+  set (BEIGNET_LIBRARY_ARCHITECTURE "")
+else (NOT CMAKE_INSTALL_FULL_LIBDIR)
+  set (BEIGNET_LIBRARY_ARCHITECTURE "${CMAKE_LIBRARY_ARCHITECTURE}")
 endif (NOT CMAKE_INSTALL_FULL_LIBDIR)
 
 if (NOT LIB_INSTALL_DIR)
@@ -58,7 +60,11 @@ endif (NOT BEIGNET_INSTALL_DIR)
 if (BEIGNET_INSTALL_DIR STREQUAL "${CMAKE_INSTALL_PREFIX}/lib/beignet/")
   set (ICD_FILE_NAME "intel-beignet.icd")
 else (BEIGNET_INSTALL_DIR STREQUAL "${CMAKE_INSTALL_PREFIX}/lib/beignet/")
-  set (ICD_FILE_NAME "intel-beignet-${CMAKE_LIBRARY_ARCHITECTURE}.icd")
+  if (BEIGNET_LIBRARY_ARCHITECTURE STREQUAL "")
+    set (ICD_FILE_NAME "intel-beignet.icd")
+  else (BEIGNET_LIBRARY_ARCHITECTURE STREQUAL "")
+    set (ICD_FILE_NAME "intel-beignet-${BEIGNET_LIBRARY_ARCHITECTURE}.icd")
+  endif (BEIGNET_LIBRARY_ARCHITECTURE STREQUAL "")
 endif (BEIGNET_INSTALL_DIR STREQUAL "${CMAKE_INSTALL_PREFIX}/lib/beignet/")
 
 # Force Release with debug info
-- 
1.9.1


From zhigang.gong at linux.intel.com  Mon Mar 16 17:29:18 2015
From: zhigang.gong at linux.intel.com (Zhigang Gong)
Date: Tue, 17 Mar 2015 08:29:18 +0800
Subject: [Beignet] wrong CMAKE_LIBRARY_ARCHITECTURE on x86_64
In-Reply-To: <CAFMg4WDj_phWh5JuOk+rGqcvipzzXUa-JgRViXscmR=+BGLJMA@mail.gmail.com>
References: <CAFMg4WDj_phWh5JuOk+rGqcvipzzXUa-JgRViXscmR=+BGLJMA@mail.gmail.com>
Message-ID: <20150317002917.GH21732@ivb-gt2-rev4>

It seems to be a distribution-specific issue. It works fine on Debian
and Ubuntu. I just sent out a patch; could you test it on your system?

Thanks,
Zhigang Gong.

On Mon, Mar 16, 2015 at 03:39:58PM +0300, Igor Gnatenko wrote:
> Hi,
> 
> I'm updating beignet to 1.0.2 in Fedora as packager and see that on
> x86_64 I have CMAKE_LIBRARY_ARCHITECTURE equal to nothing and have
> intel-beignet-.icd.
> 
> P.S. don't have time to file a bug.
> -- 
> -Igor Gnatenko
> _______________________________________________
> Beignet mailing list
> Beignet at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/beignet

From zhigang.gong at linux.intel.com  Mon Mar 16 18:19:40 2015
From: zhigang.gong at linux.intel.com (Zhigang Gong)
Date: Tue, 17 Mar 2015 09:19:40 +0800
Subject: [Beignet] [Intel-gfx] Preventing zero GPU virtual address
 allocation
In-Reply-To: <550738B4.8060405@virtuousgeek.org>
References: <20150306083919.GH18784@nuc-i3427.alporthouse.com>
 <DF876E69000F0E4DB19B760E3EBA5C7501C607BC@SHSMSX103.ccr.corp.intel.com>
 <20150309120218.GD23680@nuc-i3427.alporthouse.com>
 <5502A9A0.1000209@intel.com>
 <20150313092738.GD3800@phenom.ffwll.local>
 <20150313165847.GA31491@nuc-i3427.alporthouse.com>
 <20150313171339.GL3800@phenom.ffwll.local>
 <148B1B7A67D1C24B9EF0BE42EA4977062B7F5DFE@SHSMSX101.ccr.corp.intel.com>
 <20150316085214.GF21993@phenom.ffwll.local>
 <550738B4.8060405@virtuousgeek.org>
Message-ID: <20150317011940.GI21732@ivb-gt2-rev4>

On Mon, Mar 16, 2015 at 01:10:28PM -0700, Jesse Barnes wrote:
> On 03/16/2015 01:52 AM, Daniel Vetter wrote:
> > On Mon, Mar 16, 2015 at 02:29:24AM +0000, Song, Ruiling wrote:
> >>
> >>
> >>> -----Original Message-----
> >>> From: Daniel Vetter [mailto:daniel.vetter at ffwll.ch] On Behalf Of Daniel
> >>> Vetter
> >>> Sent: Saturday, March 14, 2015 1:14 AM
> >>> To: Chris Wilson; Daniel Vetter; Weinehall, David; Zou, Nanhai; Song, Ruiling;
> >>> Vetter, Daniel; intel-gfx at lists.freedesktop.org; Yang, Rong R;
> >>> beignet at lists.freedesktop.org
> >>> Subject: Re: [Beignet] [Intel-gfx] Preventing zero GPU virtual address
> >>> allocation
> >>>
> >>> On Fri, Mar 13, 2015 at 04:58:47PM +0000, Chris Wilson wrote:
> >>>> On Fri, Mar 13, 2015 at 10:27:38AM +0100, Daniel Vetter wrote:
> >>>>> If supporting systems without full ppgtt is a requirement for you
> >>>>> (still wonky on gen8 a bit, so might be a good strategy) then imo
> >>>>> it's the PIN_BIAS idea I've laid out earlier in this thread. That
> >>>>> one will work everywhere. softpin can unexpectedly fail without full
> >>>>> ppgtt if the kernel decides to put something at a given spot, which
> >>>>> imo means we should only expose it on full ppgtt systems.
> >>>>>
> >>>>> And PIN_BIAS should be fairly easy to wire up since the internal
> >>>>> logic is all there already. So "just" needs an execbuf flag, igt
> >>>>> test and appropriate userspace to set that new bit.
> >>>>
> >>>> It doesn't though. To provide the guarantee userspace is asking for
> >>>> (which is that address 0 goes to a special, preferrably inaccessible,
> >>>> page), you have to evict the first N pages in the GGTT. That is just
> >>>> as likely to fail with an execbuffer flag as it would with an execobject flag.
> >>>
> >>> Afaiui userspace only needs the guarantee that NULL is never a valid address.
> >>> Which means it's never a valid address for its own buffer objects. I don't
> >>> think it cares one bit what's actually there, it's not mandatory to fault
> >>> apparently. And faulting is what's not possible.
> >> Yes, This is what exactly what we need, that is make NULL as an invalid address. It's just like C language.
> >> But I have some more comment. The buffer object used in opencl may be allocated in libva/opengl and shared for opencl usage through some opencl extension.
> >> Afaiui, this would implicitly require libva/mesa also avoid zero-address buffer object allocation.
> >> Will libdrm re-bind such kind of shared buffer object to a new graphics virtual address?
> >> So that PIN_BIAS is also effective on the shared buffer, right?
> > 
> > Yeah we'll rebind if needed. We can make this an execbuf or context flag,
> > in either case anything that gets executed by ocl will be moved around if
> > it accidentally ended up at the wrong place. The only exception is if a
> > buffer is pinned already, i.e. if you're doing direct rendering to the
> > frontbuffer. That will give you an EBUSY, but otoh that also shouldn't
> > ever happen really.
> 
> Ruiling, are you working on this or someone from your team, presumably
> based on the patch Chris posted earlier?  The zero page reservation
> certainly seems simpler to me, but the MAP_FIXED approach is a lot more
> flexible, and can be used for other types of debug and usages as well
> (we'll need something like it for OCL pointer sharing for example), so
> seems like a good thing to pursue regardless.

Jesse, here is my 2 cent comment:

IMO, the start of this discussion is from a very simple issue:
1. Make sure any valid buffer object will not be mapped to zero page.
   OCL wants to avoid 0 virtual address for any valid buffer object. Thus
   the following type of code will work as expected.
   __kernel foo(__global int *src, __global int* dst) {
     if (dst == NULL)
       return;
   }
   Although code of the above type may not make much sense, it should
   be supported as well.
   This is a simple requirement for all the bo binding operations from
   CL applications. We think that simply skipping the zero page in KMD is
   a good enough solution, and we have not seen any side effects so far.

Then let's talk about the MAP_FIXED approach:
2. About the MAP_FIXED approach.
  If my understanding is correct, this is an intermediate solution towards full
  SVM functionality.
  From OCL's point of view, it is useful for OpenCL 2.0, which has three types of
  SVM options. Even for the most basic "Coarse-Grained buffer SVM", something like
  MAP_FIXED is useful, because it needs to pass a linked list object to the
  OCL kernel directly. If we can use MAP_FIXED to map the bo to the VM address
  which is used by the host, then we can access such a linked list easily on the OCL
  kernel side; otherwise, we may need some tricks to do extra address adjustment.
  So this feature is useful for Beignet before we get full SVM support.

My conclusion is:
* The 1st is a totally *passive* requirement and needs to be applied to each
  buffer binding. It is better for it to be transparent to userspace.
* The 2nd is an *active* requirement and needs the kernel to provide an SVM-like
  interface to user space. Only those SVM buffers (for the first 2 OCL
  SVM options) need to call the new interfaces explicitly.

Thanks,
Zhigang Gong.

> 
> Thanks,
> Jesse
> 
> _______________________________________________
> Beignet mailing list
> Beignet at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/beignet

From nanhai.zou at intel.com  Mon Mar 16 19:29:09 2015
From: nanhai.zou at intel.com (Zou, Nanhai)
Date: Tue, 17 Mar 2015 02:29:09 +0000
Subject: [Beignet] [Intel-gfx] Preventing zero GPU virtual
	address	allocation
In-Reply-To: <550738B4.8060405@virtuousgeek.org>
References: <20150305152758.GI18775@phenom.ffwll.local>
 <DF876E69000F0E4DB19B760E3EBA5C7501C5F9C1@SHSMSX103.ccr.corp.intel.com>
 <20150306083919.GH18784@nuc-i3427.alporthouse.com>
 <DF876E69000F0E4DB19B760E3EBA5C7501C607BC@SHSMSX103.ccr.corp.intel.com>
 <20150309120218.GD23680@nuc-i3427.alporthouse.com>
 <5502A9A0.1000209@intel.com> <20150313092738.GD3800@phenom.ffwll.local>
 <20150313165847.GA31491@nuc-i3427.alporthouse.com>
 <20150313171339.GL3800@phenom.ffwll.local>
 <148B1B7A67D1C24B9EF0BE42EA4977062B7F5DFE@SHSMSX101.ccr.corp.intel.com>
 <20150316085214.GF21993@phenom.ffwll.local>
 <550738B4.8060405@virtuousgeek.org>
Message-ID: <DF876E69000F0E4DB19B760E3EBA5C7501C88E38@SHSMSX103.ccr.corp.intel.com>

> -----Original Message-----
> From: Intel-gfx [mailto:intel-gfx-bounces at lists.freedesktop.org] On Behalf Of
> Jesse Barnes
> Sent: Tuesday, March 17, 2015 4:10 AM
> To: Daniel Vetter; Song, Ruiling
> Cc: Vetter, Daniel; intel-gfx at lists.freedesktop.org; Yang, Rong R;
> beignet at lists.freedesktop.org; Weinehall, David
> Subject: Re: [Intel-gfx] [Beignet] Preventing zero GPU virtual address allocation
> 
> On 03/16/2015 01:52 AM, Daniel Vetter wrote:
> > On Mon, Mar 16, 2015 at 02:29:24AM +0000, Song, Ruiling wrote:
> >>
> >>
> >>> -----Original Message-----
> >>> From: Daniel Vetter [mailto:daniel.vetter at ffwll.ch] On Behalf Of
> >>> Daniel Vetter
> >>> Sent: Saturday, March 14, 2015 1:14 AM
> >>> To: Chris Wilson; Daniel Vetter; Weinehall, David; Zou, Nanhai;
> >>> Song, Ruiling; Vetter, Daniel; intel-gfx at lists.freedesktop.org;
> >>> Yang, Rong R; beignet at lists.freedesktop.org
> >>> Subject: Re: [Beignet] [Intel-gfx] Preventing zero GPU virtual
> >>> address allocation
> >>>
> >>> On Fri, Mar 13, 2015 at 04:58:47PM +0000, Chris Wilson wrote:
> >>>> On Fri, Mar 13, 2015 at 10:27:38AM +0100, Daniel Vetter wrote:
> >>>>> If supporting systems without full ppgtt is a requirement for you
> >>>>> (still wonky on gen8 a bit, so might be a good strategy) then imo
> >>>>> it's the PIN_BIAS idea I've laid out earlier in this thread. That
> >>>>> one will work everywhere. softpin can unexpectedly fail without
> >>>>> full ppgtt if the kernel decides to put something at a given spot,
> >>>>> which imo means we should only expose it on full ppgtt systems.
> >>>>>
> >>>>> And PIN_BIAS should be fairly easy to wire up since the internal
> >>>>> logic is all there already. So "just" needs an execbuf flag, igt
> >>>>> test and appropriate userspace to set that new bit.
> >>>>
> >>>> It doesn't though. To provide the guarantee userspace is asking for
> >>>> (which is that address 0 goes to a special, preferrably
> >>>> inaccessible, page), you have to evict the first N pages in the
> >>>> GGTT. That is just as likely to fail with an execbuffer flag as it would with
> an execobject flag.
> >>>
> >>> Afaiui userspace only needs the guarantee that NULL is never a valid
> address.
> >>> Which means it's never a valid address for its own buffer objects. I
> >>> don't think it cares one bit what's actually there, it's not
> >>> mandatory to fault apparently. And faulting is what's not possible.
> >> Yes, This is what exactly what we need, that is make NULL as an invalid
> address. It's just like C language.
> >> But I have some more comment. The buffer object used in opencl may be
> allocated in libva/opengl and shared for opencl usage through some opencl
> extension.
> >> Afaiui, this would implicitly require libva/mesa also avoid zero-address buffer
> object allocation.
> >> Will libdrm re-bind such kind of shared buffer object to a new graphics virtual
> address?
> >> So that PIN_BIAS is also effective on the shared buffer, right?
> >
> > Yeah we'll rebind if needed. We can make this an execbuf or context
> > flag, in either case anything that gets executed by ocl will be moved
> > around if it accidentally ended up at the wrong place. The only
> > exception is if a buffer is pinned already, i.e. if you're doing
> > direct rendering to the frontbuffer. That will give you an EBUSY, but
> > otoh that also shouldn't ever happen really.
> 
> Ruiling, are you working on this or someone from your team, presumably based
> on the patch Chris posted earlier?  The zero page reservation certainly seems
> simpler to me, but the MAP_FIXED approach is a lot more flexible, and can be
> used for other types of debug and usages as well (we'll need something like it
> for OCL pointer sharing for example), so seems like a good thing to pursue
> regardless.
> 
Hi Jesse,
	MAP_FIXED cannot solve this issue. You may see my previous comments on this topic.
There could be many components in one single process, so Beignet cannot be guaranteed to be the first one to allocate address 0.

Thanks
Zou Nanhai

> Thanks,
> Jesse
> 
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/intel-gfx

From xionghu.luo at intel.com  Mon Mar 16 22:09:27 2015
From: xionghu.luo at intel.com (xionghu.luo at intel.com)
Date: Tue, 17 Mar 2015 13:09:27 +0800
Subject: [Beignet] [PATCH] strip PointerCast for call instructions before
	use.
Message-ID: <1426568967-20412-1-git-send-email-xionghu.luo@intel.com>

From: Luo Xionghu <xionghu.luo at intel.com>

The callInst may contain a bitcast instruction if the function is
different from its declaration. Strip the bitcast instruction to get
the real name.

Signed-off-by: Luo Xionghu <xionghu.luo at intel.com>
---
 backend/src/llvm/llvm_bitcode_link.cpp | 3 +--
 backend/src/llvm/llvm_gen_backend.cpp  | 5 +++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/backend/src/llvm/llvm_bitcode_link.cpp b/backend/src/llvm/llvm_bitcode_link.cpp
index 3bf9613..ebf4386 100644
--- a/backend/src/llvm/llvm_bitcode_link.cpp
+++ b/backend/src/llvm/llvm_bitcode_link.cpp
@@ -96,8 +96,7 @@ namespace gbe
             call->getCalledFunction()->getIntrinsicID() != 0)
           continue;
 
-        Value *Callee = call->getCalledValue();
-        const std::string fnName = Callee->getName();
+        std::string fnName = call->getCalledValue()->stripPointerCasts()->getName();
 
         if (!MFS.insert(fnName).second) {
           continue;
diff --git a/backend/src/llvm/llvm_gen_backend.cpp b/backend/src/llvm/llvm_gen_backend.cpp
index 773300b..8c68e12 100644
--- a/backend/src/llvm/llvm_gen_backend.cpp
+++ b/backend/src/llvm/llvm_gen_backend.cpp
@@ -2628,7 +2628,8 @@ namespace gbe
       }
     }
     // Get the name of the called function and handle it
-    const std::string fnName = Callee->getName();
+    const std::string fnName = Callee->stripPointerCasts()->getName();
+    printf(": %s\n", fnName.c_str());
     auto genIntrinsicID = intrinsicMap.find(fnName);
     switch (genIntrinsicID) {
       case GEN_OCL_GET_GROUP_ID0:
@@ -3018,7 +3019,7 @@ namespace gbe
       } else {
         // Get the name of the called function and handle it
         Value *Callee = I.getCalledValue();
-        const std::string fnName = Callee->getName();
+        const std::string fnName = Callee->stripPointerCasts()->getName();
         auto genIntrinsicID = intrinsicMap.find(fnName);
 
         // Get the function arguments
-- 
1.9.1


From xionghu.luo at intel.com  Mon Mar 16 22:25:49 2015
From: xionghu.luo at intel.com (xionghu.luo at intel.com)
Date: Tue, 17 Mar 2015 13:25:49 +0800
Subject: [Beignet] [patch v2] strip PointerCast for call instructions before
	use.
Message-ID: <1426569949-21894-1-git-send-email-xionghu.luo@intel.com>

From: Luo Xionghu <xionghu.luo at intel.com>

The callInst may contain a bitcast instruction if the function is
different from its declaration. Strip the bitcast instruction to get
the real name.

v2: remove printf message.
Signed-off-by: Luo Xionghu <xionghu.luo at intel.com>
---
 backend/src/llvm/llvm_bitcode_link.cpp | 3 +--
 backend/src/llvm/llvm_gen_backend.cpp  | 4 ++--
 2 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/backend/src/llvm/llvm_bitcode_link.cpp b/backend/src/llvm/llvm_bitcode_link.cpp
index 3bf9613..ebf4386 100644
--- a/backend/src/llvm/llvm_bitcode_link.cpp
+++ b/backend/src/llvm/llvm_bitcode_link.cpp
@@ -96,8 +96,7 @@ namespace gbe
             call->getCalledFunction()->getIntrinsicID() != 0)
           continue;
 
-        Value *Callee = call->getCalledValue();
-        const std::string fnName = Callee->getName();
+        std::string fnName = call->getCalledValue()->stripPointerCasts()->getName();
 
         if (!MFS.insert(fnName).second) {
           continue;
diff --git a/backend/src/llvm/llvm_gen_backend.cpp b/backend/src/llvm/llvm_gen_backend.cpp
index 773300b..bf03a13 100644
--- a/backend/src/llvm/llvm_gen_backend.cpp
+++ b/backend/src/llvm/llvm_gen_backend.cpp
@@ -2628,7 +2628,7 @@ namespace gbe
       }
     }
     // Get the name of the called function and handle it
-    const std::string fnName = Callee->getName();
+    const std::string fnName = Callee->stripPointerCasts()->getName();
     auto genIntrinsicID = intrinsicMap.find(fnName);
     switch (genIntrinsicID) {
       case GEN_OCL_GET_GROUP_ID0:
@@ -3018,7 +3018,7 @@ namespace gbe
       } else {
         // Get the name of the called function and handle it
         Value *Callee = I.getCalledValue();
-        const std::string fnName = Callee->getName();
+        const std::string fnName = Callee->stripPointerCasts()->getName();
         auto genIntrinsicID = intrinsicMap.find(fnName);
 
         // Get the function arguments
-- 
1.9.1


From yejun.guo at intel.com  Mon Mar 16 22:52:15 2015
From: yejun.guo at intel.com (Guo, Yejun)
Date: Tue, 17 Mar 2015 05:52:15 +0000
Subject: [Beignet] [patch v2] strip PointerCast for call instructions
 before	use.
In-Reply-To: <1426569949-21894-1-git-send-email-xionghu.luo@intel.com>
References: <1426569949-21894-1-git-send-email-xionghu.luo@intel.com>
Message-ID: <854E8DBA9F41904AB047E03BB6963AE501BEE03F@SHSMSX101.ccr.corp.intel.com>

LGTM, thanks.

-----Original Message-----
From: Beignet [mailto:beignet-bounces at lists.freedesktop.org] On Behalf Of xionghu.luo at intel.com
Sent: Tuesday, March 17, 2015 1:26 PM
To: beignet at lists.freedesktop.org
Cc: Luo, Xionghu
Subject: [Beignet] [patch v2] strip PointerCast for call instructions before use.

From: Luo Xionghu <xionghu.luo at intel.com>

The callInst may contain a bitcast instruction if the function is
different from its declaration. Strip the bitcast instruction to get
the real name.

v2: remove printf message.
Signed-off-by: Luo Xionghu <xionghu.luo at intel.com>
---
 backend/src/llvm/llvm_bitcode_link.cpp | 3 +--
 backend/src/llvm/llvm_gen_backend.cpp  | 4 ++--
 2 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/backend/src/llvm/llvm_bitcode_link.cpp b/backend/src/llvm/llvm_bitcode_link.cpp
index 3bf9613..ebf4386 100644
--- a/backend/src/llvm/llvm_bitcode_link.cpp
+++ b/backend/src/llvm/llvm_bitcode_link.cpp
@@ -96,8 +96,7 @@ namespace gbe
             call->getCalledFunction()->getIntrinsicID() != 0)
           continue;
 
-        Value *Callee = call->getCalledValue();
-        const std::string fnName = Callee->getName();
+        std::string fnName = call->getCalledValue()->stripPointerCasts()->getName();
 
         if (!MFS.insert(fnName).second) {
           continue;
diff --git a/backend/src/llvm/llvm_gen_backend.cpp b/backend/src/llvm/llvm_gen_backend.cpp
index 773300b..bf03a13 100644
--- a/backend/src/llvm/llvm_gen_backend.cpp
+++ b/backend/src/llvm/llvm_gen_backend.cpp
@@ -2628,7 +2628,7 @@ namespace gbe
       }
     }
     // Get the name of the called function and handle it
-    const std::string fnName = Callee->getName();
+    const std::string fnName = Callee->stripPointerCasts()->getName();
     auto genIntrinsicID = intrinsicMap.find(fnName);
     switch (genIntrinsicID) {
       case GEN_OCL_GET_GROUP_ID0:
@@ -3018,7 +3018,7 @@ namespace gbe
       } else {
         // Get the name of the called function and handle it
         Value *Callee = I.getCalledValue();
-        const std::string fnName = Callee->getName();
+        const std::string fnName = Callee->stripPointerCasts()->getName();
         auto genIntrinsicID = intrinsicMap.find(fnName);
 
         // Get the function arguments
-- 
1.9.1

_______________________________________________
Beignet mailing list
Beignet at lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/beignet

From yejun.guo at intel.com  Mon Mar 16 23:03:19 2015
From: yejun.guo at intel.com (Guo Yejun)
Date: Tue, 17 Mar 2015 14:03:19 +0800
Subject: [Beignet] [PATCH] correct env var to output llvm IR
Message-ID: <1426572199-25039-1-git-send-email-yejun.guo@intel.com>

Signed-off-by: Guo Yejun <yejun.guo at intel.com>
---
 docs/Beignet/Backend.mdwn | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/docs/Beignet/Backend.mdwn b/docs/Beignet/Backend.mdwn
index e4259fb..cf80318 100644
--- a/docs/Beignet/Backend.mdwn
+++ b/docs/Beignet/Backend.mdwn
@@ -48,10 +48,12 @@ Environment variables are used all over the code. Most important ones are:
 - `OCL_OUTPUT_GEN_IR` `(0 or 1)`. Output Gen IR (scalar intermediate
   representation) code
 
-- `OCL_OUTPUT_LLVM` `(0 or 1)`. Output LLVM code after the lowering passes
+- `OCL_OUTPUT_LLVM_BEFORE_LINK` `(0 or 1)`. Output LLVM code before llvm link
 
-- `OCL_OUTPUT_LLVM_BEFORE_EXTRA_PASS` `(0 or 1)`. Output LLVM code before the
-  lowering passes
+- `OCL_OUTPUT_LLVM_AFTER_LINK` `(0 or 1)`. Output LLVM code after llvm link
+
+- `OCL_OUTPUT_LLVM_AFTER_GEN` `(0 or 1)`. Output LLVM code after the lowering
+  passes, Gen IR is generated based on it.
 
 - `OCL_OUTPUT_ASM` `(0 or 1)`. Output Gen ISA
 
-- 
1.9.1


From daniel at ffwll.ch  Tue Mar 17 03:01:02 2015
From: daniel at ffwll.ch (Daniel Vetter)
Date: Tue, 17 Mar 2015 11:01:02 +0100
Subject: [Beignet] [Intel-gfx] Preventing zero GPU virtual address
 allocation
In-Reply-To: <550738B4.8060405@virtuousgeek.org>
References: <20150306083919.GH18784@nuc-i3427.alporthouse.com>
 <DF876E69000F0E4DB19B760E3EBA5C7501C607BC@SHSMSX103.ccr.corp.intel.com>
 <20150309120218.GD23680@nuc-i3427.alporthouse.com>
 <5502A9A0.1000209@intel.com>
 <20150313092738.GD3800@phenom.ffwll.local>
 <20150313165847.GA31491@nuc-i3427.alporthouse.com>
 <20150313171339.GL3800@phenom.ffwll.local>
 <148B1B7A67D1C24B9EF0BE42EA4977062B7F5DFE@SHSMSX101.ccr.corp.intel.com>
 <20150316085214.GF21993@phenom.ffwll.local>
 <550738B4.8060405@virtuousgeek.org>
Message-ID: <20150317100102.GQ21993@phenom.ffwll.local>

On Mon, Mar 16, 2015 at 01:10:28PM -0700, Jesse Barnes wrote:
> On 03/16/2015 01:52 AM, Daniel Vetter wrote:
> > On Mon, Mar 16, 2015 at 02:29:24AM +0000, Song, Ruiling wrote:
> >>
> >>
> >>> -----Original Message-----
> >>> From: Daniel Vetter [mailto:daniel.vetter at ffwll.ch] On Behalf Of Daniel
> >>> Vetter
> >>> Sent: Saturday, March 14, 2015 1:14 AM
> >>> To: Chris Wilson; Daniel Vetter; Weinehall, David; Zou, Nanhai; Song, Ruiling;
> >>> Vetter, Daniel; intel-gfx at lists.freedesktop.org; Yang, Rong R;
> >>> beignet at lists.freedesktop.org
> >>> Subject: Re: [Beignet] [Intel-gfx] Preventing zero GPU virtual address
> >>> allocation
> >>>
> >>> On Fri, Mar 13, 2015 at 04:58:47PM +0000, Chris Wilson wrote:
> >>>> On Fri, Mar 13, 2015 at 10:27:38AM +0100, Daniel Vetter wrote:
> >>>>> If supporting systems without full ppgtt is a requirement for you
> >>>>> (still wonky on gen8 a bit, so might be a good strategy) then imo
> >>>>> it's the PIN_BIAS idea I've laid out earlier in this thread. That
> >>>>> one will work everywhere. softpin can unexpectedly fail without full
> >>>>> ppgtt if the kernel decides to put something at a given spot, which
> >>>>> imo means we should only expose it on full ppgtt systems.
> >>>>>
> >>>>> And PIN_BIAS should be fairly easy to wire up since the internal
> >>>>> logic is all there already. So "just" needs an execbuf flag, igt
> >>>>> test and appropriate userspace to set that new bit.
> >>>>
> >>>> It doesn't though. To provide the guarantee userspace is asking for
> >>>> (which is that address 0 goes to a special, preferably inaccessible,
> >>>> page), you have to evict the first N pages in the GGTT. That is just
> >>>> as likely to fail with an execbuffer flag as it would with an execobject flag.
> >>>
> >>> Afaiui userspace only needs the guarantee that NULL is never a valid address.
> >>> Which means it's never a valid address for its own buffer objects. I don't
> >>> think it cares one bit what's actually there, it's not mandatory to fault
> >>> apparently. And faulting is what's not possible.
> >> Yes, this is exactly what we need, that is, to make NULL an invalid address. It's just like the C language.
> >> But I have some more comment. The buffer object used in opencl may be allocated in libva/opengl and shared for opencl usage through some opencl extension.
> >> Afaiui, this would implicitly require libva/mesa also avoid zero-address buffer object allocation.
> >> Will libdrm re-bind such kind of shared buffer object to a new graphics virtual address?
> >> So that PIN_BIAS is also effective on the shared buffer, right?
> > 
> > Yeah we'll rebind if needed. We can make this an execbuf or context flag,
> > in either case anything that gets executed by ocl will be moved around if
> > it accidentally ended up at the wrong place. The only exception is if a
> > buffer is pinned already, i.e. if you're doing direct rendering to the
> > frontbuffer. That will give you an EBUSY, but otoh that also shouldn't
> > ever happen really.
> 
> Ruiling, are you working on this or someone from your team, presumably
> based on the patch Chris posted earlier?  The zero page reservation
> certainly seems simpler to me, but the MAP_FIXED approach is a lot more
> flexible, and can be used for other types of debug and usages as well
> (we'll need something like it for OCL pointer sharing for example), so
> seems like a good thing to pursue regardless.

I prefer not to merge MAP_FIXED with the justification that ocl will
need the full power of it for buffered svm, without the pieces really
being ready. And I don't think we should block this little fix for current
ocl on the svm work either. Hence why I suggested to just expose the
already existing PIN_BIAS support somehow. That also has the upside of
working without full ppgtt (i.e. on hsw and earlier).
-Daniel
-- 
Daniel Vetter
Software Engineer, Intel Corporation
http://blog.ffwll.ch

From zhigang.gong at linux.intel.com  Tue Mar 17 01:58:30 2015
From: zhigang.gong at linux.intel.com (Zhigang Gong)
Date: Tue, 17 Mar 2015 16:58:30 +0800
Subject: [Beignet] [patch v2] strip PointerCast for call instructions
 before use.
In-Reply-To: <1426569949-21894-1-git-send-email-xionghu.luo@intel.com>
References: <1426569949-21894-1-git-send-email-xionghu.luo@intel.com>
Message-ID: <20150317085829.GJ21732@ivb-gt2-rev4>

LGTM, pushed, thanks.

On Tue, Mar 17, 2015 at 01:25:49PM +0800, xionghu.luo at intel.com wrote:
> From: Luo Xionghu <xionghu.luo at intel.com>
> 
> The callInst may contain a bitcast instruction if the function's type is
> different from the declaration. Strip the bitcast instruction to get
> the real name.
> 
> v2: remove printf message.
> Signed-off-by: Luo Xionghu <xionghu.luo at intel.com>
> ---
>  backend/src/llvm/llvm_bitcode_link.cpp | 3 +--
>  backend/src/llvm/llvm_gen_backend.cpp  | 4 ++--
>  2 files changed, 3 insertions(+), 4 deletions(-)
> 
> diff --git a/backend/src/llvm/llvm_bitcode_link.cpp b/backend/src/llvm/llvm_bitcode_link.cpp
> index 3bf9613..ebf4386 100644
> --- a/backend/src/llvm/llvm_bitcode_link.cpp
> +++ b/backend/src/llvm/llvm_bitcode_link.cpp
> @@ -96,8 +96,7 @@ namespace gbe
>              call->getCalledFunction()->getIntrinsicID() != 0)
>            continue;
>  
> -        Value *Callee = call->getCalledValue();
> -        const std::string fnName = Callee->getName();
> +        std::string fnName = call->getCalledValue()->stripPointerCasts()->getName();
>  
>          if (!MFS.insert(fnName).second) {
>            continue;
> diff --git a/backend/src/llvm/llvm_gen_backend.cpp b/backend/src/llvm/llvm_gen_backend.cpp
> index 773300b..bf03a13 100644
> --- a/backend/src/llvm/llvm_gen_backend.cpp
> +++ b/backend/src/llvm/llvm_gen_backend.cpp
> @@ -2628,7 +2628,7 @@ namespace gbe
>        }
>      }
>      // Get the name of the called function and handle it
> -    const std::string fnName = Callee->getName();
> +    const std::string fnName = Callee->stripPointerCasts()->getName();
>      auto genIntrinsicID = intrinsicMap.find(fnName);
>      switch (genIntrinsicID) {
>        case GEN_OCL_GET_GROUP_ID0:
> @@ -3018,7 +3018,7 @@ namespace gbe
>        } else {
>          // Get the name of the called function and handle it
>          Value *Callee = I.getCalledValue();
> -        const std::string fnName = Callee->getName();
> +        const std::string fnName = Callee->stripPointerCasts()->getName();
>          auto genIntrinsicID = intrinsicMap.find(fnName);
>  
>          // Get the function arguments
> -- 
> 1.9.1
> 
> _______________________________________________
> Beignet mailing list
> Beignet at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/beignet

From zhigang.gong at linux.intel.com  Tue Mar 17 01:58:57 2015
From: zhigang.gong at linux.intel.com (Zhigang Gong)
Date: Tue, 17 Mar 2015 16:58:57 +0800
Subject: [Beignet] [PATCH] Add LLVM_INCLUDE_DIR to CMakeList of src.
In-Reply-To: <1426519640-17377-1-git-send-email-junyan.he@inbox.com>
References: <1426519640-17377-1-git-send-email-junyan.he@inbox.com>
Message-ID: <20150317085856.GK21732@ivb-gt2-rev4>

LGTM, pushed, thanks.

On Mon, Mar 16, 2015 at 11:27:20PM +0800, junyan.he at inbox.com wrote:
> From: Junyan He <junyan.he at linux.intel.com>
> 
> The llvm include should be specified when llvm is
> not installed in standard dir.
> 
> Signed-off-by: Junyan He <junyan.he at linux.intel.com>
> ---
>  src/CMakeLists.txt |    3 ++-
>  1 file changed, 2 insertions(+), 1 deletion(-)
> 
> diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
> index d4181d8..88b2792 100644
> --- a/src/CMakeLists.txt
> +++ b/src/CMakeLists.txt
> @@ -3,7 +3,8 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR}
>                      ${DRM_INCLUDE_DIRS}/../
>                      ${CMAKE_CURRENT_SOURCE_DIR}/../backend/src/backend/
>                      ${CMAKE_CURRENT_SOURCE_DIR}/../include
> -                    ${MESA_SOURCE_INCLUDES})
> +                    ${MESA_SOURCE_INCLUDES}
> +                    ${LLVM_INCLUDE_DIR})
>  
>  macro (MakeKernelBinStr KERNEL_PATH KERNEL_FILES)
>  foreach (KF ${KERNEL_FILES})
> -- 
> 1.7.9.5
> 
> 
> 
> _______________________________________________
> Beignet mailing list
> Beignet at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/beignet

From zhigang.gong at linux.intel.com  Tue Mar 17 01:59:15 2015
From: zhigang.gong at linux.intel.com (Zhigang Gong)
Date: Tue, 17 Mar 2015 16:59:15 +0800
Subject: [Beignet] [PATCH] correct env var to output llvm IR
In-Reply-To: <1426572199-25039-1-git-send-email-yejun.guo@intel.com>
References: <1426572199-25039-1-git-send-email-yejun.guo@intel.com>
Message-ID: <20150317085914.GL21732@ivb-gt2-rev4>

LGTM, pushed, thanks.

On Tue, Mar 17, 2015 at 02:03:19PM +0800, Guo Yejun wrote:
> Signed-off-by: Guo Yejun <yejun.guo at intel.com>
> ---
>  docs/Beignet/Backend.mdwn | 8 +++++---
>  1 file changed, 5 insertions(+), 3 deletions(-)
> 
> diff --git a/docs/Beignet/Backend.mdwn b/docs/Beignet/Backend.mdwn
> index e4259fb..cf80318 100644
> --- a/docs/Beignet/Backend.mdwn
> +++ b/docs/Beignet/Backend.mdwn
> @@ -48,10 +48,12 @@ Environment variables are used all over the code. Most important ones are:
>  - `OCL_OUTPUT_GEN_IR` `(0 or 1)`. Output Gen IR (scalar intermediate
>    representation) code
>  
> -- `OCL_OUTPUT_LLVM` `(0 or 1)`. Output LLVM code after the lowering passes
> +- `OCL_OUTPUT_LLVM_BEFORE_LINK` `(0 or 1)`. Output LLVM code before llvm link
>  
> -- `OCL_OUTPUT_LLVM_BEFORE_EXTRA_PASS` `(0 or 1)`. Output LLVM code before the
> -  lowering passes
> +- `OCL_OUTPUT_LLVM_AFTER_LINK` `(0 or 1)`. Output LLVM code after llvm link
> +
> +- `OCL_OUTPUT_LLVM_AFTER_GEN` `(0 or 1)`. Output LLVM code after the lowering
> +  passes, Gen IR is generated based on it.
>  
>  - `OCL_OUTPUT_ASM` `(0 or 1)`. Output Gen ISA
>  
> -- 
> 1.9.1
> 
> _______________________________________________
> Beignet mailing list
> Beignet at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/beignet

From junyan.he at inbox.com  Tue Mar 17 03:08:40 2015
From: junyan.he at inbox.com (junyan.he at inbox.com)
Date: Tue, 17 Mar 2015 18:08:40 +0800
Subject: [Beignet] [PATCH] Generate NAN for UNDEF value in printf parser.
Message-ID: <1426586920-30846-1-git-send-email-junyan.he@inbox.com>

From: Junyan He <junyan.he at linux.intel.com>

llvm 3.6 will give an UNDEF value for NAN. This will cause
the store instruction for UNDEF to be ignored. We need
to change it to NAN here.

Signed-off-by: Junyan He <junyan.he at linux.intel.com>
---
 backend/src/llvm/llvm_printf_parser.cpp |    6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/backend/src/llvm/llvm_printf_parser.cpp b/backend/src/llvm/llvm_printf_parser.cpp
index 7800c01..2f85443 100644
--- a/backend/src/llvm/llvm_printf_parser.cpp
+++ b/backend/src/llvm/llvm_printf_parser.cpp
@@ -831,6 +831,12 @@ error:
 
       case Type::DoubleTyID:
       case Type::FloatTyID: {
+        /* llvm 3.6 will give a undef value for NAN. */
+        if (dyn_cast<llvm::UndefValue>(arg)) {
+          APFloat nan = APFloat::getNaN(APFloat::IEEEsingle, false);
+          arg = ConstantFP::get(module->getContext(), nan);
+        }
+
         /* Because the printf is a variable parameter function, it does not have the
            function prototype, so the compiler will always promote the arg to the
            longest precise type for float. So here, we can always find it is double. */
-- 
1.7.9.5


From i.gnatenko.brain at gmail.com  Tue Mar 17 05:11:47 2015
From: i.gnatenko.brain at gmail.com (Igor Gnatenko)
Date: Tue, 17 Mar 2015 15:11:47 +0300
Subject: [Beignet] wrong CMAKE_LIBRARY_ARCHITECTURE on x86_64
In-Reply-To: <20150317002917.GH21732@ivb-gt2-rev4>
References: <CAFMg4WDj_phWh5JuOk+rGqcvipzzXUa-JgRViXscmR=+BGLJMA@mail.gmail.com>
 <20150317002917.GH21732@ivb-gt2-rev4>
Message-ID: <CAFMg4WBXgzg79amQKNROp6sQzR7wnOZz6c1opGveRhR1vQQm-g@mail.gmail.com>

Yes, it works. But how about a better solution to deal with arches?

On Tue, Mar 17, 2015 at 3:29 AM, Zhigang Gong
<zhigang.gong at linux.intel.com> wrote:
> It seems to be a distribution-specific issue. It works fine for Debian
> and Ubuntu. I just sent out a patch, could you test it in your system?
>
> Thanks,
> Zhigang Gong.
>
> On Mon, Mar 16, 2015 at 03:39:58PM +0300, Igor Gnatenko wrote:
>> Hi,
>>
>> I'm updating beignet to 1.0.2 in Fedora as packager and see that on
>> x86_64 I have CMAKE_LIBRARY_ARCHITECTURE equal to nothing and have
>> intel-beignet-.icd.
>>
>> P.S. don't have time to file a bug.
>> --
>> -Igor Gnatenko
>> _______________________________________________
>> Beignet mailing list
>> Beignet at lists.freedesktop.org
>> http://lists.freedesktop.org/mailman/listinfo/beignet


-- 
-Igor Gnatenko

From zhigang.gong at gmail.com  Tue Mar 17 05:20:02 2015
From: zhigang.gong at gmail.com (Zhigang Gong)
Date: Tue, 17 Mar 2015 20:20:02 +0800
Subject: [Beignet] wrong CMAKE_LIBRARY_ARCHITECTURE on x86_64
In-Reply-To: <CAFMg4WBXgzg79amQKNROp6sQzR7wnOZz6c1opGveRhR1vQQm-g@mail.gmail.com>
References: <CAFMg4WDj_phWh5JuOk+rGqcvipzzXUa-JgRViXscmR=+BGLJMA@mail.gmail.com>
 <20150317002917.GH21732@ivb-gt2-rev4>
 <CAFMg4WBXgzg79amQKNROp6sQzR7wnOZz6c1opGveRhR1vQQm-g@mail.gmail.com>
Message-ID: <CAFfUhOVa_78odZNU99D+J7R7Sgr-YZxW41oiM-3iz_09DZ7OSA@mail.gmail.com>

This should be a cmake and Fedora specific problem.  Could you help to pass
the question
 "why CMAKE_LIBRARY_ARCHITECTURE and CMAKE_INSTALL_FULL_LIBDIR  are empty
under Fedora system?" to the corresponding Fedora community?

Thanks,
Zhigang Gong.

On Tue, Mar 17, 2015 at 8:11 PM, Igor Gnatenko <i.gnatenko.brain at gmail.com>
wrote:

> Yes, it works. But how about a better solution to deal with arches?
>
> On Tue, Mar 17, 2015 at 3:29 AM, Zhigang Gong
> <zhigang.gong at linux.intel.com> wrote:
> > It seems to be a distribution-specific issue. It works fine for Debian
> > and Ubuntu. I just sent out a patch, could you test it in your system?
> >
> > Thanks,
> > Zhigang Gong.
> >
> > On Mon, Mar 16, 2015 at 03:39:58PM +0300, Igor Gnatenko wrote:
> >> Hi,
> >>
> >> I'm updating beignet to 1.0.2 in Fedora as packager and see that on
> >> x86_64 I have CMAKE_LIBRARY_ARCHITECTURE equal to nothing and have
> >> intel-beignet-.icd.
> >>
> >> P.S. don't have time to file a bug.
> >> --
> >> -Igor Gnatenko
> >> _______________________________________________
> >> Beignet mailing list
> >> Beignet at lists.freedesktop.org
> >> http://lists.freedesktop.org/mailman/listinfo/beignet
>
>
>
> --
> -Igor Gnatenko
> _______________________________________________
> Beignet mailing list
> Beignet at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/beignet
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.freedesktop.org/archives/beignet/attachments/20150317/361df0f2/attachment.html>

From rebecca_palmer at zoho.com  Tue Mar 17 05:33:15 2015
From: rebecca_palmer at zoho.com (Rebecca N. Palmer)
Date: Tue, 17 Mar 2015 12:33:15 +0000
Subject: [Beignet] wrong CMAKE_LIBRARY_ARCHITECTURE on x86_64
In-Reply-To: <20150317002917.GH21732@ivb-gt2-rev4>
Message-ID: <55081F0B.90409@zoho.com>

That the file gets named intel-beignet-.icd rather than 
intel-beignet.icd is a purely cosmetic issue: it should work either way, 
but won't support multiarch, as the names of .icd files need to be 
distinct for co-installability but are otherwise irrelevant.

If you want working multiarch, you'll need to actually set 
CMAKE_LIBRARY_ARCHITECTURE (or with the patch, 
BEIGNET_LIBRARY_ARCHITECTURE), e.g. with cmake's -D option; again it 
doesn't matter what you set it to, as long as the values are distinct.


From jbarnes at virtuousgeek.org  Tue Mar 17 08:13:32 2015
From: jbarnes at virtuousgeek.org (Jesse Barnes)
Date: Tue, 17 Mar 2015 08:13:32 -0700
Subject: [Beignet] [Intel-gfx] Preventing zero GPU virtual address
	allocation
In-Reply-To: <DF876E69000F0E4DB19B760E3EBA5C7501C88E38@SHSMSX103.ccr.corp.intel.com>
References: <20150305152758.GI18775@phenom.ffwll.local>
 <DF876E69000F0E4DB19B760E3EBA5C7501C5F9C1@SHSMSX103.ccr.corp.intel.com>
 <20150306083919.GH18784@nuc-i3427.alporthouse.com>
 <DF876E69000F0E4DB19B760E3EBA5C7501C607BC@SHSMSX103.ccr.corp.intel.com>
 <20150309120218.GD23680@nuc-i3427.alporthouse.com>
 <5502A9A0.1000209@intel.com> <20150313092738.GD3800@phenom.ffwll.local>
 <20150313165847.GA31491@nuc-i3427.alporthouse.com>
 <20150313171339.GL3800@phenom.ffwll.local>
 <148B1B7A67D1C24B9EF0BE42EA4977062B7F5DFE@SHSMSX101.ccr.corp.intel.com>
 <20150316085214.GF21993@phenom.ffwll.local>
 <550738B4.8060405@virtuousgeek.org>
 <DF876E69000F0E4DB19B760E3EBA5C7501C88E38@SHSMSX103.ccr.corp.intel.com>
Message-ID: <5508449C.50705@virtuousgeek.org>

>>> Yeah we'll rebind if needed. We can make this an execbuf or context
>>> flag, in either case anything that gets executed by ocl will be moved
>>> around if it accidentally ended up at the wrong place. The only
>>> exception is if a buffer is pinned already, i.e. if you're doing
>>> direct rendering to the frontbuffer. That will give you an EBUSY, but
>>> otoh that also shouldn't ever happen really.
>>
>> Ruiling, are you working on this or someone from your team, presumably based
>> on the patch Chris posted earlier?  The zero page reservation certainly seems
>> simpler to me, but the MAP_FIXED approach is a lot more flexible, and can be
>> used for other types of debug and usages as well (we'll need something like it
>> for OCL pointer sharing for example), so seems like a good thing to pursue
>> regardless.
>>
> Hi Jesse,
> 	MAP_FIXED cannot solve this issue. You may see my previous comments for this topic, 
> There could be many components in a single process, so Beignet cannot be guaranteed to be the first one to allocate address 0.

Yeah, MAP_FIXED sounds a bit more ambitious and though I think it would
work for OCL 2.0 pointer sharing, it's a little different than we were
planning.  To summarize, we have three possible approaches, each with
its own problems:
  1) simple patch to avoid binding at address 0 in PPGTT:
     does impact the ABI (though generally not in a harmful way), and
     may not be possible with aliasing PPGTT with e.g. framebuffers
     bound at offset 0
  2) exposing PIN_BIAS to userspace
     Would allow userspace to avoid pinning any buffers at offset 0 at
     execbuf time, but still has the problem with previously bound buffers
     and aliasing PPGTT
  3) MAP_FIXED interface
     Flexible approach allowing userspace to manage its own virtual
     memory, but still has the same issues with aliasing PPGTT, and with
     shared contexts, which would have to negotiate between libraries how to
     handle the zero page

For (1) and (2) the kernel pieces are really already in place, the main
thing we need is a new flag to userspace to indicate behavior.  I'd
prefer (1) with a context creation flag to indicate "don't bind at 0".
Execbuf would try to honor this, and userspace could check if any
buffers ended up at 0 in the aliasing PPGTT case by checking the
resulting offsets following the call.  I expect in most cases this would
be fine.

It should be pretty easy to extend Ruiling's patch to use a context flag
to determine the behavior; is that something you can do?  Any objections
to this approach?

It does mean that shared contexts need to be handled specially, or won't
get the 0 page protection, but I think Mesa wants this behavior too, and
libva probably wouldn't mind, so you could just require new versions of
those that set this flag when telling people what's supported for proper
NULL pointer handling.

Any objections to that approach?

Thanks,
Jesse

From zhigang.gong at linux.intel.com  Tue Mar 17 20:13:03 2015
From: zhigang.gong at linux.intel.com (Zhigang Gong)
Date: Wed, 18 Mar 2015 11:13:03 +0800
Subject: [Beignet] wrong CMAKE_LIBRARY_ARCHITECTURE on x86_64
In-Reply-To: <55081F0B.90409@zoho.com>
References: <20150317002917.GH21732@ivb-gt2-rev4> <55081F0B.90409@zoho.com>
Message-ID: <000501d06129$73644950$5a2cdbf0$@linux.intel.com>

Agreed.

Igor, this is all for multiarch support which means installing both 32 bit and 64 bit beignet libraries on the same 64 bit system,
so we need to give different icd names for the two libraries and install them to different system directories. Now beignet
relies on CMAKE to provide those predefined variables, and it works fine with Debian and Ubuntu but seems broken on Fedora.

If you haven't planned to support multiarch on Fedora, then the 1.0.2 version should just work fine. And you can apply my patch
to fix the cosmetic icd file name issue.

Thanks,
Zhigang Gong.

> -----Original Message-----
> From: Beignet [mailto:beignet-bounces at lists.freedesktop.org] On Behalf Of
> Rebecca N. Palmer
> Sent: Tuesday, March 17, 2015 8:33 PM
> To: beignet at lists.freedesktop.org; i.gnatenko.brain at gmail.com
> Subject: Re: [Beignet] wrong CMAKE_LIBRARY_ARCHITECTURE on x86_64
> 
> That the file gets named intel-beignet-.icd rather than intel-beignet.icd is a
> purely cosmetic issue: it should work either way, but won't support multiarch,
> as the names of .icd files need to be distinct for co-installability but are
> otherwise irrelevant.
> 
> If you want working multiarch, you'll need to actually set
> CMAKE_LIBRARY_ARCHITECTURE (or with the patch,
> BEIGNET_LIBRARY_ARCHITECTURE), e.g. with cmake's -D option; again it
> doesn't matter what you set it to, as long as the values are distinct.
> 
> _______________________________________________
> Beignet mailing list
> Beignet at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/beignet


From zhigang.gong at intel.com  Wed Mar 18 00:08:20 2015
From: zhigang.gong at intel.com (Zhigang Gong)
Date: Wed, 18 Mar 2015 15:08:20 +0800
Subject: [Beignet] [PATCH] runtime: fix a conformance bug in
	cl_get_kernel_arg_info.
Message-ID: <1426662500-30721-1-git-send-email-zhigang.gong@intel.com>

According to OpenCL 1.2 Rev 17:
"CL_KERNEL_ARG_TYPE_CONST is returned if the argument is a pointer and the referenced type is declared with the restrict or const qualifier. For
example, a kernel argument declared as global int const *x returns CL_KERNEL_ARG_TYPE_CONST but a kernel argument declared as global int *
const x does not."

So only need to return CL_KERNEL_ARG_TYPE_CONST for pointer arguments.

Signed-off-by: Zhigang Gong <zhigang.gong at intel.com>
---
 src/cl_kernel.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/cl_kernel.c b/src/cl_kernel.c
index 331d250..28d88b6 100644
--- a/src/cl_kernel.c
+++ b/src/cl_kernel.c
@@ -221,6 +221,7 @@ cl_get_kernel_arg_info(cl_kernel k, cl_uint arg_index, cl_kernel_arg_info param_
   assert(k != NULL);
   void *ret_info = interp_kernel_get_arg_info(k->opaque, arg_index,
                            param_name - CL_KERNEL_ARG_ADDRESS_QUALIFIER);
+  uint32_t arg_type = interp_kernel_get_arg_type(k->opaque, arg_index);
   int str_len = 0;
   cl_kernel_arg_type_qualifier type_qual = CL_KERNEL_ARG_TYPE_NONE;
 
@@ -281,7 +282,10 @@ cl_get_kernel_arg_info(cl_kernel k, cl_uint arg_index, cl_kernel_arg_info param_
     if (param_value_size_ret)
       *param_value_size_ret = sizeof(cl_kernel_arg_type_qualifier);
     if (!param_value) return CL_SUCCESS;
-    if (strstr((char*)ret_info, "const"))
+    if (strstr((char*)ret_info, "const") &&
+         (arg_type == GBE_ARG_GLOBAL_PTR   ||
+          arg_type == GBE_ARG_CONSTANT_PTR ||
+          arg_type == GBE_ARG_LOCAL_PTR))
       type_qual = type_qual | CL_KERNEL_ARG_TYPE_CONST;
     if (strstr((char*)ret_info, "volatile"))
       type_qual = type_qual | CL_KERNEL_ARG_TYPE_VOLATILE;
-- 
1.9.1


From chuanbo.weng at intel.com  Wed Mar 18 01:28:34 2015
From: chuanbo.weng at intel.com (Weng, Chuanbo)
Date: Wed, 18 Mar 2015 08:28:34 +0000
Subject: [Beignet] [PATCH] runtime: fix a conformance bug
	in	cl_get_kernel_arg_info.
In-Reply-To: <1426662500-30721-1-git-send-email-zhigang.gong@intel.com>
References: <1426662500-30721-1-git-send-email-zhigang.gong@intel.com>
Message-ID: <5A0E318D73C83C40A09BDBBE131796D701A1E4F5@shsmsx102.ccr.corp.intel.com>

This patch fixes the failing API/get_kernel_arg_info case in the Khronos OpenCL 1.2 conformance test.

-----Original Message-----
From: Beignet [mailto:beignet-bounces at lists.freedesktop.org] On Behalf Of Zhigang Gong
Sent: Wednesday, March 18, 2015 15:08
To: beignet at lists.freedesktop.org
Cc: Gong, Zhigang
Subject: [Beignet] [PATCH] runtime: fix a conformance bug in cl_get_kernel_arg_info.

According to OpenCL 1.2 Rev 17:
"CL_KERNEL_ARG_TYPE_CONST is returned if the argument is a pointer and the referenced type is declared with the restrict or const qualifier. For example, a kernel argument declared as global int const *x returns CL_KERNEL_ARG_TYPE_CONST but a kernel argument declared as global int * const x does not."

So only need to return CL_KERNEL_ARG_TYPE_CONST for pointer arguments.

Signed-off-by: Zhigang Gong <zhigang.gong at intel.com>
---
 src/cl_kernel.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/cl_kernel.c b/src/cl_kernel.c index 331d250..28d88b6 100644
--- a/src/cl_kernel.c
+++ b/src/cl_kernel.c
@@ -221,6 +221,7 @@ cl_get_kernel_arg_info(cl_kernel k, cl_uint arg_index, cl_kernel_arg_info param_
   assert(k != NULL);
   void *ret_info = interp_kernel_get_arg_info(k->opaque, arg_index,
                            param_name - CL_KERNEL_ARG_ADDRESS_QUALIFIER);
+  uint32_t arg_type = interp_kernel_get_arg_type(k->opaque, arg_index);
   int str_len = 0;
   cl_kernel_arg_type_qualifier type_qual = CL_KERNEL_ARG_TYPE_NONE;
 
@@ -281,7 +282,10 @@ cl_get_kernel_arg_info(cl_kernel k, cl_uint arg_index, cl_kernel_arg_info param_
     if (param_value_size_ret)
       *param_value_size_ret = sizeof(cl_kernel_arg_type_qualifier);
     if (!param_value) return CL_SUCCESS;
-    if (strstr((char*)ret_info, "const"))
+    if (strstr((char*)ret_info, "const") &&
+         (arg_type == GBE_ARG_GLOBAL_PTR   ||
+          arg_type == GBE_ARG_CONSTANT_PTR ||
+          arg_type == GBE_ARG_LOCAL_PTR))
       type_qual = type_qual | CL_KERNEL_ARG_TYPE_CONST;
     if (strstr((char*)ret_info, "volatile"))
       type_qual = type_qual | CL_KERNEL_ARG_TYPE_VOLATILE;
--
1.9.1

_______________________________________________
Beignet mailing list
Beignet at lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/beignet

From zhigang.gong at linux.intel.com  Wed Mar 18 01:02:38 2015
From: zhigang.gong at linux.intel.com (Zhigang Gong)
Date: Wed, 18 Mar 2015 16:02:38 +0800
Subject: [Beignet] [PATCH] Generate NAN for UNDEF value in printf parser.
In-Reply-To: <1426586920-30846-1-git-send-email-junyan.he@inbox.com>
References: <1426586920-30846-1-git-send-email-junyan.he@inbox.com>
Message-ID: <20150318080237.GM21732@ivb-gt2-rev4>

The root cause is that LLVM won't generate NaN for some builtin functions.
LLVM 3.5 will generate 0.0 for SQRT(-1.0) case and LLVM 3.6 will generate
undef.

The related commit of why LLVM won't just simply return NaN for such
case is at:

    Make the sqrt intrinsic return undef for a negative input.

    As discussed here:
    http://lists.cs.uiuc.edu/pipermail/llvm-commits/Week-of-Mon-20140609/220598.html

    And again here:
    http://lists.cs.uiuc.edu/pipermail/llvmdev/2014-September/077168.html

    The sqrt of a negative number when using the llvm intrinsic is undefined.
    We should return undef rather than 0.0 to match the definition in the LLVM IR lang ref.

    This change should not affect any code that isn't using "no-nans-fp-math";
    ie, no-nans is a requirement for generating the llvm intrinsic in place of a sqrt function call.

    Unfortunately, the behavior introduced by this patch will not match current gcc, xlc, icc, and
    possibly other compilers. The current clang/llvm behavior of returning 0.0 doesn't either.
    We knowingly approve of this difference with the other compilers in an attempt to flag code
    that is invoking undefined behavior.

    A front-end warning should also try to convince the user that the program will fail:
    http://llvm.org/bugs/show_bug.cgi?id=21093

    Differential Revision: http://reviews.llvm.org/D5527

This patch is a workaround for the following scenario:
printf("%f \n", sqrt(-1.0f));

Don't see any chance it will be fixed within LLVM soon, so I will push
this patch.

Thanks.

On Tue, Mar 17, 2015 at 06:08:40PM +0800, junyan.he at inbox.com wrote:
> From: Junyan He <junyan.he at linux.intel.com>
> 
> llvm 3.6 will give an UNDEF value for NAN. This will cause
> the store instruction for UNDEF to be ignored. We need
> to modify it to NAN here.
> 
> Signed-off-by: Junyan He <junyan.he at linux.intel.com>
> ---
>  backend/src/llvm/llvm_printf_parser.cpp |    6 ++++++
>  1 file changed, 6 insertions(+)
> 
> diff --git a/backend/src/llvm/llvm_printf_parser.cpp b/backend/src/llvm/llvm_printf_parser.cpp
> index 7800c01..2f85443 100644
> --- a/backend/src/llvm/llvm_printf_parser.cpp
> +++ b/backend/src/llvm/llvm_printf_parser.cpp
> @@ -831,6 +831,12 @@ error:
>  
>        case Type::DoubleTyID:
>        case Type::FloatTyID: {
> +        /* llvm 3.6 will give a undef value for NAN. */
> +        if (dyn_cast<llvm::UndefValue>(arg)) {
> +          APFloat nan = APFloat::getNaN(APFloat::IEEEsingle, false);
> +          arg = ConstantFP::get(module->getContext(), nan);
> +        }
> +
>          /* Because the printf is a variable parameter function, it does not have the
>             function prototype, so the compiler will always promote the arg to the
>             longest precise type for float. So here, we can always find it is double. */
> -- 
> 1.7.9.5
> 
> _______________________________________________
> Beignet mailing list
> Beignet at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/beignet

From damien.lespiau at intel.com  Wed Mar 18 10:11:34 2015
From: damien.lespiau at intel.com (Damien Lespiau)
Date: Wed, 18 Mar 2015 17:11:34 +0000
Subject: [Beignet] [PATCH] intel: Export total subslice and EU counts
In-Reply-To: <1425339567-18933-1-git-send-email-jeff.mcgee@intel.com>
References: <1425339567-18933-1-git-send-email-jeff.mcgee@intel.com>
Message-ID: <20150318171134.GC25699@strange.ger.corp.intel.com>

On Mon, Mar 02, 2015 at 03:39:27PM -0800, jeff.mcgee at intel.com wrote:
> From: Jeff McGee <jeff.mcgee at intel.com>

2 small details, but otherwise:

Reviewed-by: Damien Lespiau <damien.lespiau at intel.com>


> Update kernel interface with new I915_GETPARAM ioctl entries for
> subslice total and EU total. Add a wrapping function for each
> parameter. Userspace drivers need these values when constructing
> GPGPU commands. This kernel query method is intended to replace
> the PCI ID-based tables that userspace drivers currently maintain.
> The kernel driver can employ fuse register reads as needed to
> ensure the most accurate determination of GT config attributes.
> This first became important with Cherryview in which the config
> could differ between devices with the same PCI ID.
> 
> The kernel detection of these values is device-specific. Userspace
> drivers should continue to maintain ID-based tables for older
> devices which return ENODEV when using this query.

This should probably be part of some comment near the API entry point.

> 
> For: VIZ-4636
> Signed-off-by: Jeff McGee <jeff.mcgee at intel.com>
> ---
>  include/drm/i915_drm.h   |  2 ++
>  intel/intel_bufmgr.h     |  4 ++++
>  intel/intel_bufmgr_gem.c | 31 +++++++++++++++++++++++++++++++
>  3 files changed, 37 insertions(+)
> 
> diff --git a/include/drm/i915_drm.h b/include/drm/i915_drm.h
> index 15dd01d..e34f5b2 100644
> --- a/include/drm/i915_drm.h
> +++ b/include/drm/i915_drm.h
> @@ -340,6 +340,8 @@ typedef struct drm_i915_irq_wait {
>  #define I915_PARAM_HAS_EXEC_HANDLE_LUT   26
>  #define I915_PARAM_HAS_WT     	 	 27
>  #define I915_PARAM_CMD_PARSER_VERSION	 28
> +#define I915_PARAM_SUBSLICE_TOTAL	 32
> +#define I915_PARAM_EU_TOTAL		 33
>  
>  typedef struct drm_i915_getparam {
>  	int param;
> diff --git a/intel/intel_bufmgr.h b/intel/intel_bufmgr.h
> index be83a56..4b2472e 100644
> --- a/intel/intel_bufmgr.h
> +++ b/intel/intel_bufmgr.h
> @@ -37,6 +37,7 @@
>  #include <stdio.h>
>  #include <stdint.h>
>  #include <stdio.h>
> +#include <stdbool.h>

But you don't seem to use bool or _Bool in the rest of the patch?

>  struct drm_clip_rect;
>  
> @@ -264,6 +265,9 @@ int drm_intel_get_reset_stats(drm_intel_context *ctx,
>  			      uint32_t *active,
>  			      uint32_t *pending);
>  
> +int drm_intel_get_subslice_total(int fd, unsigned int *subslice_total);
> +int drm_intel_get_eu_total(int fd, unsigned int *eu_total);
> +
>  /** @{ Compatibility defines to keep old code building despite the symbol rename
>   * from dri_* to drm_intel_*
>   */
> diff --git a/intel/intel_bufmgr_gem.c b/intel/intel_bufmgr_gem.c
> index 78875fd..2d77f32 100644
> --- a/intel/intel_bufmgr_gem.c
> +++ b/intel/intel_bufmgr_gem.c
> @@ -3292,6 +3292,37 @@ drm_intel_reg_read(drm_intel_bufmgr *bufmgr,
>  	return ret;
>  }
>  
> +drm_public int
> +drm_intel_get_subslice_total(int fd, unsigned int *subslice_total)
> +{
> +	drm_i915_getparam_t gp;
> +	int ret;
> +
> +	memclear(gp);
> +	gp.value = (int*)subslice_total;
> +	gp.param = I915_PARAM_SUBSLICE_TOTAL;
> +	ret = drmIoctl(fd, DRM_IOCTL_I915_GETPARAM, &gp);
> +	if (ret)
> +		return -errno;
> +
> +	return 0;
> +}
> +
> +drm_public int
> +drm_intel_get_eu_total(int fd, unsigned int *eu_total)
> +{
> +	drm_i915_getparam_t gp;
> +	int ret;
> +
> +	memclear(gp);
> +	gp.value = (int*)eu_total;
> +	gp.param = I915_PARAM_EU_TOTAL;
> +	ret = drmIoctl(fd, DRM_IOCTL_I915_GETPARAM, &gp);
> +	if (ret)
> +		return -errno;
> +
> +	return 0;
> +}
>  
>  /**
>   * Annotate the given bo for use in aub dumping.
> -- 
> 2.3.0
> 
> _______________________________________________
> dri-devel mailing list
> dri-devel at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/dri-devel

From damien.lespiau at intel.com  Wed Mar 18 11:18:31 2015
From: damien.lespiau at intel.com (Damien Lespiau)
Date: Wed, 18 Mar 2015 18:18:31 +0000
Subject: [Beignet] [Intel-gfx] [PATCH 1/2 v2] intel: Export total
 subslice and EU counts
In-Reply-To: <1425942784-27957-1-git-send-email-jeff.mcgee@intel.com>
References: <1425339567-18933-1-git-send-email-jeff.mcgee@intel.com>
 <1425942784-27957-1-git-send-email-jeff.mcgee@intel.com>
Message-ID: <20150318181831.GE25699@strange.ger.corp.intel.com>

On Mon, Mar 09, 2015 at 04:13:03PM -0700, jeff.mcgee at intel.com wrote:
> From: Jeff McGee <jeff.mcgee at intel.com>
> 
> Update kernel interface with new I915_GETPARAM ioctl entries for
> subslice total and EU total. Add a wrapping function for each
> parameter. Userspace drivers need these values when constructing
> GPGPU commands. This kernel query method is intended to replace
> the PCI ID-based tables that userspace drivers currently maintain.
> The kernel driver can employ fuse register reads as needed to
> ensure the most accurate determination of GT config attributes.
> This first became important with Cherryview in which the config
> could differ between devices with the same PCI ID.
> 
> The kernel detection of these values is device-specific. Userspace
> drivers should continue to maintain ID-based tables for older
> devices which return ENODEV when using this query.
> 
> v2: remove unnecessary include of <stdbool.h> and increment the
>     I915_GETPARAM indices to match updated kernel patch.
> 
> For: VIZ-4636
> Signed-off-by: Jeff McGee <jeff.mcgee at intel.com>

Pushed to libdrm.

-- 
Damien

From ruiling.song at intel.com  Wed Mar 18 20:22:42 2015
From: ruiling.song at intel.com (Song, Ruiling)
Date: Thu, 19 Mar 2015 03:22:42 +0000
Subject: [Beignet] [Intel-gfx] Preventing zero GPU virtual address
 allocation
In-Reply-To: <5508449C.50705@virtuousgeek.org>
References: <20150305152758.GI18775@phenom.ffwll.local>
 <DF876E69000F0E4DB19B760E3EBA5C7501C5F9C1@SHSMSX103.ccr.corp.intel.com>
 <20150306083919.GH18784@nuc-i3427.alporthouse.com>
 <DF876E69000F0E4DB19B760E3EBA5C7501C607BC@SHSMSX103.ccr.corp.intel.com>
 <20150309120218.GD23680@nuc-i3427.alporthouse.com>
 <5502A9A0.1000209@intel.com> <20150313092738.GD3800@phenom.ffwll.local>
 <20150313165847.GA31491@nuc-i3427.alporthouse.com>
 <20150313171339.GL3800@phenom.ffwll.local>
 <148B1B7A67D1C24B9EF0BE42EA4977062B7F5DFE@SHSMSX101.ccr.corp.intel.com>
 <20150316085214.GF21993@phenom.ffwll.local>
 <550738B4.8060405@virtuousgeek.org>
 <DF876E69000F0E4DB19B760E3EBA5C7501C88E38@SHSMSX103.ccr.corp.intel.com>
 <5508449C.50705@virtuousgeek.org>
Message-ID: <148B1B7A67D1C24B9EF0BE42EA4977062B7F7BB2@SHSMSX101.ccr.corp.intel.com>


> Yeah, MAP_FIXED sounds a bit more ambitious and though I think it would
> work for OCL 2.0 pointer sharing, it's a little different than we were planning.
> To summarize, we have three possible approaches, each with its own
> problems:
>   1) simple patch to avoid binding at address 0 in PPGTT:
>      does impact the ABI (though generally not in a harmful way), and
>      may not be possible with aliasing PPGTT with e.g. framebuffers
>      bound at offset 0
>   2) exposing PIN_BIAS to userspace
>      Would allow userspace to avoid pinning any buffers at offset 0 at
>      execbuf time, but still has the problem with previously bound buffers
>      and aliasing PPGTT
>   3) MAP_FIXED interface
>      Flexible approach allowing userspace to manage its own virtual
>      memory, but still has the same issues with aliasing PPGTT, and with
>      shared contexts, which would have to negotiate between libraries
> how to
>      handle the zero page
> 
> For (1) and (2) the kernel pieces are really already in place, the main thing we
> need is a new flag to userspace to indicate behavior.  I'd prefer (1) with a
> context creation flag to indicate "don't bind at 0".
> Execbuf would try to honor this, and userspace could check if any buffers
> ended up at 0 in the aliasing PPGTT case by checking the resulting offsets
> following the call.  I expect in most cases this would be fine.
> 
> It should be pretty easy to extend Ruiling's patch to use a context flag to
> determine the behavior; is that something you can do?  Any objections to
> this approach?

I am ok with adding a context flag to indicate "don't bind at 0". Any objections from others?
The patch is not from me, it is from David. I am not familiar with KMD. David, could you help on this patch?

> It does mean that shared contexts need to be handled specially, or won't get
> the 0 page protection, but I think Mesa wants this behavior too, and libva
> probably wouldn't mind, so you could just require new versions of those that
> set this flag when telling people what's supported for proper NULL pointer
> handling.
> 
> Any objections to that approach?
> 
> Thanks,
> Jesse

From xionghu.luo at intel.com  Thu Mar 19 01:14:30 2015
From: xionghu.luo at intel.com (xionghu.luo at intel.com)
Date: Thu, 19 Mar 2015 16:14:30 +0800
Subject: [Beignet] [PATCH 1/2] [opencl-2.0] clCreateSampler replaced by
	clCreateSamplerWithProperties.
Message-ID: <1426752871-21915-1-git-send-email-xionghu.luo@intel.com>

From: Luo Xionghu <xionghu.luo at intel.com>

api upgrade for opencl-2.0.
set sampler normalized_coords default value per spec.
---
 src/cl_api.c     | 27 ++++++++++++++++++++++-----
 src/cl_khr_icd.c |  2 +-
 2 files changed, 23 insertions(+), 6 deletions(-)

diff --git a/src/cl_api.c b/src/cl_api.c
index 3e72deb..cd4020e 100644
--- a/src/cl_api.c
+++ b/src/cl_api.c
@@ -753,15 +753,31 @@ error:
 }
 
 cl_sampler
-clCreateSampler(cl_context         context,
-                cl_bool            normalized,
-                cl_addressing_mode addressing,
-                cl_filter_mode     filter,
-                cl_int *           errcode_ret)
+clCreateSamplerWithProperties(cl_context                      context,
+                              const cl_sampler_properties *   normalized_coords,
+                              cl_int *                        errcode_ret)
 {
   cl_sampler sampler = NULL;
   cl_int err = CL_SUCCESS;
   CHECK_CONTEXT (context);
+  cl_bool normalized = CL_TRUE;
+  cl_addressing_mode addressing = CL_ADDRESS_CLAMP;
+  cl_filter_mode filter = CL_FILTER_NEAREST;
+
+  while(*normalized_coords) {
+    switch (*normalized_coords) {
+    case CL_SAMPLER_NORMALIZED_COORDS:
+      normalized = *(normalized_coords + 1);
+      break;
+    case CL_SAMPLER_ADDRESSING_MODE:
+      addressing = *(normalized_coords + 1);
+      break;
+    case CL_SAMPLER_FILTER_MODE:
+      filter = *(normalized_coords + 1);
+      break;
+    }
+    normalized_coords += 2;
+  }
   sampler = cl_sampler_new(context, normalized, addressing, filter, &err);
 error:
   if (errcode_ret)
@@ -769,6 +785,7 @@ error:
   return sampler;
 }
 
+
 cl_int
 clRetainSampler(cl_sampler sampler)
 {
diff --git a/src/cl_khr_icd.c b/src/cl_khr_icd.c
index 8715bbd..703d875 100644
--- a/src/cl_khr_icd.c
+++ b/src/cl_khr_icd.c
@@ -68,7 +68,7 @@ struct _cl_icd_dispatch const cl_khr_icd_dispatch = {
   clGetSupportedImageFormats,
   clGetMemObjectInfo,
   clGetImageInfo,
-  clCreateSampler,
+  clCreateSamplerWithProperties,
   clRetainSampler,
   clReleaseSampler,
   clGetSamplerInfo,
-- 
1.9.1


From xionghu.luo at intel.com  Thu Mar 19 01:14:31 2015
From: xionghu.luo at intel.com (xionghu.luo at intel.com)
Date: Thu, 19 Mar 2015 16:14:31 +0800
Subject: [Beignet] [PATCH 2/2] [opencl-2.0] sampler API upgrade for utest.
In-Reply-To: <1426752871-21915-1-git-send-email-xionghu.luo@intel.com>
References: <1426752871-21915-1-git-send-email-xionghu.luo@intel.com>
Message-ID: <1426752871-21915-2-git-send-email-xionghu.luo@intel.com>

From: Luo Xionghu <xionghu.luo at intel.com>

use clCreateSamplerWithProperties introduced by opencl-2.0.
the CL_SAMPLER_NORMALIZED_COORDS should be CL_FALSE for these cases.

Signed-off-by: Luo Xionghu <xionghu.luo at intel.com>
---
 utests/compare_image_2d_and_1d_array.cpp | 7 ++++++-
 utests/compiler_copy_image.cpp           | 8 +++++++-
 utests/compiler_copy_image1.cpp          | 8 +++++++-
 utests/compiler_copy_image_1d.cpp        | 8 +++++++-
 utests/compiler_copy_image_3d.cpp        | 7 ++++++-
 utests/compiler_movforphi_undef.cpp      | 7 ++++++-
 utests/image_1D_buffer.cpp               | 7 ++++++-
 utests/utest_helper.hpp                  | 4 ++--
 8 files changed, 47 insertions(+), 9 deletions(-)

diff --git a/utests/compare_image_2d_and_1d_array.cpp b/utests/compare_image_2d_and_1d_array.cpp
index dfa4273..b66aa5e 100644
--- a/utests/compare_image_2d_and_1d_array.cpp
+++ b/utests/compare_image_2d_and_1d_array.cpp
@@ -49,7 +49,12 @@ static void compare_image_2d_and_1d_array(void)
   desc.image_row_pitch = w * sizeof(uint32_t);
   OCL_CREATE_IMAGE(buf[1], CL_MEM_COPY_HOST_PTR, &format, &desc, image_data2);
 
-  OCL_CREATE_SAMPLER(sampler, CL_ADDRESS_REPEAT, CL_FILTER_LINEAR);
+  cl_sampler_properties properties[] = {
+    CL_SAMPLER_NORMALIZED_COORDS, CL_FALSE,
+    CL_SAMPLER_ADDRESSING_MODE, CL_ADDRESS_REPEAT,
+    CL_SAMPLER_FILTER_MODE, CL_FILTER_LINEAR,
+    0 };
+  OCL_CREATE_SAMPLER_WITH_PROPERTIES(sampler, properties);
 
   // Setup kernel and images
   OCL_CREATE_KERNEL("compare_image_2d_and_1d_array");
diff --git a/utests/compiler_copy_image.cpp b/utests/compiler_copy_image.cpp
index 150fd8a..2c58729 100644
--- a/utests/compiler_copy_image.cpp
+++ b/utests/compiler_copy_image.cpp
@@ -29,7 +29,13 @@ static void compiler_copy_image(void)
 
   desc.image_row_pitch = 0;
   OCL_CREATE_IMAGE(buf[1], 0, &format, &desc, NULL);
-  OCL_CREATE_SAMPLER(sampler, CL_ADDRESS_REPEAT, CL_FILTER_NEAREST);
+
+  cl_sampler_properties properties[] = {
+    CL_SAMPLER_NORMALIZED_COORDS, false,
+    CL_SAMPLER_ADDRESSING_MODE, CL_ADDRESS_REPEAT,
+    CL_SAMPLER_FILTER_MODE, CL_FILTER_NEAREST,
+    0 };
+  OCL_CREATE_SAMPLER_WITH_PROPERTIES(sampler, properties);
   free(buf_data[0]);
   buf_data[0] = NULL;
 
diff --git a/utests/compiler_copy_image1.cpp b/utests/compiler_copy_image1.cpp
index 659dddc..0b92d81 100644
--- a/utests/compiler_copy_image1.cpp
+++ b/utests/compiler_copy_image1.cpp
@@ -26,7 +26,13 @@ static void compiler_copy_image1(void)
   desc.image_height = h;
   desc.image_row_pitch = w * sizeof(uint32_t);
   OCL_CREATE_IMAGE(buf[0], CL_MEM_COPY_HOST_PTR, &format, &desc, buf_data[0]);
-  OCL_CREATE_SAMPLER(sampler, CL_ADDRESS_REPEAT, CL_FILTER_NEAREST);
+
+  cl_sampler_properties properties[] = {
+    CL_SAMPLER_NORMALIZED_COORDS, CL_FALSE,
+    CL_SAMPLER_ADDRESSING_MODE, CL_ADDRESS_REPEAT,
+    CL_SAMPLER_FILTER_MODE, CL_FILTER_NEAREST,
+    0 };
+  OCL_CREATE_SAMPLER_WITH_PROPERTIES(sampler, properties);
 
   desc.image_row_pitch = 0;
   OCL_CREATE_IMAGE(buf[1], 0, &format, &desc, NULL);
diff --git a/utests/compiler_copy_image_1d.cpp b/utests/compiler_copy_image_1d.cpp
index 5af6a77..2b81f87 100644
--- a/utests/compiler_copy_image_1d.cpp
+++ b/utests/compiler_copy_image_1d.cpp
@@ -26,7 +26,13 @@ static void compiler_copy_image_1d(void)
 
   desc.image_row_pitch = 0;
   OCL_CREATE_IMAGE(buf[1], 0, &format, &desc, NULL);
-  OCL_CREATE_SAMPLER(sampler, CL_ADDRESS_REPEAT, CL_FILTER_NEAREST);
+
+  cl_sampler_properties properties[] = {
+    CL_SAMPLER_NORMALIZED_COORDS, CL_FALSE,
+    CL_SAMPLER_ADDRESSING_MODE, CL_ADDRESS_REPEAT,
+    CL_SAMPLER_FILTER_MODE, CL_FILTER_NEAREST,
+    0 };
+  OCL_CREATE_SAMPLER_WITH_PROPERTIES(sampler, properties);
   free(buf_data[0]);
   buf_data[0] = NULL;
 
diff --git a/utests/compiler_copy_image_3d.cpp b/utests/compiler_copy_image_3d.cpp
index de7cd45..81bc6c6 100644
--- a/utests/compiler_copy_image_3d.cpp
+++ b/utests/compiler_copy_image_3d.cpp
@@ -40,7 +40,12 @@ static void compiler_copy_image_3d(void)
   for(uint32_t i = 0; i < depth; i++)
    OCL_CREATE_IMAGE(buf[2 + i], 0, &format, &desc, NULL);
 
-  OCL_CREATE_SAMPLER(sampler, CL_ADDRESS_REPEAT, CL_FILTER_NEAREST);
+  cl_sampler_properties properties[] = {
+    CL_SAMPLER_NORMALIZED_COORDS, CL_FALSE,
+    CL_SAMPLER_ADDRESSING_MODE, CL_ADDRESS_REPEAT,
+    CL_SAMPLER_FILTER_MODE, CL_FILTER_NEAREST,
+    0 };
+  OCL_CREATE_SAMPLER_WITH_PROPERTIES(sampler, properties);
   free(buf_data[0]);
   buf_data[0] = NULL;
 
diff --git a/utests/compiler_movforphi_undef.cpp b/utests/compiler_movforphi_undef.cpp
index 8f1e66e..5f4b9fc 100644
--- a/utests/compiler_movforphi_undef.cpp
+++ b/utests/compiler_movforphi_undef.cpp
@@ -27,7 +27,12 @@ static void compiler_movforphi_undef(void)
 
   desc.image_row_pitch = 0;
   OCL_CREATE_IMAGE(buf[1], 0, &format, &desc, NULL);
-  OCL_CREATE_SAMPLER(sampler, CL_ADDRESS_REPEAT, CL_FILTER_NEAREST);
+  cl_sampler_properties properties[] = {
+    CL_SAMPLER_NORMALIZED_COORDS, CL_FALSE,
+    CL_SAMPLER_ADDRESSING_MODE, CL_ADDRESS_REPEAT,
+    CL_SAMPLER_FILTER_MODE, CL_FILTER_NEAREST,
+    0 };
+  OCL_CREATE_SAMPLER_WITH_PROPERTIES(sampler, properties);
   free(buf_data[0]);
   buf_data[0] = NULL;
 
diff --git a/utests/image_1D_buffer.cpp b/utests/image_1D_buffer.cpp
index d8d761f..1bcd2ef 100644
--- a/utests/image_1D_buffer.cpp
+++ b/utests/image_1D_buffer.cpp
@@ -49,7 +49,12 @@ void image_1D_buffer(void)
   OCL_ASSERT(error == CL_SUCCESS);
 
   // Create sampler to use
-  sampler = clCreateSampler(ctx, false, CL_ADDRESS_NONE, CL_FILTER_NEAREST, &error );
+  cl_sampler_properties properties[] = {
+    CL_SAMPLER_NORMALIZED_COORDS, CL_FALSE,
+    CL_SAMPLER_ADDRESSING_MODE, CL_ADDRESS_NONE,
+    CL_SAMPLER_FILTER_MODE, CL_FILTER_NEAREST,
+    0 };
+  OCL_CREATE_SAMPLER_WITH_PROPERTIES(sampler, properties);
   OCL_ASSERT(error == CL_SUCCESS);
 
   cl_mem result_buf = buf[0] = clCreateBuffer(ctx, 0, buffer_sz, NULL, &error);
diff --git a/utests/utest_helper.hpp b/utests/utest_helper.hpp
index 6d09766..b7115e0 100644
--- a/utests/utest_helper.hpp
+++ b/utests/utest_helper.hpp
@@ -119,8 +119,8 @@ extern EGLSurface  eglSurface;
 #define OCL_SWAP_EGL_BUFFERS() \
   eglSwapBuffers(eglDisplay, eglSurface);
 
-#define OCL_CREATE_SAMPLER(SAMPLER, ADDRESS_MODE, FILTER_MODE)          \
-    OCL_CALL2(clCreateSampler, SAMPLER, ctx, 0, ADDRESS_MODE, FILTER_MODE)
+#define OCL_CREATE_SAMPLER_WITH_PROPERTIES(SAMPLER, PROPERTIES)          \
+    OCL_CALL2(clCreateSamplerWithProperties, SAMPLER, ctx, PROPERTIES)
 
 #define OCL_MAP_BUFFER(ID) \
     OCL_CALL2(clMapBufferIntel, buf_data[ID], buf[ID])
-- 
1.9.1


From david.weinehall at linux.intel.com  Thu Mar 19 03:09:53 2015
From: david.weinehall at linux.intel.com (David Weinehall)
Date: Thu, 19 Mar 2015 12:09:53 +0200
Subject: [Beignet] [Intel-gfx] Preventing zero GPU virtual address
 allocation
In-Reply-To: <148B1B7A67D1C24B9EF0BE42EA4977062B7F7BB2@SHSMSX101.ccr.corp.intel.com>
References: <5502A9A0.1000209@intel.com>
 <20150313092738.GD3800@phenom.ffwll.local>
 <20150313165847.GA31491@nuc-i3427.alporthouse.com>
 <20150313171339.GL3800@phenom.ffwll.local>
 <148B1B7A67D1C24B9EF0BE42EA4977062B7F5DFE@SHSMSX101.ccr.corp.intel.com>
 <20150316085214.GF21993@phenom.ffwll.local>
 <550738B4.8060405@virtuousgeek.org>
 <DF876E69000F0E4DB19B760E3EBA5C7501C88E38@SHSMSX103.ccr.corp.intel.com>
 <5508449C.50705@virtuousgeek.org>
 <148B1B7A67D1C24B9EF0BE42EA4977062B7F7BB2@SHSMSX101.ccr.corp.intel.com>
Message-ID: <20150319100953.GA10425@boom>

On Thu, Mar 19, 2015 at 03:22:42AM +0000, Song, Ruiling wrote:
> 
> > Yeah, MAP_FIXED sounds a bit more ambitious and though I think it would
> > work for OCL 2.0 pointer sharing, it's a little different than we were planning.
> > To summarize, we have three possible approaches, each with its own
> > problems:
> >   1) simple patch to avoid binding at address 0 in PPGTT:
> >      does impact the ABI (though generally not in a harmful way), and
> >      may not be possible with aliasing PPGTT with e.g. framebuffers
> >      bound at offset 0
> >   2) exposing PIN_BIAS to userspace
> >      Would allow userspace to avoid pinning any buffers at offset 0 at
> >      execbuf time, but still has the problem with previously bound buffers
> >      and aliasing PPGTT
> >   3) MAP_FIXED interface
> >      Flexible approach allowing userspace to manage its own virtual
> >      memory, but still has the same issues with aliasing PPGTT, and with
> >      shared contexts, which would have to negotiate between libraries
> > how to
> >      handle the zero page
> > 
> > For (1) and (2) the kernel pieces are really already in place, the main thing we
> > need is a new flag to userspace to indicate behavior.  I'd prefer (1) with a
> > context creation flag to indicate "don't bind at 0".
> > Execbuf would try to honor this, and userspace could check if any buffers
> > ended up at 0 in the aliasing PPGTT case by checking the resulting offsets
> > following the call.  I expect in most cases this would be fine.
> > 
> > It should be pretty easy to extend Ruiling's patch to use a context flag to
> > determine the behavior; is that something you can do?  Any objections to
> > this approach?
> 
> I am ok with adding a context flag to indicate "don't bind at 0". Any objections from others?
> The patch is not from me, it is from David. I am not familiar with KMD. David, could you help on this patch?

Yup, assuming, of course, that such an approach is acceptable.


Kind regards, David

From daniel at ffwll.ch  Thu Mar 19 07:58:49 2015
From: daniel at ffwll.ch (Daniel Vetter)
Date: Thu, 19 Mar 2015 15:58:49 +0100
Subject: [Beignet] [Intel-gfx] Preventing zero GPU virtual address
 allocation
In-Reply-To: <20150319100953.GA10425@boom>
References: <20150313092738.GD3800@phenom.ffwll.local>
 <20150313165847.GA31491@nuc-i3427.alporthouse.com>
 <20150313171339.GL3800@phenom.ffwll.local>
 <148B1B7A67D1C24B9EF0BE42EA4977062B7F5DFE@SHSMSX101.ccr.corp.intel.com>
 <20150316085214.GF21993@phenom.ffwll.local>
 <550738B4.8060405@virtuousgeek.org>
 <DF876E69000F0E4DB19B760E3EBA5C7501C88E38@SHSMSX103.ccr.corp.intel.com>
 <5508449C.50705@virtuousgeek.org>
 <148B1B7A67D1C24B9EF0BE42EA4977062B7F7BB2@SHSMSX101.ccr.corp.intel.com>
 <20150319100953.GA10425@boom>
Message-ID: <20150319145849.GU31422@phenom.ffwll.local>

On Thu, Mar 19, 2015 at 12:09:53PM +0200, David Weinehall wrote:
> On Thu, Mar 19, 2015 at 03:22:42AM +0000, Song, Ruiling wrote:
> > 
> > > Yeah, MAP_FIXED sounds a bit more ambitious and though I think it would
> > > work for OCL 2.0 pointer sharing, it's a little different than we were planning.
> > > To summarize, we have three possible approaches, each with its own
> > > problems:
> > >   1) simple patch to avoid binding at address 0 in PPGTT:
> > >      does impact the ABI (though generally not in a harmful way), and
> > >      may not be possible with aliasing PPGTT with e.g. framebuffers
> > >      bound at offset 0
> > >   2) exposing PIN_BIAS to userspace
> > >      Would allow userspace to avoid pinning any buffers at offset 0 at
> > >      execbuf time, but still has the problem with previously bound buffers
> > >      and aliasing PPGTT
> > >   3) MAP_FIXED interface
> > >      Flexible approach allowing userspace to manage its own virtual
> > >      memory, but still has the same issues with aliasing PPGTT, and with
> > >      shared contexts, which would have to negotiate between libraries
> > > how to
> > >      handle the zero page
> > > 
> > > For (1) and (2) the kernel pieces are really already in place, the main thing we
> > > need is a new flag to userspace to indicate behavior.  I'd prefer (1) with a
> > > context creation flag to indicate "don't bind at 0".
> > > Execbuf would try to honor this, and userspace could check if any buffers
> > > ended up at 0 in the aliasing PPGTT case by checking the resulting offsets
> > > following the call.  I expect in most cases this would be fine.
> > > 
> > > It should be pretty easy to extend Ruiling's patch to use a context flag to
> > > determine the behavior; is that something you can do?  Any objections to
> > > this approach?
> > 
> > I am ok with adding a context flag to indicate "don't bind at 0". Any objections from others?
> > The patch is not from me, it is from David. I am not familiar with KMD. David, could you help on this patch?
> 
> Yup, assuming, of course, that such an approach is acceptable.

Yeah my big concern was with not making this opt-in like the old patch or
adding an interface which does a lot more than what we need right now
(Chris' patch). Just a bitflag to ask for this seems best and is fine with
me.

And for the implementation I think we should reuse the PIN_BIAS logic
since that'll work in all places where it's possible. One open from my
side is how we should handle failures to move buffers (in case they ended
up at 0 somehow) - we can either silently fail or return an error to
userspace.

Note that this is only possible if you render to an egl image from ocl,
and if that egl image is a pinned frontbuffer and if we don't have full
ppgtt support. I don't know what the spec requires us to do here, or
whether we should care at all.
-Daniel
-- 
Daniel Vetter
Software Engineer, Intel Corporation
http://blog.ffwll.ch

From brian at arrayfire.com  Thu Mar 19 11:07:54 2015
From: brian at arrayfire.com (Brian Kloppenborg)
Date: Thu, 19 Mar 2015 14:07:54 -0400
Subject: [Beignet] [PATCH] BUGFIX: Prohibit 'make package' from doing system
	install of ICD vendor file
Message-ID: <1426788474-3046-1-git-send-email-brian@arrayfire.com>

As presently written, a 'make package' will attempt to INSTALL the
Beignet ICD loader to /etc/OpenCL/vendors whereas it should just
do a local install and then package the file. The proposed change instructs
CPack to include the `DESTDIR` variable when it calls `make install`, thus
directing the destination for the ICD loader to a local directory instead
of a system path.
---
 CMakeLists.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index e11a3d0..a230e4b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -234,6 +234,7 @@ IF(BUILD_EXAMPLES)
 ADD_SUBDIRECTORY(examples)
 ENDIF(BUILD_EXAMPLES)
 
+SET(CPACK_SET_DESTDIR ON)
 SET(CPACK_PACKAGE_VERSION_MAJOR "${LIBCL_DRIVER_VERSION_MAJOR}")
 SET(CPACK_PACKAGE_VERSION_MINOR "${LIBCL_DRIVER_VERSION_MINOR}")
 SET(CPACK_PACKAGE_VERSION_PATCH "${LIBCL_DRIVER_VERSION_PATCH}")
-- 
2.1.0


From david.couturier at polymtl.ca  Thu Mar 19 17:19:49 2015
From: david.couturier at polymtl.ca (David Couturier)
Date: Thu, 19 Mar 2015 20:19:49 -0400
Subject: [Beignet] [PATCH] Fix: Event callback that not executed when
 command already marked CL_COMPLETE
Message-ID: <550B67A5.6070207@polymtl.ca>

When trying to register a callback on the clEnqueueReadBuffer command,
since it is processed
synchronously all the time, the command was marked CL_COMPLETE every
time. If the event returned
by clEnqueueReadBuffer was then used to register a callback function,
cl_event_set_callback did
not check whether the callback needed to be executed right away.

Fixed by adding a check at the end of the cl_event_set_callback function.

All tests passed.

Signed-off-by: David Couturier <david.couturier at polymtl.ca>
---
  src/cl_event.c | 15 +++++++++++++++
  1 file changed, 15 insertions(+)

diff --git a/src/cl_event.c b/src/cl_event.c
index f70e531..df4a5a5 100644
--- a/src/cl_event.c
+++ b/src/cl_event.c
@@ -183,6 +183,21 @@ cl_int cl_event_set_callback(cl_event event ,
    cb->next        = event->user_cb;
    event->user_cb  = cb;

+  // It is possible that the event enqueued is already completed.
+  // clEnqueueReadBuffer can be synchronious and when the callback
+  // is registered after, it still needs to get executed.
+  if(event->status == CL_COMPLETE) {
+         /* Call user callback */
+         user_callback *user_cb = event->user_cb;
+         while(user_cb) {
+                 if(user_cb->status >= CL_COMPLETE) {
+                         user_cb->executed = CL_TRUE;
+                         user_cb->pfn_notify(event, event->status, 
user_cb->user_data);
+                 }
+                 user_cb = user_cb->next;
+         }
+  }
+
  exit:
    return err;
  error:
-- 
1.9.1

From ruiling.song at intel.com  Thu Mar 19 20:01:43 2015
From: ruiling.song at intel.com (Song, Ruiling)
Date: Fri, 20 Mar 2015 03:01:43 +0000
Subject: [Beignet] [Intel-gfx] Preventing zero GPU virtual address
 allocation
In-Reply-To: <20150319145849.GU31422@phenom.ffwll.local>
References: <20150313092738.GD3800@phenom.ffwll.local>
 <20150313165847.GA31491@nuc-i3427.alporthouse.com>
 <20150313171339.GL3800@phenom.ffwll.local>
 <148B1B7A67D1C24B9EF0BE42EA4977062B7F5DFE@SHSMSX101.ccr.corp.intel.com>
 <20150316085214.GF21993@phenom.ffwll.local>
 <550738B4.8060405@virtuousgeek.org>
 <DF876E69000F0E4DB19B760E3EBA5C7501C88E38@SHSMSX103.ccr.corp.intel.com>
 <5508449C.50705@virtuousgeek.org>
 <148B1B7A67D1C24B9EF0BE42EA4977062B7F7BB2@SHSMSX101.ccr.corp.intel.com>
 <20150319100953.GA10425@boom> <20150319145849.GU31422@phenom.ffwll.local>
Message-ID: <148B1B7A67D1C24B9EF0BE42EA4977062B7F8F5C@SHSMSX101.ccr.corp.intel.com>


> Yeah my big concern was with not making this opt-in like the old patch or
> adding an interface which does a lot more than what we need right now
> (Chris' patch). Just a bitflag to ask for this seems best and is fine with me.
> 
> And for the implementation I think we should reuse the PIN_BIAS logic since
> that'll work in all places where it's possible. One open from my side is how
> we should handle failures to move buffers (in case they ended up at 0
> somehow) - we can either silently fail or return an error to userspace.
> 
> Note that this is only possible if you render to an elg image from ocl, and if
> that egl image is a pinned frontbuffer and if we don't have full ppgtt support.
> I don't know what the spec requires us to do here, or whether we should
> care at all.
So the situation you mentioned only comes up when a pinned buffer is under the global GTT?
Under the global GTT, a buffer would rarely be bound at offset 0; in fact, the most common cases are under PPGTT.
So I think silently ignoring the move of a pinned buffer under the global GTT is acceptable. I will add the check for the bound-at-zero case in Beignet.

Ruiling
> -Daniel
> --
> Daniel Vetter
> Software Engineer, Intel Corporation
> http://blog.ffwll.ch
> _______________________________________________
> Beignet mailing list
> Beignet at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/beignet

From rong.r.yang at intel.com  Thu Mar 19 22:10:53 2015
From: rong.r.yang at intel.com (Yang, Rong R)
Date: Fri, 20 Mar 2015 05:10:53 +0000
Subject: [Beignet] [PATCH] Fix: Event callback that not executed when
 command already marked CL_COMPLETE
In-Reply-To: <550B67A5.6070207@polymtl.ca>
References: <550B67A5.6070207@polymtl.ca>
Message-ID: <7597C9376C272A4AB2D29E91550B7B090141E537@shsmsx102.ccr.corp.intel.com>

One comment. Thanks for finding and fixing it.

> -----Original Message-----
> From: Beignet [mailto:beignet-bounces at lists.freedesktop.org] On Behalf Of
> David Couturier
> Sent: Friday, March 20, 2015 08:20
> To: Zou, Nanhai
> Cc: beignet at lists.freedesktop.org
> Subject: [Beignet] [PATCH] Fix: Event callback that not executed when
> command already marked CL_COMPLETE
> 
> When trying to register a callback on the clEnqueueReadBuffer command,
> since it is processed synchroniously all the time, the command was marked
> CL_COMPLETE every time. If the event returned by clEnqueueReadBuffer
> was then used to register a callback function, the callback function did no
> check to execute it if nessary.
> 
> Fixed by adding a check at the end of the cl_event_set_callback function.
> 
> All tests passed.
> 
> Signed-off-by: David Couturier <david.couturier at polymtl.ca>
> ---
>   src/cl_event.c | 15 +++++++++++++++
>   1 file changed, 15 insertions(+)
> 
> diff --git a/src/cl_event.c b/src/cl_event.c index f70e531..df4a5a5 100644
> --- a/src/cl_event.c
> +++ b/src/cl_event.c
> @@ -183,6 +183,21 @@ cl_int cl_event_set_callback(cl_event event ,
>     cb->next        = event->user_cb;
>     event->user_cb  = cb;
> 
> +  // It is possible that the event enqueued is already completed.
> +  // clEnqueueReadBuffer can be synchronious and when the callback  //
> + is registered after, it still needs to get executed.
> +  if(event->status == CL_COMPLETE) {
> +         /* Call user callback */
> +         user_callback *user_cb = event->user_cb;
> +         while(user_cb) {
> +                 if(user_cb->status >= CL_COMPLETE) { 
> +                         user_cb->executed = CL_TRUE;
> +                         user_cb->pfn_notify(event, event->status,
> user_cb->user_data);
> +                 }
> +                 user_cb = user_cb->next;
> +         } 

I think only the current callback should be called. Assume the scenario:
clEnqueueReadBuffer(......,ev);
clSetEventCallback(ev, CL_SUBMITTED, ...);
clSetEventCallback(ev, CL_COMPLETE, ....);
In the second clSetEventCallback, the first callback has already been executed, so only the second callback needs to be executed.
So the current callback needs to be executed when the event's status <= command_exec_callback_type.

> +  }
> +
>   exit:
>     return err;
>   error:
> --
> 1.9.1
> _______________________________________________
> Beignet mailing list
> Beignet at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/beignet

From yejun.guo at intel.com  Thu Mar 19 22:57:33 2015
From: yejun.guo at intel.com (Guo Yejun)
Date: Fri, 20 Mar 2015 13:57:33 +0800
Subject: [Beignet] [PATCH 1/2] add 3 simd level built-in functions: shuffle,
	simdsize and simdid
Message-ID: <1426831053-8431-1-git-send-email-yejun.guo@intel.com>

uint __gen_ocl_get_simd_size();
returns 8 if SIMD8, returns 16 if SIMD16

uint __gen_ocl_get_simd_id();
return value ranges from 0 to simdsize - 1

floatN __gen_ocl_simd_shuffle(floatN x, uint c);
intN   __gen_ocl_simd_shuffle(intN x, uint c);
uintN  __gen_ocl_simd_shuffle(uintN x, uint c);
the value of x of the c-th channel of the SIMD is returned, for all SIMD channels,
the behavior is undefined if c is larger than simdsize - 1

Signed-off-by: Guo Yejun <yejun.guo at intel.com>
---
 backend/src/backend/gen8_context.cpp               |  29 ++++-
 backend/src/backend/gen_context.cpp                | 127 +++++++++++++++------
 backend/src/backend/gen_context.hpp                |   1 +
 .../src/backend/gen_insn_gen7_schedule_info.hxx    |   1 +
 backend/src/backend/gen_insn_selection.cpp         |  60 ++++++++++
 backend/src/backend/gen_insn_selection.hxx         |   2 +
 backend/src/backend/program.h                      |   1 +
 backend/src/ir/context.hpp                         |   6 +
 backend/src/ir/instruction.cpp                     |  32 ++++++
 backend/src/ir/instruction.hpp                     |  17 +++
 backend/src/ir/instruction.hxx                     |   3 +
 backend/src/ir/liveness.cpp                        |   5 +
 backend/src/ir/profile.cpp                         |   2 +
 backend/src/ir/profile.hpp                         |   5 +-
 backend/src/libocl/CMakeLists.txt                  |   2 +-
 backend/src/libocl/include/ocl.h                   |   1 +
 backend/src/libocl/include/ocl_misc.h              |   8 --
 backend/src/libocl/script/ocl_simd.def             |   4 +
 backend/src/libocl/tmpl/ocl_simd.tmpl.cl           |  19 +++
 backend/src/libocl/tmpl/ocl_simd.tmpl.h            |  34 ++++++
 backend/src/llvm/llvm_gen_backend.cpp              |  27 +++++
 backend/src/llvm/llvm_gen_ocl_function.hxx         |   4 +
 src/cl_command_queue_gen7.c                        |   8 ++
 23 files changed, 351 insertions(+), 47 deletions(-)
 create mode 100644 backend/src/libocl/script/ocl_simd.def
 create mode 100644 backend/src/libocl/tmpl/ocl_simd.tmpl.cl
 create mode 100644 backend/src/libocl/tmpl/ocl_simd.tmpl.h

diff --git a/backend/src/backend/gen8_context.cpp b/backend/src/backend/gen8_context.cpp
index 3f57cf6..144fd00 100644
--- a/backend/src/backend/gen8_context.cpp
+++ b/backend/src/backend/gen8_context.cpp
@@ -240,6 +240,9 @@ namespace gbe
   }
 
   void Gen8Context::emitBinaryInstruction(const SelectionInstruction &insn) {
+    const GenRegister dst = ra->genReg(insn.dst(0));
+    const GenRegister src0 = ra->genReg(insn.src(0));
+    const GenRegister src1 = ra->genReg(insn.src(1));
     switch (insn.opcode) {
       case SEL_OP_SEL_INT64:
       case SEL_OP_I64AND:
@@ -250,14 +253,34 @@ namespace gbe
         break;
       case SEL_OP_UPSAMPLE_LONG:
       {
-        const GenRegister dst = ra->genReg(insn.dst(0));
-        const GenRegister src0 = ra->genReg(insn.src(0));
-        const GenRegister src1 = ra->genReg(insn.src(1));
         p->MOV(dst, src0);
         p->SHL(dst, dst, GenRegister::immud(32));
         p->ADD(dst, dst, src1);
         break;
       }
+      case SEL_OP_SIMD_SHUFFLE:
+      {
+        uint32_t simd = p->curr.execWidth;
+        if (src1.file == GEN_IMMEDIATE_VALUE) {
+          uint32_t offset = src1.value.ud % simd;
+          uint32_t nr = src0.nr;
+          uint32_t subnr = src0.subnr;
+          subnr = subnr + offset;
+          if (subnr > 8) {
+            nr = nr + 1;
+            subnr = subnr - 8;
+          }
+          p->MOV(dst, GenRegister::ud1grf(nr, subnr));
+        } else {
+          uint32_t base = src0.nr * 32 + src0.subnr * 4;
+          GenRegister baseReg = GenRegister::immuw(base);
+          const GenRegister a0 = GenRegister::addr8(0);
+          p->ADD(a0, GenRegister::unpacked_uw(src1.nr, src1.subnr / typeSize(GEN_TYPE_UW)), baseReg);
+          GenRegister indirect = GenRegister::to_indirect1xN(src0, 0, 0);
+          p->MOV(dst, indirect);
+        }
+        break;
+      }
       default:
         GenContext::emitBinaryInstruction(insn);
     }
diff --git a/backend/src/backend/gen_context.cpp b/backend/src/backend/gen_context.cpp
index cdf581c..25c7a5a 100644
--- a/backend/src/backend/gen_context.cpp
+++ b/backend/src/backend/gen_context.cpp
@@ -198,6 +198,22 @@ namespace gbe
     this->labelPos.insert(std::make_pair(label, p->store.size()));
   }
 
+  void GenContext::emitNullaryInstruction(const SelectionInstruction &insn) {
+    const GenRegister dst = ra->genReg(insn.dst(0));
+    switch (insn.opcode) {
+      case SEL_OP_SIMD_ID:
+        {
+          const GenRegister selLaneID = this->simdWidth == 8 ?
+                                GenRegister::ud8grf(ir::ocl::laneid) :
+                                GenRegister::ud16grf(ir::ocl::laneid);
+          const GenRegister laneID = ra->genReg(selLaneID);
+          p->MOV(dst, laneID);
+        }
+        break;
+      default: NOT_IMPLEMENTED;
+    }
+  }
+
   void GenContext::emitUnaryInstruction(const SelectionInstruction &insn) {
     const GenRegister dst = ra->genReg(insn.dst(0));
     const GenRegister src = ra->genReg(insn.src(0));
@@ -583,6 +599,46 @@ namespace gbe
           p->MOV(xdst.bottom_half(), xsrc1.bottom_half());
         }
         break;
+      case SEL_OP_SIMD_SHUFFLE:
+        {
+          uint32_t simd = p->curr.execWidth;
+          if (src1.file == GEN_IMMEDIATE_VALUE) {
+            uint32_t offset = src1.value.ud % simd;
+            uint32_t nr = src0.nr;
+            uint32_t subnr = src0.subnr;
+            subnr = subnr + offset;
+            if (subnr > 8) {
+              nr = nr + 1;
+              subnr = subnr - 8;
+            }
+            p->MOV(dst, GenRegister::ud1grf(nr, subnr));
+          } else {
+            uint32_t base = src0.nr * 32 + src0.subnr * 4;
+            GenRegister baseReg = GenRegister::immuw(base);
+            const GenRegister a0 = GenRegister::addr8(0);
+
+            p->push();
+              if (simd == 8) {
+                p->ADD(a0, GenRegister::unpacked_uw(src1.nr, src1.subnr / typeSize(GEN_TYPE_UW)), baseReg);
+                GenRegister indirect = GenRegister::to_indirect1xN(src0, 0, 0);
+                p->MOV(dst, indirect);
+              }
+              else if (simd == 16) {
+                p->curr.execWidth = 8;
+                p->ADD(a0, GenRegister::unpacked_uw(src1.nr, src1.subnr / typeSize(GEN_TYPE_UW)), baseReg);
+                GenRegister indirect = GenRegister::to_indirect1xN(src0, 0, 0);
+                p->MOV(dst, indirect);
+
+                p->curr.quarterControl = 1;
+                p->ADD(a0, GenRegister::unpacked_uw(src1.nr+1, src1.subnr / typeSize(GEN_TYPE_UW)), baseReg);
+                p->MOV(GenRegister::offset(dst, 1, 0), indirect);
+              }
+              else
+                NOT_IMPLEMENTED;
+            p->pop();
+          }
+        }
+        break;
       default: NOT_IMPLEMENTED;
     }
   }
@@ -2023,41 +2079,46 @@ namespace gbe
     } else
   
     fn.foreachInstruction([&](ir::Instruction &insn) {
-      const uint32_t srcNum = insn.getSrcNum();
-      for (uint32_t srcID = 0; srcID < srcNum; ++srcID) {
-        const ir::Register reg = insn.getSrc(srcID);
-        if (insn.getOpcode() == ir::OP_GET_IMAGE_INFO) {
-          if (srcID != 0) continue;
-          const unsigned char bti = ir::cast<ir::GetImageInfoInstruction>(insn).getImageIndex();
-          const unsigned char type =  ir::cast<ir::GetImageInfoInstruction>(insn).getInfoType();;
-          ir::ImageInfoKey key(bti, type);
-          const ir::Register imageInfo = insn.getSrc(0);
-          if (curbeRegs.find(imageInfo) == curbeRegs.end()) {
-            uint32_t offset = this->getImageInfoCurbeOffset(key, 4);
-            insertCurbeReg(imageInfo, offset);
+      if (insn.getOpcode() == ir::OP_SIMD_ID) {
+        if (curbeRegs.find(laneid) == curbeRegs.end())
+          allocCurbeReg(laneid, GBE_CURBE_LANE_ID);
+      } else {
+        const uint32_t srcNum = insn.getSrcNum();
+        for (uint32_t srcID = 0; srcID < srcNum; ++srcID) {
+          const ir::Register reg = insn.getSrc(srcID);
+          if (insn.getOpcode() == ir::OP_GET_IMAGE_INFO) {
+            if (srcID != 0) continue;
+            const unsigned char bti = ir::cast<ir::GetImageInfoInstruction>(insn).getImageIndex();
+            const unsigned char type =  ir::cast<ir::GetImageInfoInstruction>(insn).getInfoType();;
+            ir::ImageInfoKey key(bti, type);
+            const ir::Register imageInfo = insn.getSrc(0);
+            if (curbeRegs.find(imageInfo) == curbeRegs.end()) {
+              uint32_t offset = this->getImageInfoCurbeOffset(key, 4);
+              insertCurbeReg(imageInfo, offset);
+            }
+            continue;
           }
-          continue;
+          if (fn.isSpecialReg(reg) == false) continue;
+          if (curbeRegs.find(reg) != curbeRegs.end()) continue;
+          if (reg == ir::ocl::stackptr) GBE_ASSERT(stackUse.size() > 0);
+          INSERT_REG(lsize0, LOCAL_SIZE_X)
+          INSERT_REG(lsize1, LOCAL_SIZE_Y)
+          INSERT_REG(lsize2, LOCAL_SIZE_Z)
+          INSERT_REG(gsize0, GLOBAL_SIZE_X)
+          INSERT_REG(gsize1, GLOBAL_SIZE_Y)
+          INSERT_REG(gsize2, GLOBAL_SIZE_Z)
+          INSERT_REG(goffset0, GLOBAL_OFFSET_X)
+          INSERT_REG(goffset1, GLOBAL_OFFSET_Y)
+          INSERT_REG(goffset2, GLOBAL_OFFSET_Z)
+          INSERT_REG(workdim, WORK_DIM)
+          INSERT_REG(numgroup0, GROUP_NUM_X)
+          INSERT_REG(numgroup1, GROUP_NUM_Y)
+          INSERT_REG(numgroup2, GROUP_NUM_Z)
+          INSERT_REG(stackptr, STACK_POINTER)
+          INSERT_REG(printfbptr, PRINTF_BUF_POINTER)
+          INSERT_REG(printfiptr, PRINTF_INDEX_POINTER)
+          do {} while(0);
         }
-        if (fn.isSpecialReg(reg) == false) continue;
-        if (curbeRegs.find(reg) != curbeRegs.end()) continue;
-        if (reg == ir::ocl::stackptr) GBE_ASSERT(stackUse.size() > 0);
-        INSERT_REG(lsize0, LOCAL_SIZE_X)
-        INSERT_REG(lsize1, LOCAL_SIZE_Y)
-        INSERT_REG(lsize2, LOCAL_SIZE_Z)
-        INSERT_REG(gsize0, GLOBAL_SIZE_X)
-        INSERT_REG(gsize1, GLOBAL_SIZE_Y)
-        INSERT_REG(gsize2, GLOBAL_SIZE_Z)
-        INSERT_REG(goffset0, GLOBAL_OFFSET_X)
-        INSERT_REG(goffset1, GLOBAL_OFFSET_Y)
-        INSERT_REG(goffset2, GLOBAL_OFFSET_Z)
-        INSERT_REG(workdim, WORK_DIM)
-        INSERT_REG(numgroup0, GROUP_NUM_X)
-        INSERT_REG(numgroup1, GROUP_NUM_Y)
-        INSERT_REG(numgroup2, GROUP_NUM_Z)
-        INSERT_REG(stackptr, STACK_POINTER)
-        INSERT_REG(printfbptr, PRINTF_BUF_POINTER)
-        INSERT_REG(printfiptr, PRINTF_INDEX_POINTER)
-        do {} while(0);
       }
     });
 #undef INSERT_REG
diff --git a/backend/src/backend/gen_context.hpp b/backend/src/backend/gen_context.hpp
index 6ca88db..3ac675e 100644
--- a/backend/src/backend/gen_context.hpp
+++ b/backend/src/backend/gen_context.hpp
@@ -124,6 +124,7 @@ namespace gbe
 
     /*! Final Gen ISA emission helper functions */
     void emitLabelInstruction(const SelectionInstruction &insn);
+    virtual void emitNullaryInstruction(const SelectionInstruction &insn);
     virtual void emitUnaryInstruction(const SelectionInstruction &insn);
     virtual void emitUnaryWithTempInstruction(const SelectionInstruction &insn);
     virtual void emitBinaryInstruction(const SelectionInstruction &insn);
diff --git a/backend/src/backend/gen_insn_gen7_schedule_info.hxx b/backend/src/backend/gen_insn_gen7_schedule_info.hxx
index d054820..fd7e1a4 100644
--- a/backend/src/backend/gen_insn_gen7_schedule_info.hxx
+++ b/backend/src/backend/gen_insn_gen7_schedule_info.hxx
@@ -1,5 +1,6 @@
 //                 Family     Latency     SIMD16     SIMD8
 DECL_GEN7_SCHEDULE(Label,           0,         0,        0)
+DECL_GEN7_SCHEDULE(Nullary,         20,        4,        2)
 DECL_GEN7_SCHEDULE(Unary,           20,        4,        2)
 DECL_GEN7_SCHEDULE(UnaryWithTemp,   20,        40,      20)
 DECL_GEN7_SCHEDULE(Binary,          20,        4,        2)
diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp
index c240261..1586098 100644
--- a/backend/src/backend/gen_insn_selection.cpp
+++ b/backend/src/backend/gen_insn_selection.cpp
@@ -477,6 +477,8 @@ namespace gbe
     /*! To make function prototypes more readable */
     typedef const GenRegister &Reg;
 
+#define ALU0(OP) \
+  INLINE void OP(Reg dst) { ALU0(SEL_OP_##OP, dst); }
 #define ALU1(OP) \
   INLINE void OP(Reg dst, Reg src) { ALU1(SEL_OP_##OP, dst, src); }
 #define ALU1WithTemp(OP) \
@@ -530,12 +532,15 @@ namespace gbe
     ALU2WithTemp(HADD)
     ALU2WithTemp(RHADD)
     ALU2(UPSAMPLE_LONG)
+    ALU2(SIMD_SHUFFLE)
+    ALU0(SIMD_ID)
     ALU1WithTemp(CONVI_TO_I64)
     ALU1WithTemp(CONVF_TO_I64)
     ALU1(CONVI64_TO_I)
     I64Shift(I64SHL)
     I64Shift(I64SHR)
     I64Shift(I64ASR)
+#undef ALU0
 #undef ALU1
 #undef ALU1WithTemp
 #undef ALU2
@@ -622,6 +627,8 @@ namespace gbe
     void MATH(Reg dst, uint32_t function, Reg src0, Reg src1);
     /*! Extended math function (1 argument) */
     void MATH(Reg dst, uint32_t function, Reg src);
+    /*! Encode nullary instructions */
+    void ALU0(SelectionOpcode opcode, Reg dst);
     /*! Encode unary instructions */
     void ALU1(SelectionOpcode opcode, Reg dst, Reg src);
     /*! Encode unary with temp reg instructions */
@@ -1435,6 +1442,11 @@ namespace gbe
       insn->dst(i + 1) = tmp[i];
   }
 
+  void Selection::Opaque::ALU0(SelectionOpcode opcode, Reg dst) {
+    SelectionInstruction *insn = this->appendInsn(opcode, 1, 0);
+    insn->dst(0) = dst;
+  }
+
   void Selection::Opaque::ALU1(SelectionOpcode opcode, Reg dst, Reg src) {
     SelectionInstruction *insn = this->appendInsn(opcode, 1, 1);
     insn->dst(0) = dst;
@@ -2054,6 +2066,42 @@ namespace gbe
 #define DECL_CTOR(FAMILY, INSN_NUM, COST) \
   FAMILY##Pattern(void) : OneToManyPattern<FAMILY##Pattern, ir::FAMILY>(INSN_NUM, COST) {}
 
+  /*! Nullary instruction patterns */
+  class NullaryInstructionPattern : public SelectionPattern
+  {
+  public:
+    NullaryInstructionPattern(void) : SelectionPattern(1,1) {
+      for (uint32_t op = 0; op < ir::OP_INVALID; ++op)
+        if (ir::isOpcodeFrom<ir::NullaryInstruction>(ir::Opcode(op)) == true)
+          this->opcodes.push_back(ir::Opcode(op));
+    }
+
+    INLINE bool emit(Selection::Opaque &sel, SelectionDAG &dag) const {
+      using namespace ir;
+      const ir::NullaryInstruction &insn = cast<NullaryInstruction>(dag.insn);
+      const Opcode opcode = insn.getOpcode();
+      const Type type = insn.getType();
+      GenRegister dst = sel.selReg(insn.getDst(0), type);
+
+      sel.push();
+      switch (opcode) {
+        case ir::OP_SIMD_SIZE:
+          {
+            const GenRegister src = GenRegister::immud(sel.curr.execWidth);
+            sel.curr.execWidth = 1;
+            sel.MOV(dst, src);
+          }
+          break;
+        case ir::OP_SIMD_ID:
+          sel.SIMD_ID(dst);
+          break;
+        default: NOT_SUPPORTED;
+      }
+      sel.pop();
+      return true;
+    }
+  };
+
   /*! Unary instruction patterns */
   DECL_PATTERN(UnaryInstruction)
   {
@@ -2563,6 +2611,17 @@ namespace gbe
         case OP_UPSAMPLE_LONG:
           sel.UPSAMPLE_LONG(dst, src0, src1);
           break;
+        case OP_SIMD_SHUFFLE:
+          {
+            if (src1.file == GEN_IMMEDIATE_VALUE) {
+              sel.SIMD_SHUFFLE(dst, src0, src1);
+            } else {
+              GenRegister shiftL = GenRegister::udxgrf(sel.curr.execWidth, sel.reg(FAMILY_DWORD));
+              sel.SHL(shiftL, src1, GenRegister::immud(0x2));
+              sel.SIMD_SHUFFLE(dst, src0, shiftL);
+            }
+          }
+          break;
         default: NOT_IMPLEMENTED;
       }
       sel.pop();
@@ -4789,6 +4848,7 @@ namespace gbe
     this->insert<GetImageInfoInstructionPattern>();
     this->insert<ReadARFInstructionPattern>();
     this->insert<RegionInstructionPattern>();
+    this->insert<NullaryInstructionPattern>();
 
     // Sort all the patterns with the number of instructions they output
     for (uint32_t op = 0; op < ir::OP_INVALID; ++op)
diff --git a/backend/src/backend/gen_insn_selection.hxx b/backend/src/backend/gen_insn_selection.hxx
index 09f5aaf..87ccee3 100644
--- a/backend/src/backend/gen_insn_selection.hxx
+++ b/backend/src/backend/gen_insn_selection.hxx
@@ -77,6 +77,8 @@ DECL_SELECTION_IR(RHADD, BinaryWithTempInstruction)
 DECL_SELECTION_IR(I64HADD, I64HADDInstruction)
 DECL_SELECTION_IR(I64RHADD, I64RHADDInstruction)
 DECL_SELECTION_IR(UPSAMPLE_LONG, BinaryInstruction)
+DECL_SELECTION_IR(SIMD_SHUFFLE, BinaryInstruction)
+DECL_SELECTION_IR(SIMD_ID, NullaryInstruction)
 DECL_SELECTION_IR(CONVI_TO_I64, UnaryWithTempInstruction)
 DECL_SELECTION_IR(CONVI64_TO_I, UnaryInstruction)
 DECL_SELECTION_IR(CONVI64_TO_F, I64ToFloatInstruction)
diff --git a/backend/src/backend/program.h b/backend/src/backend/program.h
index dc5662f..c4023ec 100644
--- a/backend/src/backend/program.h
+++ b/backend/src/backend/program.h
@@ -99,6 +99,7 @@ enum gbe_curbe_type {
   GBE_CURBE_THREAD_NUM,
   GBE_CURBE_ZERO,
   GBE_CURBE_ONE,
+  GBE_CURBE_LANE_ID,
   GBE_CURBE_SLM_OFFSET,
 };
 
diff --git a/backend/src/ir/context.hpp b/backend/src/ir/context.hpp
index cf5109d..af65ff3 100644
--- a/backend/src/ir/context.hpp
+++ b/backend/src/ir/context.hpp
@@ -176,6 +176,12 @@ namespace ir {
     DECL_THREE_SRC_INSN(MAD);
 #undef DECL_THREE_SRC_INSN
 
+    /*! For all nullary functions */
+    void ALU0(Opcode opcode, Type type, Register dst) {
+      const Instruction insn = gbe::ir::ALU0(opcode, type, dst);
+      this->append(insn);
+    }
+
     /*! For all unary functions */
     void ALU1(Opcode opcode, Type type, Register dst, Register src) {
       const Instruction insn = gbe::ir::ALU1(opcode, type, dst, src);
diff --git a/backend/src/ir/instruction.cpp b/backend/src/ir/instruction.cpp
index 797552f..9c3331b 100644
--- a/backend/src/ir/instruction.cpp
+++ b/backend/src/ir/instruction.cpp
@@ -131,6 +131,17 @@ namespace ir {
       Register src[srcNum]; //!< Indices of the sources
     };
 
+    /*! All 0-source arithmetic instructions */
+    class ALIGNED_INSTRUCTION NullaryInstruction : public NaryInstruction<0>
+    {
+    public:
+      NullaryInstruction(Opcode opcode, Type type, Register dst) {
+        this->opcode = opcode;
+        this->type = type;
+        this->dst[0] = dst;
+      }
+    };
+
     /*! All 1-source arithmetic instructions */
     class ALIGNED_INSTRUCTION UnaryInstruction : public NaryInstruction<1>
     {
@@ -1305,6 +1316,10 @@ namespace ir {
     }; \
   }
 
+START_INTROSPECTION(NullaryInstruction)
+#include "ir/instruction.hxx"
+END_INTROSPECTION(NullaryInstruction)
+
 START_INTROSPECTION(UnaryInstruction)
 #include "ir/instruction.hxx"
 END_INTROSPECTION(UnaryInstruction)
@@ -1532,6 +1547,7 @@ END_FUNCTION(Instruction, Register)
     return reinterpret_cast<const internal::CLASS*>(this)->CALL; \
   }
 
+DECL_MEM_FN(NullaryInstruction, Type, getType(void), getType())
 DECL_MEM_FN(UnaryInstruction, Type, getType(void), getType())
 DECL_MEM_FN(BinaryInstruction, Type, getType(void), getType())
 DECL_MEM_FN(BinaryInstruction, bool, commutes(void), commutes())
@@ -1586,6 +1602,21 @@ DECL_MEM_FN(GetImageInfoInstruction, uint8_t, getImageIndex(void), getImageIndex
   // Implements the emission functions
   ///////////////////////////////////////////////////////////////////////////
 
+  // For all nullary functions with given opcode
+  Instruction ALU0(Opcode opcode, Type type, Register dst) {
+    return internal::NullaryInstruction(opcode, type, dst).convert();
+  }
+
+  // All nullary functions
+#define DECL_EMIT_FUNCTION(NAME) \
+  Instruction NAME(Type type, Register dst) { \
+    return ALU0(OP_##NAME, type, dst);\
+  }
+
+  DECL_EMIT_FUNCTION(SIMD_SIZE)
+
+#undef DECL_EMIT_FUNCTION
+
   // For all unary functions with given opcode
   Instruction ALU1(Opcode opcode, Type type, Register dst, Register src) {
     return internal::UnaryInstruction(opcode, type, dst, src).convert();
@@ -1645,6 +1676,7 @@ DECL_MEM_FN(GetImageInfoInstruction, uint8_t, getImageIndex(void), getImageIndex
   DECL_EMIT_FUNCTION(RHADD)
   DECL_EMIT_FUNCTION(I64HADD)
   DECL_EMIT_FUNCTION(I64RHADD)
+  DECL_EMIT_FUNCTION(SIMD_SHUFFLE)
 
 #undef DECL_EMIT_FUNCTION
 
diff --git a/backend/src/ir/instruction.hpp b/backend/src/ir/instruction.hpp
index 24d27aa..6dd3e81 100644
--- a/backend/src/ir/instruction.hpp
+++ b/backend/src/ir/instruction.hpp
@@ -198,6 +198,15 @@ namespace ir {
   /*! Output the instruction string in the given stream */
   std::ostream &operator<< (std::ostream &out, const Instruction &proxy);
 
+  /*! Nullary instructions are typed. */
+  class NullaryInstruction : public Instruction {
+  public:
+    /*! Get the type manipulated by the instruction */
+    Type getType(void) const;
+    /*! Return true if the given instruction is an instance of this class */
+    static bool isClassOf(const Instruction &insn);
+  };
+
   /*! Unary instructions are typed. dst and sources share the same type */
   class UnaryInstruction : public Instruction {
   public:
@@ -558,6 +567,12 @@ namespace ir {
   /// All emission functions
   ///////////////////////////////////////////////////////////////////////////
 
+  /*! alu0.type dst */
+  Instruction ALU0(Opcode opcode, Type type, Register dst);
+  /*! simd_size.type dst */
+  Instruction SIMD_SIZE(Type type, Register dst);
+  /*! simd_id.type dst */
+  Instruction SIMD_ID(Type type, Register dst);
   /*! alu1.type dst src */
   Instruction ALU1(Opcode opcode, Type type, Register dst, Register src);
   /*! mov.type dst src */
@@ -670,6 +685,8 @@ namespace ir {
   Instruction GT(Type type, Register dst, Register src0, Register src1);
   /*! ord.type dst src0 src1 */
   Instruction ORD(Type type, Register dst, Register src0, Register src1);
+  /*! simd_shuffle.type dst src0 src1 */
+  Instruction SIMD_SHUFFLE(Type type, Register dst, Register src0, Register src1);
   /*! BITCAST.{dstType <- srcType} dst src */
   Instruction BITCAST(Type dstType, Type srcType, Tuple dst, Tuple src, uint8_t dstNum, uint8_t srcNum);
   /*! cvt.{dstType <- srcType} dst src */
diff --git a/backend/src/ir/instruction.hxx b/backend/src/ir/instruction.hxx
index de4abfb..76269bd 100644
--- a/backend/src/ir/instruction.hxx
+++ b/backend/src/ir/instruction.hxx
@@ -25,6 +25,8 @@
  * \file instruction.hxx
  * \author Benjamin Segovia <benjamin.segovia at intel.com>
  */
+DECL_INSN(SIMD_SIZE, NullaryInstruction)
+DECL_INSN(SIMD_ID, NullaryInstruction)
 DECL_INSN(MOV, UnaryInstruction)
 DECL_INSN(COS, UnaryInstruction)
 DECL_INSN(SIN, UnaryInstruction)
@@ -57,6 +59,7 @@ DECL_INSN(BSB, BinaryInstruction)
 DECL_INSN(OR, BinaryInstruction)
 DECL_INSN(XOR, BinaryInstruction)
 DECL_INSN(AND, BinaryInstruction)
+DECL_INSN(SIMD_SHUFFLE, BinaryInstruction)
 DECL_INSN(SEL, SelectInstruction)
 DECL_INSN(EQ, CompareInstruction)
 DECL_INSN(NE, CompareInstruction)
diff --git a/backend/src/ir/liveness.cpp b/backend/src/ir/liveness.cpp
index 2b1ffdb..26c4129 100644
--- a/backend/src/ir/liveness.cpp
+++ b/backend/src/ir/liveness.cpp
@@ -66,6 +66,11 @@ namespace ir {
         const uint32_t srcNum = insn.getSrcNum();
         const uint32_t dstNum = insn.getDstNum();
         bool uniform = true;
+
+        //have no way to decide the dst uniform if there is no source
+        if (srcNum == 0)
+          uniform = false;
+
         for (uint32_t srcID = 0; srcID < srcNum; ++srcID) {
           const Register reg = insn.getSrc(srcID);
           if (!fn.isUniformRegister(reg))
diff --git a/backend/src/ir/profile.cpp b/backend/src/ir/profile.cpp
index 4c272bd..55aedb4 100644
--- a/backend/src/ir/profile.cpp
+++ b/backend/src/ir/profile.cpp
@@ -43,6 +43,7 @@ namespace ir {
         "zero", "one",
         "retVal", "slm_offset",
         "printf_buffer_pointer", "printf_index_buffer_pointer",
+        "lane_id",
         "invalid"
     };
 
@@ -86,6 +87,7 @@ namespace ir {
       DECL_NEW_REG(FAMILY_DWORD, slmoffset, 1);
       DECL_NEW_REG(FAMILY_DWORD, printfbptr, 1);
       DECL_NEW_REG(FAMILY_DWORD, printfiptr, 1);
+      DECL_NEW_REG(FAMILY_DWORD, laneid, 0);
       DECL_NEW_REG(FAMILY_DWORD, invalid, 1);
     }
 #undef DECL_NEW_REG
diff --git a/backend/src/ir/profile.hpp b/backend/src/ir/profile.hpp
index 7259d9f..d310128 100644
--- a/backend/src/ir/profile.hpp
+++ b/backend/src/ir/profile.hpp
@@ -71,8 +71,9 @@ namespace ir {
     static const Register slmoffset = Register(27);  // Group's SLM offset in total 64K SLM
     static const Register printfbptr = Register(28); // printf buffer address .
     static const Register printfiptr = Register(29); // printf index buffer address.
-    static const Register invalid = Register(30);  // used for valid comparation.
-    static const uint32_t regNum = 31;             // number of special registers
+    static const Register laneid = Register(30); // printf index buffer address.
+    static const Register invalid = Register(31);  // used for valid comparation.
+    static const uint32_t regNum = 32;             // number of special registers
     extern const char *specialRegMean[];           // special register name.
   } /* namespace ocl */
 
diff --git a/backend/src/libocl/CMakeLists.txt b/backend/src/libocl/CMakeLists.txt
index 16f00ee..623affc 100644
--- a/backend/src/libocl/CMakeLists.txt
+++ b/backend/src/libocl/CMakeLists.txt
@@ -90,7 +90,7 @@ MACRO(GENERATE_SOURCE_PY _mod)
 	)
 ENDMACRO(GENERATE_SOURCE_PY)
 
-SET (OCL_PY_GENERATED_MODULES ocl_common ocl_relational ocl_integer ocl_math)
+SET (OCL_PY_GENERATED_MODULES ocl_common ocl_relational ocl_integer ocl_math ocl_simd)
 FOREACH(M ${OCL_PY_GENERATED_MODULES})
     GENERATE_HEADER_PY(${M})
     GENERATE_SOURCE_PY(${M})
diff --git a/backend/src/libocl/include/ocl.h b/backend/src/libocl/include/ocl.h
index e886670..a53f4c0 100644
--- a/backend/src/libocl/include/ocl.h
+++ b/backend/src/libocl/include/ocl.h
@@ -30,6 +30,7 @@
 #include "ocl_image.h"
 #include "ocl_integer.h"
 #include "ocl_math.h"
+#include "ocl_simd.h"
 #include "ocl_misc.h"
 #include "ocl_printf.h"
 #include "ocl_relational.h"
diff --git a/backend/src/libocl/include/ocl_misc.h b/backend/src/libocl/include/ocl_misc.h
index aa3f504..359025b 100644
--- a/backend/src/libocl/include/ocl_misc.h
+++ b/backend/src/libocl/include/ocl_misc.h
@@ -128,14 +128,6 @@ DEF(ulong)
 #undef DEC16
 #undef DEC16X
 
-
-/* Temp to add the SIMD functions here. */
-/////////////////////////////////////////////////////////////////////////////
-// SIMD level function
-/////////////////////////////////////////////////////////////////////////////
-short __gen_ocl_simd_any(short);
-short __gen_ocl_simd_all(short);
-
 struct time_stamp {
   // time tick
   ulong tick;
diff --git a/backend/src/libocl/script/ocl_simd.def b/backend/src/libocl/script/ocl_simd.def
new file mode 100644
index 0000000..ccda619
--- /dev/null
+++ b/backend/src/libocl/script/ocl_simd.def
@@ -0,0 +1,4 @@
+##simd level functions
+floatn __gen_ocl_simd_shuffle(floatn x, uint c)
+intn __gen_ocl_simd_shuffle(intn x, uint c)
+uintn __gen_ocl_simd_shuffle(uintn x, uint c)
diff --git a/backend/src/libocl/tmpl/ocl_simd.tmpl.cl b/backend/src/libocl/tmpl/ocl_simd.tmpl.cl
new file mode 100644
index 0000000..b9da5e2
--- /dev/null
+++ b/backend/src/libocl/tmpl/ocl_simd.tmpl.cl
@@ -0,0 +1,19 @@
+/*
+ * Copyright @ 2015 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#include "ocl_simd.h"
diff --git a/backend/src/libocl/tmpl/ocl_simd.tmpl.h b/backend/src/libocl/tmpl/ocl_simd.tmpl.h
new file mode 100644
index 0000000..42afc7b
--- /dev/null
+++ b/backend/src/libocl/tmpl/ocl_simd.tmpl.h
@@ -0,0 +1,34 @@
+/*
+ * Copyright © 2015 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+#ifndef __OCL_SIMD_H__
+#define __OCL_SIMD_H__
+
+#include "ocl_types.h"
+
+/////////////////////////////////////////////////////////////////////////////
+// SIMD level function
+/////////////////////////////////////////////////////////////////////////////
+short __gen_ocl_simd_any(short);
+short __gen_ocl_simd_all(short);
+
+uint __gen_ocl_get_simd_size(void);
+uint __gen_ocl_get_simd_id(void);
+
+OVERLOADABLE float __gen_ocl_simd_shuffle(float x, uint c);
+OVERLOADABLE int __gen_ocl_simd_shuffle(int x, uint c);
+OVERLOADABLE uint __gen_ocl_simd_shuffle(uint x, uint c);
diff --git a/backend/src/llvm/llvm_gen_backend.cpp b/backend/src/llvm/llvm_gen_backend.cpp
index bf03a13..4fcb8bb 100644
--- a/backend/src/llvm/llvm_gen_backend.cpp
+++ b/backend/src/llvm/llvm_gen_backend.cpp
@@ -2790,10 +2790,17 @@ namespace gbe
       case GEN_OCL_CONV_F32_TO_F16:
       case GEN_OCL_SIMD_ANY:
       case GEN_OCL_SIMD_ALL:
+      case GEN_OCL_SIMD_SHUFFLE:
       case GEN_OCL_READ_TM:
       case GEN_OCL_REGION:
         this->newRegister(&I);
         break;
+      case GEN_OCL_SIMD_SIZE:
+        this->newRegister(&I, NULL, true);
+        break;
+      case GEN_OCL_SIMD_ID:
+        this->newRegister(&I, NULL, false);
+        break;
       case GEN_OCL_PRINTF:
         break;
       default:
@@ -3053,6 +3060,26 @@ namespace gbe
             ctx.ALU1(ir::OP_SIMD_ANY, ir::TYPE_S16, dst, src);
             break;
           }
+          case GEN_OCL_SIMD_SIZE:
+          {
+            const ir::Register dst = this->getRegister(&I);
+            ctx.ALU0(ir::OP_SIMD_SIZE, getType(ctx, I.getType()), dst);
+            break;
+          }
+          case GEN_OCL_SIMD_ID:
+          {
+            const ir::Register dst = this->getRegister(&I);
+            ctx.ALU0(ir::OP_SIMD_ID, getType(ctx, I.getType()), dst);
+            break;
+          }
+          case GEN_OCL_SIMD_SHUFFLE:
+          {
+            const ir::Register src0 = this->getRegister(*AI); ++AI;
+            const ir::Register src1 = this->getRegister(*AI); ++AI;
+            const ir::Register dst = this->getRegister(&I);
+            ctx.SIMD_SHUFFLE(getType(ctx, I.getType()), dst, src0, src1);
+            break;
+          }
           case GEN_OCL_READ_TM:
           {
             const ir::Register dst = this->getRegister(&I);
diff --git a/backend/src/llvm/llvm_gen_ocl_function.hxx b/backend/src/llvm/llvm_gen_ocl_function.hxx
index 9536a3c..714a293 100644
--- a/backend/src/llvm/llvm_gen_ocl_function.hxx
+++ b/backend/src/llvm/llvm_gen_ocl_function.hxx
@@ -155,6 +155,10 @@ DECL_LLVM_GEN_FUNCTION(CONV_F32_TO_F16, __gen_ocl_f32to16)
 DECL_LLVM_GEN_FUNCTION(SIMD_ANY, __gen_ocl_simd_any)
 DECL_LLVM_GEN_FUNCTION(SIMD_ALL, __gen_ocl_simd_all)
 
+DECL_LLVM_GEN_FUNCTION(SIMD_SIZE, __gen_ocl_get_simd_size)
+DECL_LLVM_GEN_FUNCTION(SIMD_ID, __gen_ocl_get_simd_id)
+DECL_LLVM_GEN_FUNCTION(SIMD_SHUFFLE, __gen_ocl_simd_shuffle)
+
 DECL_LLVM_GEN_FUNCTION(READ_TM, __gen_ocl_read_tm)
 DECL_LLVM_GEN_FUNCTION(REGION, __gen_ocl_region)
 
diff --git a/src/cl_command_queue_gen7.c b/src/cl_command_queue_gen7.c
index 253c4f2..3f73de0 100644
--- a/src/cl_command_queue_gen7.c
+++ b/src/cl_command_queue_gen7.c
@@ -202,6 +202,14 @@ cl_curbe_fill(cl_kernel ker,
   UPLOAD(GBE_CURBE_WORK_DIM, work_dim);
 #undef UPLOAD
 
+  /* __gen_ocl_get_simd_id needs it */
+  if ((offset = interp_kernel_get_curbe_offset(ker->opaque, GBE_CURBE_LANE_ID, 0)) >= 0) {
+    const uint32_t simd_sz = interp_kernel_get_simd_width(ker->opaque);
+    uint32_t *laneid = (uint32_t *) (ker->curbe + offset);
+    int32_t i;
+    for (i = 0; i < (int32_t) simd_sz; ++i) laneid[i] = i;
+  }
+
   /* Write identity for the stack pointer. This is required by the stack pointer
    * computation in the kernel
    */
-- 
1.9.1


From yejun.guo at intel.com  Thu Mar 19 22:58:47 2015
From: yejun.guo at intel.com (Guo Yejun)
Date: Fri, 20 Mar 2015 13:58:47 +0800
Subject: [Beignet] [PATCH 2/2] add utest for __gen_ocl_simd_shuffle and
	__gen_ocl_get_simd_size/id
Message-ID: <1426831127-8474-1-git-send-email-yejun.guo@intel.com>

Signed-off-by: Guo Yejun <yejun.guo at intel.com>
---
 kernels/compiler_simd_shuffle.cl | 15 ++++++++++++++
 utests/CMakeLists.txt            |  1 +
 utests/compiler_simd_shuffle.cpp | 44 ++++++++++++++++++++++++++++++++++++++++
 3 files changed, 60 insertions(+)
 create mode 100644 kernels/compiler_simd_shuffle.cl
 create mode 100644 utests/compiler_simd_shuffle.cpp

diff --git a/kernels/compiler_simd_shuffle.cl b/kernels/compiler_simd_shuffle.cl
new file mode 100644
index 0000000..50588de
--- /dev/null
+++ b/kernels/compiler_simd_shuffle.cl
@@ -0,0 +1,15 @@
+__kernel void compiler_simd_shuffle(global int *dst, int c)
+{
+  int i = get_global_id(0);
+  if (i == 0)
+    dst[0] = __gen_ocl_get_simd_size();
+  dst++;
+
+  int from = i;
+  int o0 = __gen_ocl_get_simd_id();
+  int o1 = __gen_ocl_simd_shuffle(from, c);
+  int o2 = __gen_ocl_simd_shuffle(from, 5);
+  dst[i*3] = o0;
+  dst[i*3+1] = o1;
+  dst[i*3+2] = o2;
+}
diff --git a/utests/CMakeLists.txt b/utests/CMakeLists.txt
index 06baa68..858df13 100644
--- a/utests/CMakeLists.txt
+++ b/utests/CMakeLists.txt
@@ -189,6 +189,7 @@ set (utests_sources
   compiler_getelementptr_bitcast.cpp
   compiler_simd_any.cpp
   compiler_simd_all.cpp
+  compiler_simd_shuffle.cpp
   compiler_time_stamp.cpp
   compiler_double_precision.cpp
   load_program_from_gen_bin.cpp
diff --git a/utests/compiler_simd_shuffle.cpp b/utests/compiler_simd_shuffle.cpp
new file mode 100644
index 0000000..63b9d21
--- /dev/null
+++ b/utests/compiler_simd_shuffle.cpp
@@ -0,0 +1,44 @@
+#include "utest_helper.hpp"
+
+void compiler_simd_shuffle(void)
+{
+  const size_t n = 32;
+  const int32_t buf_size = 3 * n + 1;
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("compiler_simd_shuffle");
+  OCL_CREATE_BUFFER(buf[0], 0, buf_size * sizeof(int), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+
+  int c = 3;
+  OCL_SET_ARG(1, sizeof(int), &c);
+
+  globals[0] = n;
+  locals[0] = 16;
+
+  OCL_MAP_BUFFER(0);
+  for (int32_t i = 0; i < buf_size; ++i)
+    ((int*)buf_data[0])[i] = -1;
+  OCL_UNMAP_BUFFER(0);
+
+  // Run the kernel on GPU
+  OCL_NDRANGE(1);
+
+  // Compare
+  OCL_MAP_BUFFER(0);
+  int* dst = (int *)buf_data[0];
+  int simdsize = dst[0];
+  OCL_ASSERT(simdsize == 8 || simdsize == 16);
+
+  dst++;
+  for (int32_t i = 0; i < (int32_t) n; ++i){
+    int round = i / simdsize;
+    int index = i % simdsize;
+    OCL_ASSERT(index == dst[3*i]);
+    OCL_ASSERT((round * simdsize + c) == dst[3*i+1]);
+    OCL_ASSERT((round * simdsize + 5) == dst[3*i+2]);
+  }
+  OCL_UNMAP_BUFFER(0);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_simd_shuffle);
-- 
1.9.1


From zhigang.gong at linux.intel.com  Fri Mar 20 02:41:27 2015
From: zhigang.gong at linux.intel.com (Zhigang Gong)
Date: Fri, 20 Mar 2015 17:41:27 +0800
Subject: [Beignet] [PATCH] BUGFIX: Prohibit 'make package' from doing
 system install of ICD vendor file
In-Reply-To: <1426788474-3046-1-git-send-email-brian@arrayfire.com>
References: <1426788474-3046-1-git-send-email-brian@arrayfire.com>
Message-ID: <20150320094126.GO21732@ivb-gt2-rev4>

Thanks for the patch. Could you reply with your Signed-off-by line
for the patch? Then I can merge it.

Thanks,
Zhigang Gong.

On Thu, Mar 19, 2015 at 02:07:54PM -0400, Brian Kloppenborg wrote:
> As presently written, a 'make package' will attempt to INSTALL the
> Beignet ICD loader to /etc/OpenCL/vendors whereas it should just
> do a local install and then package the file. The proposed change instructs
> CPack to include the `DESTDIR` variable when it calls `make install`, thus
> directing the destination for the ICD loader to a local directory instead
> of a system path.
> ---
>  CMakeLists.txt | 1 +
>  1 file changed, 1 insertion(+)
> 
> diff --git a/CMakeLists.txt b/CMakeLists.txt
> index e11a3d0..a230e4b 100644
> --- a/CMakeLists.txt
> +++ b/CMakeLists.txt
> @@ -234,6 +234,7 @@ IF(BUILD_EXAMPLES)
>  ADD_SUBDIRECTORY(examples)
>  ENDIF(BUILD_EXAMPLES)
>  
> +SET(CPACK_SET_DESTDIR ON)
>  SET(CPACK_PACKAGE_VERSION_MAJOR "${LIBCL_DRIVER_VERSION_MAJOR}")
>  SET(CPACK_PACKAGE_VERSION_MINOR "${LIBCL_DRIVER_VERSION_MINOR}")
>  SET(CPACK_PACKAGE_VERSION_PATCH "${LIBCL_DRIVER_VERSION_PATCH}")
> -- 
> 2.1.0
> 
> _______________________________________________
> Beignet mailing list
> Beignet at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/beignet

From brian at arrayfire.com  Fri Mar 20 06:35:13 2015
From: brian at arrayfire.com (Brian Kloppenborg)
Date: Fri, 20 Mar 2015 09:35:13 -0400
Subject: [Beignet] [PATCH] BUGFIX: Prohibit 'make package' from doing
 system install of ICD vendor file
In-Reply-To: <20150320094126.GO21732@ivb-gt2-rev4>
References: <1426788474-3046-1-git-send-email-brian@arrayfire.com>
 <20150320094126.GO21732@ivb-gt2-rev4>
Message-ID: <550C2211.4070104@arrayfire.com>


On 03/20/2015 05:41 AM, Zhigang Gong wrote:
> Thanks for the patch, could you reply with your signed-by signature
> for the patch? Then I can merge it.

I'm happy to contribute. Attached you will find my signed patch.

Kind regards,
Brian
-------------- next part --------------
A non-text attachment was scrubbed...
Name: 0001-BUGFIX-Prohibit-make-package-from-doing-system-insta.patch
Type: text/x-patch
Size: 1220 bytes
Desc: not available
URL: <http://lists.freedesktop.org/archives/beignet/attachments/20150320/402522e7/attachment.bin>

From david.couturier at polymtl.ca  Fri Mar 20 14:08:31 2015
From: david.couturier at polymtl.ca (David Couturier)
Date: Fri, 20 Mar 2015 17:08:31 -0400
Subject: [Beignet] [PATCH] Fix: Event callback that not executed when
 command already marked CL_COMPLETE
In-Reply-To: <7597C9376C272A4AB2D29E91550B7B090141E537@shsmsx102.ccr.corp.intel.com>
References: <550B67A5.6070207@polymtl.ca>
 <7597C9376C272A4AB2D29E91550B7B090141E537@shsmsx102.ccr.corp.intel.com>
Message-ID: <550C8C4F.80001@polymtl.ca>

I modified the commit as suggested. Also, I noticed that the callback 
handling was not thread safe. I modified the general process to be 
thread safe.

# PATCH BEGINS HERE:

When trying to register a callback on the clEnqueueReadBuffer command,
since it is processed synchronously all the time, the command was marked
CL_COMPLETE every time. If the event returned by clEnqueueReadBuffer was
then used to register a callback function, no check was made to execute
the callback if necessary.

Modified the handling of the callback registration in
cl_event_set_callback to only call the callback being created if its
status has already been reached.

Added thread safety measures for pfn_notify calls since the status value 
can be changed while executing the callback.

Grouped the pfn_notify calls into a unified function
cl_event_call_callback that handles thread safety: it queues callbacks
in a node list while under the protection of pthread_mutex and then
calls the callbacks outside of the pthread_mutex (this is required
because the callback can deadlock if it calls a cl_api function that
uses the mutex).

Signed-off-by: David Couturier <david.couturier at polymtl.ca>
---
  src/cl_event.c | 77 
++++++++++++++++++++++++++++++++++++++++++----------------
  src/cl_event.h |  4 ++-
  2 files changed, 59 insertions(+), 22 deletions(-)

diff --git a/src/cl_event.c b/src/cl_event.c
index f70e531..eb5d54b 100644
--- a/src/cl_event.c
+++ b/src/cl_event.c
@@ -119,16 +119,7 @@ void cl_event_delete(cl_event event)
      event->queue->last_event = NULL;

    /* Call all user's callback if haven't execute */
-  user_callback *cb = event->user_cb;
-  while(event->user_cb) {
-    cb = event->user_cb;
-    if(cb->executed == CL_FALSE) {
-      cb->executed = CL_TRUE;
-      cb->pfn_notify(event, event->status, cb->user_data);
-    }
-    event->user_cb = cb->next;
-    cl_free(cb);
-  }
+  cl_event_call_callback(event, CL_COMPLETE, CL_TRUE); // CL_COMPLETE 
status will force all callbacks that are not executed to run

    /* delete gpgpu event object */
    if(event->gpgpu_event)
@@ -180,8 +171,22 @@ cl_int cl_event_set_callback(cl_event event ,
    cb->status      = command_exec_callback_type;
    cb->executed    = CL_FALSE;

-  cb->next        = event->user_cb;
-  event->user_cb  = cb;
+
+  // It is possible that the event enqueued is already completed.
+  // clEnqueueReadBuffer can be synchronous and when the callback
+  // is registered after, it still needs to get executed.
+  pthread_mutex_lock(&event->ctx->event_lock); // Thread safety 
required: operations on the event->status can be made from many 
different threads
+  if(event->status <= command_exec_callback_type) {
+	  /* Call user callback */
+	  pthread_mutex_unlock(&event->ctx->event_lock); // pfn_notify can 
call clFunctions that use the event_lock and from here it's not required
+	  cb->pfn_notify(event, event->status, cb->user_data);
+	  cl_free(cb);
+  } else {
+	  // Enqueue to callback list
+	  cb->next        = event->user_cb;
+	  event->user_cb  = cb;
+	  pthread_mutex_unlock(&event->ctx->event_lock);
+  }

  exit:
    return err;
@@ -388,9 +393,46 @@ error:
    goto exit;
  }

+void cl_event_call_callback(cl_event event, cl_int status, cl_bool 
free_cb) {
+	user_callback *user_cb = NULL;
+	user_callback *queue_cb = NULL; // For thread safety, we create a 
queue that holds user_callback's pfn_notify contents
+	user_callback *temp_cb = NULL;
+	user_cb = event->user_cb;
+	pthread_mutex_lock(&event->ctx->event_lock);
+	while(user_cb) {
+		if(user_cb->status >= status
+				&& user_cb->executed == CL_FALSE) { // Added check to not execute a 
callback when it was already handled
+			user_cb->executed = CL_TRUE;
+			temp_cb = cl_malloc(sizeof(user_callback));
+			if(!temp_cb) {
+				break; // Out of memory
+			}
+			temp_cb->pfn_notify = user_cb->pfn_notify; // Minor struct copy to 
call ppfn_notify out of the pthread_mutex
+			temp_cb->user_data = user_cb->user_data;
+			if(free_cb) {
+				cl_free(user_cb);
+			}
+			if(!queue_cb) {
+				queue_cb = temp_cb;
+				queue_cb->next = NULL;
+			} else { // Enqueue
+				temp_cb->next = queue_cb;
+				queue_cb->next = temp_cb;
+			}
+		}
+		user_cb = user_cb->next;
+	}
+	pthread_mutex_unlock(&event->ctx->event_lock);
+	// Calling the callbacks outside of the event_lock is required because 
the callback can call cl_api functions and get deadlocked
+	while(queue_cb) { // For each callback queued, actually execute the 
callback
+		queue_cb->pfn_notify(event, event->status, queue_cb->user_data);
+		temp_cb = queue_cb;
+		queue_cb = queue_cb->next;
+		cl_free(temp_cb);
+	}
+}
  void cl_event_set_status(cl_event event, cl_int status)
  {
-  user_callback *user_cb;
    cl_int ret, i;
    cl_event evt;

@@ -437,14 +479,7 @@ void cl_event_set_status(cl_event event, cl_int status)
    pthread_mutex_unlock(&event->ctx->event_lock);

    /* Call user callback */
-  user_cb = event->user_cb;
-  while(user_cb) {
-    if(user_cb->status >= status) {
-      user_cb->executed = CL_TRUE;
-      user_cb->pfn_notify(event, event->status, user_cb->user_data);
-    }
-    user_cb = user_cb->next;
-  }
+  cl_event_call_callback(event, status, CL_FALSE);

    if(event->type == CL_COMMAND_USER) {
      /* Check all defer enqueue */
diff --git a/src/cl_event.h b/src/cl_event.h
index 0730530..9bb2ac8 100644
--- a/src/cl_event.h
+++ b/src/cl_event.h
@@ -78,8 +78,10 @@ cl_event cl_event_new(cl_context, cl_command_queue, 
cl_command_type, cl_bool);
  void cl_event_delete(cl_event);
  /* Add one more reference to this object */
  void cl_event_add_ref(cl_event);
-/* Rigister a user callback function for specific commond execution 
status */
+/* Register a user callback function for specific commond execution 
status */
  cl_int cl_event_set_callback(cl_event, cl_int, EVENT_NOTIFY, void *);
+/* Execute the event's callback if the event's status supersedes the 
callback's status. Free the callback if specified */
+void cl_event_call_callback(cl_event event, cl_int status, cl_bool 
free_cb);
  /* Check events wait list for enqueue commonds */
  cl_int cl_event_check_waitlist(cl_uint, const cl_event *, cl_event *, 
cl_context);
  /* Wait the all events in wait list complete */
-- 
1.9.1

> One comment. Thanks for finding and fixing it.
>
>> -----Original Message-----
>> From: Beignet [mailto:beignet-bounces at lists.freedesktop.org] On Behalf Of
>> David Couturier
>> Sent: Friday, March 20, 2015 08:20
>> To: Zou, Nanhai
>> Cc: beignet at lists.freedesktop.org
>> Subject: [Beignet] [PATCH] Fix: Event callback that not executed when
>> command already marked CL_COMPLETE
>>
>> When trying to register a callback on the clEnqueueReadBuffer command,
>> since it is processed synchronously all the time, the command was marked
>> CL_COMPLETE every time. If the event returned by clEnqueueReadBuffer
>> was then used to register a callback function, no check was made to
>> execute the callback if necessary.
>>
>> Fixed by adding a check at the end of the cl_event_set_callback function.
>>
>> All tests passed.
>>
>> Signed-off-by: David Couturier <david.couturier at polymtl.ca>
>> ---
>>    src/cl_event.c | 15 +++++++++++++++
>>    1 file changed, 15 insertions(+)
>>
>> diff --git a/src/cl_event.c b/src/cl_event.c index f70e531..df4a5a5 100644
>> --- a/src/cl_event.c
>> +++ b/src/cl_event.c
>> @@ -183,6 +183,21 @@ cl_int cl_event_set_callback(cl_event event ,
>>      cb->next        = event->user_cb;
>>      event->user_cb  = cb;
>>
>> +  // It is possible that the event enqueued is already completed.
>> +  // clEnqueueReadBuffer can be synchronious and when the callback  //
>> + is registered after, it still needs to get executed.
>> +  if(event->status == CL_COMPLETE) {
>> +         /* Call user callback */
>> +         user_callback *user_cb = event->user_cb;
>> +         while(user_cb) {
>> +                 if(user_cb->status >= CL_COMPLETE) {
>> +                         user_cb->executed = CL_TRUE;
>> +                         user_cb->pfn_notify(event, event->status,
>> user_cb->user_data);
>> +                 }
>> +                 user_cb = user_cb->next;
>> +         }
>
> I think only the current callback should be called. Assume the scenario:
> clEnqueueReadBuffer(......,ev);
> clSetEventCallback(ev, CL_SUBMITTED, ...);
> clSetEventCallback(ev, CL_COMPLETE, ....);
> By the second clSetEventCallback call, the first callback has already been executed, so only the second callback needs to be executed.
> So the current callback needs to be executed when the event's status <= command_exec_callback_type.
>
>> +  }
>> +
>>    exit:
>>      return err;
>>    error:
>> --
>> 1.9.1
>> _______________________________________________
>> Beignet mailing list
>> Beignet at lists.freedesktop.org
>> http://lists.freedesktop.org/mailman/listinfo/beignet
> _______________________________________________
> Beignet mailing list
> Beignet at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/beignet
>

From rebecca_palmer at zoho.com  Sat Mar 21 00:34:22 2015
From: rebecca_palmer at zoho.com (Rebecca N. Palmer)
Date: Sat, 21 Mar 2015 07:34:22 +0000
Subject: [Beignet] [PATCH] Use matching versions of clang/llvm and
	libclang/libllvm
Message-ID: <550D1EFE.7030106@zoho.com>

Compile the OpenCL standard library with the same version of clang
as will compile OpenCL user code, not plain "clang" (i.e. the
system default version, which may be different).

Signed-off-by: Rebecca Palmer <rebecca_palmer at zoho.com>

diff --git a/CMake/FindLLVM.cmake b/CMake/FindLLVM.cmake
index e214437..fa13f1d 100644
--- a/CMake/FindLLVM.cmake
+++ b/CMake/FindLLVM.cmake
@@ -23,13 +23,15 @@ else (LLVM_CONFIG_EXECUTABLE)
    message(FATAL_ERROR "Could NOT find LLVM executable, please add -DLLVM_INSTALL_DIR=/path/to/llvm-config/ in cmake command")
  endif (LLVM_CONFIG_EXECUTABLE)
  
+execute_process(
+  COMMAND ${LLVM_CONFIG_EXECUTABLE} --version
+  OUTPUT_VARIABLE LLVM_VERSION
+  OUTPUT_STRIP_TRAILING_WHITESPACE
+)
+string(REGEX REPLACE "([0-9])\\.([0-9]*).*" "\\1\\2" LLVM_VERSION_NODOT ${LLVM_VERSION})
+string(REGEX REPLACE "([0-9])\\.([0-9]*).*" "\\1.\\2" LLVM_VERSION_NOPATCH ${LLVM_VERSION})
  if (LLVM_FIND_VERSION_MAJOR AND LLVM_FIND_VERSION_MINOR)
    SET(LLVM_FIND_VERSION_NODOT "${LLVM_FIND_VERSION_MAJOR}${LLVM_FIND_VERSION_MINOR}")
-  execute_process(
-    COMMAND ${LLVM_CONFIG_EXECUTABLE} --version
-    OUTPUT_VARIABLE LLVM_VERSION
-  )
-  string(REGEX REPLACE "([0-9])\\.([0-9]*).*" "\\1\\2 " LLVM_VERSION_NODOT ${LLVM_VERSION})
    if (LLVM_VERSION_NODOT VERSION_LESS LLVM_FIND_VERSION_NODOT)
      message(FATAL_ERROR "imcompatible LLVM version ${LLVM_VERSION} required ${LLVM_FIND_VERSION}")
    else (LLVM_VERSION_NODOT VERSION_LESS LLVM_FIND_VERSION_NODOT)
@@ -42,6 +44,25 @@ if (LLVM_FIND_VERSION_MAJOR AND LLVM_FIND_VERSION_MINOR)
    endif (LLVM_VERSION_NODOT VERSION_LESS LLVM_FIND_VERSION_NODOT)
  endif (LLVM_FIND_VERSION_MAJOR AND LLVM_FIND_VERSION_MINOR)
  
+if (LLVM_INSTALL_DIR)
+  find_program(CLANG_EXECUTABLE
+               NAMES clang-${LLVM_VERSION_NODOT} clang-${LLVM_VERSION_NOPATCH} clang
+               PATHS ${LLVM_INSTALL_DIR} NO_DEFAULT_PATH)
+  find_program(LLVM_AS_EXECUTABLE
+               NAMES llvm-as-${LLVM_VERSION_NODOT} llvm-as-${LLVM_VERSION_NOPATCH} llvm-as
+               PATHS ${LLVM_INSTALL_DIR} NO_DEFAULT_PATH)
+  find_program(LLVM_LINK_EXECUTABLE
+               NAMES llvm-link-${LLVM_VERSION_NODOT} llvm-link-${LLVM_VERSION_NOPATCH} llvm-link
+               PATHS ${LLVM_INSTALL_DIR} NO_DEFAULT_PATH)
+else (LLVM_INSTALL_DIR)
+  find_program(CLANG_EXECUTABLE
+               NAMES clang-${LLVM_VERSION_NODOT} clang-${LLVM_VERSION_NOPATCH} clang)
+  find_program(LLVM_AS_EXECUTABLE
+               NAMES llvm-as-${LLVM_VERSION_NODOT} llvm-as-${LLVM_VERSION_NOPATCH} llvm-as)
+  find_program(LLVM_LINK_EXECUTABLE
+               NAMES llvm-link-${LLVM_VERSION_NODOT} llvm-link-${LLVM_VERSION_NOPATCH} llvm-link)
+endif (LLVM_INSTALL_DIR)
+
  execute_process(
    COMMAND ${LLVM_CONFIG_EXECUTABLE} --includedir
    OUTPUT_VARIABLE LLVM_INCLUDE_DIR
diff --git a/backend/src/libocl/CMakeLists.txt b/backend/src/libocl/CMakeLists.txt
index 16f00ee..6b825b0 100644
--- a/backend/src/libocl/CMakeLists.txt
+++ b/backend/src/libocl/CMakeLists.txt
@@ -136,7 +136,7 @@ MACRO(ADD_CL_TO_BC_TARGET _file)
      ADD_CUSTOM_COMMAND(OUTPUT ${output_name}
  	COMMAND mkdir -p ${OCL_OBJECT_DIR}/
  	#COMMAND echo ${LLVM_INSTALL_DIR}clang -cc1 ${CLANG_OCL_FLAGS} -I ${LIBOCL_BINARY_DIR}/include/ -emit-llvm-bc -triple spir -o ${output_name} -x cl ${_file}
-	COMMAND ${LLVM_INSTALL_DIR}clang -cc1 ${CLANG_OCL_FLAGS} -I ${OCL_OBJECT_DIR}/include/ -emit-llvm-bc -triple spir -o ${output_name} -x cl ${_file}
+	COMMAND ${CLANG_EXECUTABLE} -cc1 ${CLANG_OCL_FLAGS} -I ${OCL_OBJECT_DIR}/include/ -emit-llvm-bc -triple spir -o ${output_name} -x cl ${_file}
  	DEPENDS ${_file} ${OCL_HEADER_FILES}
  	COMMENT "Compiling ${_file}"
  	)
@@ -175,7 +175,7 @@ MACRO(ADD_LL_TO_BC_TARGET M)
      ADD_CUSTOM_COMMAND(OUTPUT ${output_name}
  	COMMAND mkdir -p ${OCL_OBJECT_DIR}/
  	#COMMAND echo ${LLVM_INSTALL_DIR}llvm-as -o ${output_name} ${srcll_name}
-	COMMAND ${LLVM_INSTALL_DIR}llvm-as -o ${output_name} ${srcll_name}
+	COMMAND ${LLVM_AS_EXECUTABLE} -o ${output_name} ${srcll_name}
  	DEPENDS ${srcll_name}
  	COMMENT "Compiling ${srcll_name}"
  	)
@@ -193,21 +193,21 @@ ENDFOREACH(f)
  ADD_CUSTOM_COMMAND(OUTPUT ${OCL_OBJECT_DIR}/beignet.bc
      COMMAND mkdir -p ${LIBOCL_BINARY_DIR}/lib/
      #COMMAND echo llvm-link -o ${LIBOCL_BINARY_DIR}/lib/beignet.bc ${OCL_BC_FILES}
-    COMMAND ${LLVM_INSTALL_DIR}llvm-link -o ${OCL_OBJECT_DIR}/beignet.bc ${OCL_BC_FILES}
+    COMMAND ${LLVM_LINK_EXECUTABLE} -o ${OCL_OBJECT_DIR}/beignet.bc ${OCL_BC_FILES}
      DEPENDS ${OCL_BC_FILES}
      COMMENT "Generate the bitcode file: ${OCL_OBJECT_DIR}/beignet.bc"
      )
  
  ADD_CUSTOM_COMMAND(OUTPUT ${OCL_OBJECT_DIR}/beignet.local.pch
      COMMAND mkdir -p ${OCL_OBJECT_DIR}
-    COMMAND ${LLVM_INSTALL_DIR}clang -cc1 ${CLANG_OCL_FLAGS} -triple spir -I ${OCL_OBJECT_DIR}/include/ -emit-pch -x cl ${OCL_OBJECT_DIR}/include/ocl.h -o ${OCL_OBJECT_DIR}/beignet.local.pch
+    COMMAND ${CLANG_EXECUTABLE} -cc1 ${CLANG_OCL_FLAGS} -triple spir -I ${OCL_OBJECT_DIR}/include/ -emit-pch -x cl ${OCL_OBJECT_DIR}/include/ocl.h -o ${OCL_OBJECT_DIR}/beignet.local.pch
      DEPENDS ${OCL_HEADER_FILES}
      COMMENT "Generate the pch file: ${OCL_OBJECT_DIR}/beignet.local.pch"
      )
  
  ADD_CUSTOM_COMMAND(OUTPUT ${OCL_OBJECT_DIR}/beignet.pch
      COMMAND mkdir -p ${OCL_OBJECT_DIR}
-    COMMAND ${LLVM_INSTALL_DIR}clang -cc1 ${CLANG_OCL_FLAGS} -triple spir -I ${OCL_OBJECT_DIR}/include/ --relocatable-pch -emit-pch -isysroot ${LIBOCL_BINARY_DIR} -x cl ${OCL_OBJECT_DIR}/include/ocl.h -o ${OCL_OBJECT_DIR}/beignet.pch
+    COMMAND ${CLANG_EXECUTABLE} -cc1 ${CLANG_OCL_FLAGS} -triple spir -I ${OCL_OBJECT_DIR}/include/ --relocatable-pch -emit-pch -isysroot ${LIBOCL_BINARY_DIR} -x cl ${OCL_OBJECT_DIR}/include/ocl.h -o ${OCL_OBJECT_DIR}/beignet.pch
      DEPENDS ${OCL_HEADER_FILES}
      COMMENT "Generate the pch file: ${OCL_OBJECT_DIR}/beignet.pch"
      )


From rebecca_palmer at zoho.com  Sat Mar 21 00:35:56 2015
From: rebecca_palmer at zoho.com (Rebecca N. Palmer)
Date: Sat, 21 Mar 2015 07:35:56 +0000
Subject: [Beignet] FindLLVM: allow LLVM/Clang 3.6
Message-ID: <550D1F5C.7030501@zoho.com>

As beignet now works with LLVM/Clang 3.6, accept this version
when searching for llvm-config.

Signed-off-by: Rebecca Palmer <rebecca_palmer at zoho.com>

diff --git a/CMake/FindLLVM.cmake b/CMake/FindLLVM.cmake
index e214437..fa13f1d 100644
--- a/CMake/FindLLVM.cmake
+++ b/CMake/FindLLVM.cmake
@@ -8,12 +8,12 @@
  # LLVM_FOUND       - True if llvm found.
  if (LLVM_INSTALL_DIR)
    find_program(LLVM_CONFIG_EXECUTABLE
-               NAMES llvm-config-35 llvm-config-3.5 llvm-config-33 llvm-config-3.3 llvm-config-34 llvm-config-3.4 llvm-config
+               NAMES llvm-config-35 llvm-config-3.5 llvm-config-36 llvm-config-3.6 llvm-config-33 llvm-config-3.3 llvm-config-34 llvm-config-3.4 llvm-config
                 DOC "llvm-config executable"
                 PATHS ${LLVM_INSTALL_DIR} NO_DEFAULT_PATH)
  else (LLVM_INSTALL_DIR)
    find_program(LLVM_CONFIG_EXECUTABLE
-               NAMES llvm-config-35 llvm-config-3.5 llvm-config-33 llvm-config-3.3 llvm-config-34 llvm-config-3.4 llvm-config
+               NAMES llvm-config-35 llvm-config-3.5 llvm-config-36 llvm-config-3.6 llvm-config-33 llvm-config-3.3 llvm-config-34 llvm-config-3.4 llvm-config
                 DOC "llvm-config executable")
  endif (LLVM_INSTALL_DIR)
  

From rebecca_palmer at zoho.com  Sat Mar 21 00:37:00 2015
From: rebecca_palmer at zoho.com (Rebecca N. Palmer)
Date: Sat, 21 Mar 2015 07:37:00 +0000
Subject: [Beignet] [PATCH] Don't crash if device inaccessible
Message-ID: <550D1F9C.3050209@zoho.com>

If /dev/dri/cardX is inaccessible, return CL_DEVICE_NOT_FOUND,
don't assert-fail.

Signed-off-by: Rebecca Palmer <rebecca_palmer at zoho.com>

diff --git a/src/x11/dricommon.c b/src/x11/dricommon.c
index 03f542c..16f50e4 100644
--- a/src/x11/dricommon.c
+++ b/src/x11/dricommon.c
@@ -284,7 +284,6 @@ getDRI2State(Display* dpy, int screen, char **driver_name)
      goto err_out;
  
    fd = open(device_name, O_RDWR);
-  assert(fd >= 0);
  
    if (fd < 0)
      goto err_out;


From rebecca_palmer at zoho.com  Sun Mar 22 15:18:06 2015
From: rebecca_palmer at zoho.com (Rebecca N. Palmer)
Date: Sun, 22 Mar 2015 22:18:06 +0000
Subject: [Beignet] Intermittent runtime_marker_list() test failure
Message-ID: <550F3F9E.90606@zoho.com>

runtime_marker_list()    [FAILED]
     Error: ((int*)buf_data[0])[i] == (int)value + 0x3
   at file /tmp/buildd/beignet-1.0.2/utests/runtime_marker_list.cpp, 
function runtime_marker_list, line 66

As this is only the second time I've seen this (the first was back in 
0.9.3), it appears to be rare: maybe 1 in 100.


From rong.r.yang at intel.com  Sun Mar 22 23:25:36 2015
From: rong.r.yang at intel.com (Yang Rong)
Date: Mon, 23 Mar 2015 14:25:36 +0800
Subject: [Beignet] [PATCH] BDW: Refine I64HADD and I64RHADD.
Message-ID: <1427091936-23944-1-git-send-email-rong.r.yang@intel.com>

HADD is equal to (src0>>1) + (src1>>1) + ((src0&0x1) & (src1&0x1)),
and RHADD is equal to (src0>>1) + (src1>>1) + ((src0&0x1) | (src1&0x1)).

Signed-off-by: Yang Rong <rong.r.yang at intel.com>
---
 backend/src/backend/gen8_context.cpp       | 114 ++++-------------------------
 backend/src/backend/gen_insn_selection.cpp |   8 +-
 2 files changed, 20 insertions(+), 102 deletions(-)

diff --git a/backend/src/backend/gen8_context.cpp b/backend/src/backend/gen8_context.cpp
index 3f57cf6..b136902 100644
--- a/backend/src/backend/gen8_context.cpp
+++ b/backend/src/backend/gen8_context.cpp
@@ -651,58 +651,21 @@ namespace gbe
     GenRegister tmp0 = ra->genReg(insn.dst(1));
     GenRegister tmp1 = ra->genReg(insn.dst(2));
     GenRegister tmp_dst = ra->genReg(insn.dst(3));
-    int execWidth = p->curr.execWidth;
 
     /* Src0 and Src1 are always unsigned long type.*/
     GBE_ASSERT(src0.type == GEN_TYPE_UL && src1.type == GEN_TYPE_UL);
     dst.type = src0.type;
-    tmp0.type = tmp1.type = GEN_TYPE_UD;
+    tmp0.type = tmp1.type = GEN_TYPE_UL;
     tmp_dst.type = GEN_TYPE_UL;
 
     GBE_ASSERT(tmp_dst.subnr == 0);
-    GenRegister dl = tmp_dst.hstride == GEN_HORIZONTAL_STRIDE_0 ? GenRegister::retype(tmp_dst, GEN_TYPE_UD) :
-      GenRegister::retype(GenRegister::ud16grf(tmp_dst.nr, tmp_dst.subnr), GEN_TYPE_UD);
-    GenRegister dh = tmp_dst.hstride == GEN_HORIZONTAL_STRIDE_0 ?
-      GenRegister::retype(GenRegister::offset(tmp_dst, 0, 4), GEN_TYPE_UD) :
-      GenRegister::retype(GenRegister::ud16grf(tmp_dst.nr + execWidth / 8, tmp_dst.subnr), GEN_TYPE_UD);
-    GenRegister s0l = src0.hstride == GEN_HORIZONTAL_STRIDE_0 ?
-      GenRegister::retype(src0, GEN_TYPE_UD) : GenRegister::unpacked_ud(src0.nr, src0.subnr);
-    GenRegister s0h = src0.hstride == GEN_HORIZONTAL_STRIDE_0 ?
-      GenRegister::retype(GenRegister::offset(src0, 0, 4), GEN_TYPE_UD) :
-      GenRegister::unpacked_ud(src0.nr, src0.subnr + 1);
-    GenRegister s1l = src1.hstride == GEN_HORIZONTAL_STRIDE_0 ?
-      GenRegister::retype(src1, GEN_TYPE_UD) : GenRegister::unpacked_ud(src1.nr, src1.subnr);
-    GenRegister s1h = src1.hstride == GEN_HORIZONTAL_STRIDE_0 ?
-      GenRegister::retype(GenRegister::offset(src1, 0, 4), GEN_TYPE_UD) :
-      GenRegister::unpacked_ud(src1.nr, src1.subnr + 1);
-
-    GenRegister acc0 = GenRegister::retype(GenRegister::acc(), GEN_TYPE_D);
-    p->push();
-    p->curr.execWidth = 8;
-    p->ADDC(dl, s0l, s1l);
-    p->MOV(tmp0, acc0);
-    p->ADDC(dh, s0h, s1h);
-    p->MOV(tmp1, acc0);
-    p->ADDC(dh, dh, tmp0);
-    p->MOV(tmp0, acc0);
-    p->ADD(tmp1, tmp0, tmp1);
-
-    if (execWidth == 16) {
-      p->curr.quarterControl = 1;
-      p->ADDC(GenRegister::Qn(dl, 1), GenRegister::Qn(s0l, 1), GenRegister::Qn(s1l, 1));
-      p->MOV(GenRegister::Qn(tmp0, 1), acc0);
-      p->ADDC(GenRegister::Qn(dh, 1), GenRegister::Qn(s0h, 1), GenRegister::Qn(s1h, 1));
-      p->MOV(GenRegister::Qn(tmp1, 1), acc0);
-      p->ADDC(GenRegister::Qn(dh, 1), GenRegister::Qn(dh, 1), GenRegister::Qn(tmp0, 1));
-      p->MOV(GenRegister::Qn(tmp0, 1), acc0);
-      p->ADD(GenRegister::Qn(tmp1, 1), GenRegister::Qn(tmp0, 1), GenRegister::Qn(tmp1, 1));
-    }
-    p->pop();
-
-    packLongVec(GenRegister::retype(tmp_dst, GEN_TYPE_UD), GenRegister::retype(dst, GEN_TYPE_UD), execWidth);
-
-    p->SHR(dst, dst, GenRegister::immud(1));
-    p->SHL(tmp_dst, tmp1, GenRegister::immud(63));
+    //hadd = (src0>>1) + (src1>>1) + ((src0&0x1) & (src1&0x1))
+    p->AND(tmp0, src0, GenRegister::immud(1));
+    p->AND(tmp1, src1, GenRegister::immud(1));
+    p->AND(tmp_dst, tmp0, tmp1);
+    p->SHR(tmp0, src0, GenRegister::immud(1));
+    p->SHR(tmp1, src1, GenRegister::immud(1));
+    p->ADD(dst, tmp0, tmp1);
     p->ADD(dst, dst, tmp_dst);
   }
 
@@ -714,66 +677,21 @@ namespace gbe
     GenRegister tmp0 = ra->genReg(insn.dst(1));
     GenRegister tmp1 = ra->genReg(insn.dst(2));
     GenRegister tmp_dst = ra->genReg(insn.dst(3));
-    int execWidth = p->curr.execWidth;
 
     /* Src0 and Src1 are always unsigned long type.*/
     GBE_ASSERT(src0.type == GEN_TYPE_UL && src1.type == GEN_TYPE_UL);
     dst.type = src0.type;
-    tmp0.type = tmp1.type = GEN_TYPE_UD;
+    tmp0.type = tmp1.type = GEN_TYPE_UL;
     tmp_dst.type = GEN_TYPE_UL;
 
     GBE_ASSERT(tmp_dst.subnr == 0);
-    GenRegister dl = tmp_dst.hstride == GEN_HORIZONTAL_STRIDE_0 ? GenRegister::retype(tmp_dst, GEN_TYPE_UD) :
-      GenRegister::retype(GenRegister::ud16grf(tmp_dst.nr, tmp_dst.subnr), GEN_TYPE_UD);
-    GenRegister dh = tmp_dst.hstride == GEN_HORIZONTAL_STRIDE_0 ?
-      GenRegister::retype(GenRegister::offset(tmp_dst, 0, 4), GEN_TYPE_UD) :
-      GenRegister::retype(GenRegister::ud16grf(tmp_dst.nr + execWidth / 8, tmp_dst.subnr), GEN_TYPE_UD);
-    GenRegister s0l = src0.hstride == GEN_HORIZONTAL_STRIDE_0 ?
-      GenRegister::retype(src0, GEN_TYPE_UD) : GenRegister::unpacked_ud(src0.nr, src0.subnr);
-    GenRegister s0h = src0.hstride == GEN_HORIZONTAL_STRIDE_0 ?
-      GenRegister::retype(GenRegister::offset(src0, 0, 4), GEN_TYPE_UD) :
-      GenRegister::unpacked_ud(src0.nr, src0.subnr + 1);
-    GenRegister s1l = src1.hstride == GEN_HORIZONTAL_STRIDE_0 ?
-      GenRegister::retype(src1, GEN_TYPE_UD) : GenRegister::unpacked_ud(src1.nr, src1.subnr);
-    GenRegister s1h = src1.hstride == GEN_HORIZONTAL_STRIDE_0 ?
-      GenRegister::retype(GenRegister::offset(src1, 0, 4), GEN_TYPE_UD) :
-      GenRegister::unpacked_ud(src1.nr, src1.subnr + 1);
-
-    GenRegister acc0 = GenRegister::retype(GenRegister::acc(), GEN_TYPE_D);
-    p->push();
-    p->curr.execWidth = 8;
-    p->ADDC(dl, s0l, s1l);
-    p->MOV(tmp0, acc0);
-    p->ADDC(dl, dl, GenRegister::immud(1));
-    p->MOV(tmp1, acc0);
-    p->ADD(tmp0, tmp0, tmp1);
-
-    p->ADDC(dh, s0h, s1h);
-    p->MOV(tmp1, acc0);
-    p->ADDC(dh, dh, tmp0);
-    p->MOV(tmp0, acc0);
-    p->ADD(tmp1, tmp0, tmp1);
-
-    if (execWidth == 16) {
-      p->curr.quarterControl = 1;
-      p->ADDC(GenRegister::Qn(dl, 1), GenRegister::Qn(s0l, 1), GenRegister::Qn(s1l, 1));
-      p->MOV(GenRegister::Qn(tmp0, 1), acc0);
-      p->ADDC(GenRegister::Qn(dl, 1), GenRegister::Qn(dl, 1), GenRegister::immud(1));
-      p->MOV(GenRegister::Qn(tmp1, 1), acc0);
-      p->ADD(GenRegister::Qn(tmp0, 1), GenRegister::Qn(tmp0, 1), GenRegister::Qn(tmp1, 1));
-
-      p->ADDC(GenRegister::Qn(dh, 1), GenRegister::Qn(s0h, 1), GenRegister::Qn(s1h, 1));
-      p->MOV(GenRegister::Qn(tmp1, 1), acc0);
-      p->ADDC(GenRegister::Qn(dh, 1), GenRegister::Qn(dh, 1), GenRegister::Qn(tmp0, 1));
-      p->MOV(GenRegister::Qn(tmp0, 1), acc0);
-      p->ADD(GenRegister::Qn(tmp1, 1), GenRegister::Qn(tmp0, 1), GenRegister::Qn(tmp1, 1));
-    }
-    p->pop();
-
-    packLongVec(GenRegister::retype(tmp_dst, GEN_TYPE_UD), GenRegister::retype(dst, GEN_TYPE_UD), execWidth);
-
-    p->SHR(dst, dst, GenRegister::immud(1));
-    p->SHL(tmp_dst, tmp1, GenRegister::immud(63));
+    //rhadd = (src0>>1) + (src1>>1) + ((src0&0x1) | (src1&0x1))
+    p->AND(tmp0, src0, GenRegister::immud(1));
+    p->AND(tmp1, src1, GenRegister::immud(1));
+    p->OR(tmp_dst, tmp0, tmp1);
+    p->SHR(tmp0, src0, GenRegister::immud(1));
+    p->SHR(tmp1, src1, GenRegister::immud(1));
+    p->ADD(dst, tmp0, tmp1);
     p->ADD(dst, dst, tmp_dst);
   }
 
diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp
index c240261..c00cda5 100644
--- a/backend/src/backend/gen_insn_selection.cpp
+++ b/backend/src/backend/gen_insn_selection.cpp
@@ -2516,8 +2516,8 @@ namespace gbe
                 tmp[i] = sel.selReg(sel.reg(FAMILY_DWORD));
               sel.I64HADD(dst, src0, src1, tmp, 4);
             } else {
-              tmp[0] = sel.selReg(sel.reg(FAMILY_DWORD), ir::TYPE_U64);
-              tmp[1] = sel.selReg(sel.reg(FAMILY_DWORD), ir::TYPE_U64);
+              tmp[0] = sel.selReg(sel.reg(FAMILY_QWORD), ir::TYPE_U64);
+              tmp[1] = sel.selReg(sel.reg(FAMILY_QWORD), ir::TYPE_U64);
               tmp[2] = sel.selReg(sel.reg(FAMILY_QWORD), ir::TYPE_U64);
               sel.I64HADD(dst, src0, src1, tmp, 3);
             }
@@ -2531,8 +2531,8 @@ namespace gbe
                 tmp[i] = sel.selReg(sel.reg(FAMILY_DWORD));
               sel.I64RHADD(dst, src0, src1, tmp, 4);
             } else {
-              tmp[0] = sel.selReg(sel.reg(FAMILY_DWORD), ir::TYPE_U64);
-              tmp[1] = sel.selReg(sel.reg(FAMILY_DWORD), ir::TYPE_U64);
+              tmp[0] = sel.selReg(sel.reg(FAMILY_QWORD), ir::TYPE_U64);
+              tmp[1] = sel.selReg(sel.reg(FAMILY_QWORD), ir::TYPE_U64);
               tmp[2] = sel.selReg(sel.reg(FAMILY_QWORD), ir::TYPE_U64);
               sel.I64RHADD(dst, src0, src1, tmp, 3);
             }
-- 
2.1.0


From ruiling.song at intel.com  Mon Mar 23 00:44:11 2015
From: ruiling.song at intel.com (Song, Ruiling)
Date: Mon, 23 Mar 2015 07:44:11 +0000
Subject: [Beignet] [PATCH] BDW: Refine I64HADD and I64RHADD.
In-Reply-To: <1427091936-23944-1-git-send-email-rong.r.yang@intel.com>
References: <1427091936-23944-1-git-send-email-rong.r.yang@intel.com>
Message-ID: <148B1B7A67D1C24B9EF0BE42EA4977062B7F95AE@SHSMSX101.ccr.corp.intel.com>

Good idea, the patch LGTM. 

> -----Original Message-----
> From: Beignet [mailto:beignet-bounces at lists.freedesktop.org] On Behalf Of
> Yang Rong
> Sent: Monday, March 23, 2015 2:26 PM
> To: beignet at lists.freedesktop.org
> Cc: Yang, Rong R
> Subject: [Beignet] [PATCH] BDW: Refine I64HADD and I64RHADD.
> 
> HADD is equal to (src0>>1) + (src1>>1) + ((src0&0x1) & (src1&0x1)), and
> RHADD is equal to (src0>>1) + (src1>>1) + ((src0&0x1) | (src1&0x1)).
> 
> Signed-off-by: Yang Rong <rong.r.yang at intel.com>
> ---
>  backend/src/backend/gen8_context.cpp       | 114
> ++++-------------------------
>  backend/src/backend/gen_insn_selection.cpp |   8 +-
>  2 files changed, 20 insertions(+), 102 deletions(-)
> 
> diff --git a/backend/src/backend/gen8_context.cpp
> b/backend/src/backend/gen8_context.cpp
> index 3f57cf6..b136902 100644
> --- a/backend/src/backend/gen8_context.cpp
> +++ b/backend/src/backend/gen8_context.cpp
> @@ -651,58 +651,21 @@ namespace gbe
>      GenRegister tmp0 = ra->genReg(insn.dst(1));
>      GenRegister tmp1 = ra->genReg(insn.dst(2));
>      GenRegister tmp_dst = ra->genReg(insn.dst(3));
> -    int execWidth = p->curr.execWidth;
> 
>      /* Src0 and Src1 are always unsigned long type.*/
>      GBE_ASSERT(src0.type == GEN_TYPE_UL && src1.type ==
> GEN_TYPE_UL);
>      dst.type = src0.type;
> -    tmp0.type = tmp1.type = GEN_TYPE_UD;
> +    tmp0.type = tmp1.type = GEN_TYPE_UL;
>      tmp_dst.type = GEN_TYPE_UL;
> 
>      GBE_ASSERT(tmp_dst.subnr == 0);
> -    GenRegister dl = tmp_dst.hstride == GEN_HORIZONTAL_STRIDE_0 ?
> GenRegister::retype(tmp_dst, GEN_TYPE_UD) :
> -      GenRegister::retype(GenRegister::ud16grf(tmp_dst.nr,
> tmp_dst.subnr), GEN_TYPE_UD);
> -    GenRegister dh = tmp_dst.hstride == GEN_HORIZONTAL_STRIDE_0 ?
> -      GenRegister::retype(GenRegister::offset(tmp_dst, 0, 4),
> GEN_TYPE_UD) :
> -      GenRegister::retype(GenRegister::ud16grf(tmp_dst.nr + execWidth /
> 8, tmp_dst.subnr), GEN_TYPE_UD);
> -    GenRegister s0l = src0.hstride == GEN_HORIZONTAL_STRIDE_0 ?
> -      GenRegister::retype(src0, GEN_TYPE_UD) :
> GenRegister::unpacked_ud(src0.nr, src0.subnr);
> -    GenRegister s0h = src0.hstride == GEN_HORIZONTAL_STRIDE_0 ?
> -      GenRegister::retype(GenRegister::offset(src0, 0, 4),
> GEN_TYPE_UD) :
> -      GenRegister::unpacked_ud(src0.nr, src0.subnr + 1);
> -    GenRegister s1l = src1.hstride == GEN_HORIZONTAL_STRIDE_0 ?
> -      GenRegister::retype(src1, GEN_TYPE_UD) :
> GenRegister::unpacked_ud(src1.nr, src1.subnr);
> -    GenRegister s1h = src1.hstride == GEN_HORIZONTAL_STRIDE_0 ?
> -      GenRegister::retype(GenRegister::offset(src1, 0, 4),
> GEN_TYPE_UD) :
> -      GenRegister::unpacked_ud(src1.nr, src1.subnr + 1);
> -
> -    GenRegister acc0 = GenRegister::retype(GenRegister::acc(),
> GEN_TYPE_D);
> -    p->push();
> -    p->curr.execWidth = 8;
> -    p->ADDC(dl, s0l, s1l);
> -    p->MOV(tmp0, acc0);
> -    p->ADDC(dh, s0h, s1h);
> -    p->MOV(tmp1, acc0);
> -    p->ADDC(dh, dh, tmp0);
> -    p->MOV(tmp0, acc0);
> -    p->ADD(tmp1, tmp0, tmp1);
> -
> -    if (execWidth == 16) {
> -      p->curr.quarterControl = 1;
> -      p->ADDC(GenRegister::Qn(dl, 1), GenRegister::Qn(s0l, 1),
> GenRegister::Qn(s1l, 1));
> -      p->MOV(GenRegister::Qn(tmp0, 1), acc0);
> -      p->ADDC(GenRegister::Qn(dh, 1), GenRegister::Qn(s0h, 1),
> GenRegister::Qn(s1h, 1));
> -      p->MOV(GenRegister::Qn(tmp1, 1), acc0);
> -      p->ADDC(GenRegister::Qn(dh, 1), GenRegister::Qn(dh, 1),
> GenRegister::Qn(tmp0, 1));
> -      p->MOV(GenRegister::Qn(tmp0, 1), acc0);
> -      p->ADD(GenRegister::Qn(tmp1, 1), GenRegister::Qn(tmp0, 1),
> GenRegister::Qn(tmp1, 1));
> -    }
> -    p->pop();
> -
> -    packLongVec(GenRegister::retype(tmp_dst, GEN_TYPE_UD),
> GenRegister::retype(dst, GEN_TYPE_UD), execWidth);
> -
> -    p->SHR(dst, dst, GenRegister::immud(1));
> -    p->SHL(tmp_dst, tmp1, GenRegister::immud(63));
> +    //hadd = (src0>>1) + (src1>>1) + ((src0&0x1) & (src1&0x1))
> +    p->AND(tmp0, src0, GenRegister::immud(1));
> +    p->AND(tmp1, src1, GenRegister::immud(1));
> +    p->AND(tmp_dst, tmp0, tmp1);
> +    p->SHR(tmp0, src0, GenRegister::immud(1));
> +    p->SHR(tmp1, src1, GenRegister::immud(1));
> +    p->ADD(dst, tmp0, tmp1);
>      p->ADD(dst, dst, tmp_dst);
>    }
> 
> @@ -714,66 +677,21 @@ namespace gbe
>      GenRegister tmp0 = ra->genReg(insn.dst(1));
>      GenRegister tmp1 = ra->genReg(insn.dst(2));
>      GenRegister tmp_dst = ra->genReg(insn.dst(3));
> -    int execWidth = p->curr.execWidth;
> 
>      /* Src0 and Src1 are always unsigned long type.*/
>      GBE_ASSERT(src0.type == GEN_TYPE_UL && src1.type ==
> GEN_TYPE_UL);
>      dst.type = src0.type;
> -    tmp0.type = tmp1.type = GEN_TYPE_UD;
> +    tmp0.type = tmp1.type = GEN_TYPE_UL;
>      tmp_dst.type = GEN_TYPE_UL;
> 
>      GBE_ASSERT(tmp_dst.subnr == 0);
> -    GenRegister dl = tmp_dst.hstride == GEN_HORIZONTAL_STRIDE_0 ?
> GenRegister::retype(tmp_dst, GEN_TYPE_UD) :
> -      GenRegister::retype(GenRegister::ud16grf(tmp_dst.nr,
> tmp_dst.subnr), GEN_TYPE_UD);
> -    GenRegister dh = tmp_dst.hstride == GEN_HORIZONTAL_STRIDE_0 ?
> -      GenRegister::retype(GenRegister::offset(tmp_dst, 0, 4),
> GEN_TYPE_UD) :
> -      GenRegister::retype(GenRegister::ud16grf(tmp_dst.nr + execWidth /
> 8, tmp_dst.subnr), GEN_TYPE_UD);
> -    GenRegister s0l = src0.hstride == GEN_HORIZONTAL_STRIDE_0 ?
> -      GenRegister::retype(src0, GEN_TYPE_UD) :
> GenRegister::unpacked_ud(src0.nr, src0.subnr);
> -    GenRegister s0h = src0.hstride == GEN_HORIZONTAL_STRIDE_0 ?
> -      GenRegister::retype(GenRegister::offset(src0, 0, 4),
> GEN_TYPE_UD) :
> -      GenRegister::unpacked_ud(src0.nr, src0.subnr + 1);
> -    GenRegister s1l = src1.hstride == GEN_HORIZONTAL_STRIDE_0 ?
> -      GenRegister::retype(src1, GEN_TYPE_UD) :
> GenRegister::unpacked_ud(src1.nr, src1.subnr);
> -    GenRegister s1h = src1.hstride == GEN_HORIZONTAL_STRIDE_0 ?
> -      GenRegister::retype(GenRegister::offset(src1, 0, 4),
> GEN_TYPE_UD) :
> -      GenRegister::unpacked_ud(src1.nr, src1.subnr + 1);
> -
> -    GenRegister acc0 = GenRegister::retype(GenRegister::acc(),
> GEN_TYPE_D);
> -    p->push();
> -    p->curr.execWidth = 8;
> -    p->ADDC(dl, s0l, s1l);
> -    p->MOV(tmp0, acc0);
> -    p->ADDC(dl, dl, GenRegister::immud(1));
> -    p->MOV(tmp1, acc0);
> -    p->ADD(tmp0, tmp0, tmp1);
> -
> -    p->ADDC(dh, s0h, s1h);
> -    p->MOV(tmp1, acc0);
> -    p->ADDC(dh, dh, tmp0);
> -    p->MOV(tmp0, acc0);
> -    p->ADD(tmp1, tmp0, tmp1);
> -
> -    if (execWidth == 16) {
> -      p->curr.quarterControl = 1;
> -      p->ADDC(GenRegister::Qn(dl, 1), GenRegister::Qn(s0l, 1),
> GenRegister::Qn(s1l, 1));
> -      p->MOV(GenRegister::Qn(tmp0, 1), acc0);
> -      p->ADDC(GenRegister::Qn(dl, 1), GenRegister::Qn(dl, 1),
> GenRegister::immud(1));
> -      p->MOV(GenRegister::Qn(tmp1, 1), acc0);
> -      p->ADD(GenRegister::Qn(tmp0, 1), GenRegister::Qn(tmp0, 1),
> GenRegister::Qn(tmp1, 1));
> -
> -      p->ADDC(GenRegister::Qn(dh, 1), GenRegister::Qn(s0h, 1),
> GenRegister::Qn(s1h, 1));
> -      p->MOV(GenRegister::Qn(tmp1, 1), acc0);
> -      p->ADDC(GenRegister::Qn(dh, 1), GenRegister::Qn(dh, 1),
> GenRegister::Qn(tmp0, 1));
> -      p->MOV(GenRegister::Qn(tmp0, 1), acc0);
> -      p->ADD(GenRegister::Qn(tmp1, 1), GenRegister::Qn(tmp0, 1),
> GenRegister::Qn(tmp1, 1));
> -    }
> -    p->pop();
> -
> -    packLongVec(GenRegister::retype(tmp_dst, GEN_TYPE_UD),
> GenRegister::retype(dst, GEN_TYPE_UD), execWidth);
> -
> -    p->SHR(dst, dst, GenRegister::immud(1));
> -    p->SHL(tmp_dst, tmp1, GenRegister::immud(63));
> +    //rhadd = (src0>>1) + (src1>>1) + ((src0&0x1) | (src1&0x1))
> +    p->AND(tmp0, src0, GenRegister::immud(1));
> +    p->AND(tmp1, src1, GenRegister::immud(1));
> +    p->OR(tmp_dst, tmp0, tmp1);
> +    p->SHR(tmp0, src0, GenRegister::immud(1));
> +    p->SHR(tmp1, src1, GenRegister::immud(1));
> +    p->ADD(dst, tmp0, tmp1);
>      p->ADD(dst, dst, tmp_dst);
>    }
> 
> diff --git a/backend/src/backend/gen_insn_selection.cpp
> b/backend/src/backend/gen_insn_selection.cpp
> index c240261..c00cda5 100644
> --- a/backend/src/backend/gen_insn_selection.cpp
> +++ b/backend/src/backend/gen_insn_selection.cpp
> @@ -2516,8 +2516,8 @@ namespace gbe
>                  tmp[i] = sel.selReg(sel.reg(FAMILY_DWORD));
>                sel.I64HADD(dst, src0, src1, tmp, 4);
>              } else {
> -              tmp[0] = sel.selReg(sel.reg(FAMILY_DWORD),
> ir::TYPE_U64);
> -              tmp[1] = sel.selReg(sel.reg(FAMILY_DWORD),
> ir::TYPE_U64);
> +              tmp[0] = sel.selReg(sel.reg(FAMILY_QWORD),
> ir::TYPE_U64);
> +              tmp[1] = sel.selReg(sel.reg(FAMILY_QWORD),
> ir::TYPE_U64);
>                tmp[2] = sel.selReg(sel.reg(FAMILY_QWORD),
> ir::TYPE_U64);
>                sel.I64HADD(dst, src0, src1, tmp, 3);
>              }
> @@ -2531,8 +2531,8 @@ namespace gbe
>                  tmp[i] = sel.selReg(sel.reg(FAMILY_DWORD));
>                sel.I64RHADD(dst, src0, src1, tmp, 4);
>              } else {
> -              tmp[0] = sel.selReg(sel.reg(FAMILY_DWORD),
> ir::TYPE_U64);
> -              tmp[1] = sel.selReg(sel.reg(FAMILY_DWORD),
> ir::TYPE_U64);
> +              tmp[0] = sel.selReg(sel.reg(FAMILY_QWORD),
> ir::TYPE_U64);
> +              tmp[1] = sel.selReg(sel.reg(FAMILY_QWORD),
> ir::TYPE_U64);
>                tmp[2] = sel.selReg(sel.reg(FAMILY_QWORD),
> ir::TYPE_U64);
>                sel.I64RHADD(dst, src0, src1, tmp, 3);
>              }
> --
> 2.1.0
> 
> _______________________________________________
> Beignet mailing list
> Beignet at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/beignet

From Junyan.he at inbox.com  Mon Mar 23 01:03:59 2015
From: Junyan.he at inbox.com (He Junyan)
Date: Mon, 23 Mar 2015 16:03:59 +0800
Subject: [Beignet] [PATCH] BDW: Refine I64HADD and I64RHADD.
In-Reply-To: <148B1B7A67D1C24B9EF0BE42EA4977062B7F95AE@SHSMSX101.ccr.corp.intel.com>
References: <1427091936-23944-1-git-send-email-rong.r.yang@intel.com>
 <148B1B7A67D1C24B9EF0BE42EA4977062B7F95AE@SHSMSX101.ccr.corp.intel.com>
Message-ID: <550FC8EF.7020301@inbox.com>

OK, it's a better way to avoid the usage of addc.
I think tmp_dst can also be avoided here to save
one tmp register.


On 2015年03月23日 15:44, Song, Ruiling wrote:
> Good idea, the patch LGTM.
>
>> -----Original Message-----
>> From: Beignet [mailto:beignet-bounces at lists.freedesktop.org] On Behalf Of
>> Yang Rong
>> Sent: Monday, March 23, 2015 2:26 PM
>> To: beignet at lists.freedesktop.org
>> Cc: Yang, Rong R
>> Subject: [Beignet] [PATCH] BDW: Refine I64HADD and I64RHADD.
>>
>> HADD is equal to (src0>>1) + (src1>>1) + ((src0&0x1) & (src1&0x1)), and
>> RHADD is equal to (src0>>1) + (src1>>1) + ((src0&0x1) | (src1&0x1)).
>>
>> Signed-off-by: Yang Rong <rong.r.yang at intel.com>
>> ---
>>   backend/src/backend/gen8_context.cpp       | 114
>> ++++-------------------------
>>   backend/src/backend/gen_insn_selection.cpp |   8 +-
>>   2 files changed, 20 insertions(+), 102 deletions(-)
>>
>> diff --git a/backend/src/backend/gen8_context.cpp
>> b/backend/src/backend/gen8_context.cpp
>> index 3f57cf6..b136902 100644
>> --- a/backend/src/backend/gen8_context.cpp
>> +++ b/backend/src/backend/gen8_context.cpp
>> @@ -651,58 +651,21 @@ namespace gbe
>>       GenRegister tmp0 = ra->genReg(insn.dst(1));
>>       GenRegister tmp1 = ra->genReg(insn.dst(2));
>>       GenRegister tmp_dst = ra->genReg(insn.dst(3));
>> -    int execWidth = p->curr.execWidth;
>>
>>       /* Src0 and Src1 are always unsigned long type.*/
>>       GBE_ASSERT(src0.type == GEN_TYPE_UL && src1.type ==
>> GEN_TYPE_UL);
>>       dst.type = src0.type;
>> -    tmp0.type = tmp1.type = GEN_TYPE_UD;
>> +    tmp0.type = tmp1.type = GEN_TYPE_UL;
>>       tmp_dst.type = GEN_TYPE_UL;
>>
>>       GBE_ASSERT(tmp_dst.subnr == 0);
>> -    GenRegister dl = tmp_dst.hstride == GEN_HORIZONTAL_STRIDE_0 ?
>> GenRegister::retype(tmp_dst, GEN_TYPE_UD) :
>> -      GenRegister::retype(GenRegister::ud16grf(tmp_dst.nr,
>> tmp_dst.subnr), GEN_TYPE_UD);
>> -    GenRegister dh = tmp_dst.hstride == GEN_HORIZONTAL_STRIDE_0 ?
>> -      GenRegister::retype(GenRegister::offset(tmp_dst, 0, 4),
>> GEN_TYPE_UD) :
>> -      GenRegister::retype(GenRegister::ud16grf(tmp_dst.nr + execWidth /
>> 8, tmp_dst.subnr), GEN_TYPE_UD);
>> -    GenRegister s0l = src0.hstride == GEN_HORIZONTAL_STRIDE_0 ?
>> -      GenRegister::retype(src0, GEN_TYPE_UD) :
>> GenRegister::unpacked_ud(src0.nr, src0.subnr);
>> -    GenRegister s0h = src0.hstride == GEN_HORIZONTAL_STRIDE_0 ?
>> -      GenRegister::retype(GenRegister::offset(src0, 0, 4),
>> GEN_TYPE_UD) :
>> -      GenRegister::unpacked_ud(src0.nr, src0.subnr + 1);
>> -    GenRegister s1l = src1.hstride == GEN_HORIZONTAL_STRIDE_0 ?
>> -      GenRegister::retype(src1, GEN_TYPE_UD) :
>> GenRegister::unpacked_ud(src1.nr, src1.subnr);
>> -    GenRegister s1h = src1.hstride == GEN_HORIZONTAL_STRIDE_0 ?
>> -      GenRegister::retype(GenRegister::offset(src1, 0, 4),
>> GEN_TYPE_UD) :
>> -      GenRegister::unpacked_ud(src1.nr, src1.subnr + 1);
>> -
>> -    GenRegister acc0 = GenRegister::retype(GenRegister::acc(),
>> GEN_TYPE_D);
>> -    p->push();
>> -    p->curr.execWidth = 8;
>> -    p->ADDC(dl, s0l, s1l);
>> -    p->MOV(tmp0, acc0);
>> -    p->ADDC(dh, s0h, s1h);
>> -    p->MOV(tmp1, acc0);
>> -    p->ADDC(dh, dh, tmp0);
>> -    p->MOV(tmp0, acc0);
>> -    p->ADD(tmp1, tmp0, tmp1);
>> -
>> -    if (execWidth == 16) {
>> -      p->curr.quarterControl = 1;
>> -      p->ADDC(GenRegister::Qn(dl, 1), GenRegister::Qn(s0l, 1),
>> GenRegister::Qn(s1l, 1));
>> -      p->MOV(GenRegister::Qn(tmp0, 1), acc0);
>> -      p->ADDC(GenRegister::Qn(dh, 1), GenRegister::Qn(s0h, 1),
>> GenRegister::Qn(s1h, 1));
>> -      p->MOV(GenRegister::Qn(tmp1, 1), acc0);
>> -      p->ADDC(GenRegister::Qn(dh, 1), GenRegister::Qn(dh, 1),
>> GenRegister::Qn(tmp0, 1));
>> -      p->MOV(GenRegister::Qn(tmp0, 1), acc0);
>> -      p->ADD(GenRegister::Qn(tmp1, 1), GenRegister::Qn(tmp0, 1),
>> GenRegister::Qn(tmp1, 1));
>> -    }
>> -    p->pop();
>> -
>> -    packLongVec(GenRegister::retype(tmp_dst, GEN_TYPE_UD),
>> GenRegister::retype(dst, GEN_TYPE_UD), execWidth);
>> -
>> -    p->SHR(dst, dst, GenRegister::immud(1));
>> -    p->SHL(tmp_dst, tmp1, GenRegister::immud(63));
>> +    //hadd = (src0>>1) + (src1>>1) + ((src0&0x1) & (src1&0x1))
>> +    p->AND(tmp0, src0, GenRegister::immud(1));
>> +    p->AND(tmp1, src1, GenRegister::immud(1));
>> +    p->AND(tmp_dst, tmp0, tmp1);
>> +    p->SHR(tmp0, src0, GenRegister::immud(1));
>> +    p->SHR(tmp1, src1, GenRegister::immud(1));
>> +    p->ADD(dst, tmp0, tmp1);
>>       p->ADD(dst, dst, tmp_dst);
>>     }
>>
>> @@ -714,66 +677,21 @@ namespace gbe
>>       GenRegister tmp0 = ra->genReg(insn.dst(1));
>>       GenRegister tmp1 = ra->genReg(insn.dst(2));
>>       GenRegister tmp_dst = ra->genReg(insn.dst(3));
>> -    int execWidth = p->curr.execWidth;
>>
>>       /* Src0 and Src1 are always unsigned long type.*/
>>       GBE_ASSERT(src0.type == GEN_TYPE_UL && src1.type ==
>> GEN_TYPE_UL);
>>       dst.type = src0.type;
>> -    tmp0.type = tmp1.type = GEN_TYPE_UD;
>> +    tmp0.type = tmp1.type = GEN_TYPE_UL;
>>       tmp_dst.type = GEN_TYPE_UL;
>>
>>       GBE_ASSERT(tmp_dst.subnr == 0);
>> -    GenRegister dl = tmp_dst.hstride == GEN_HORIZONTAL_STRIDE_0 ?
>> GenRegister::retype(tmp_dst, GEN_TYPE_UD) :
>> -      GenRegister::retype(GenRegister::ud16grf(tmp_dst.nr,
>> tmp_dst.subnr), GEN_TYPE_UD);
>> -    GenRegister dh = tmp_dst.hstride == GEN_HORIZONTAL_STRIDE_0 ?
>> -      GenRegister::retype(GenRegister::offset(tmp_dst, 0, 4),
>> GEN_TYPE_UD) :
>> -      GenRegister::retype(GenRegister::ud16grf(tmp_dst.nr + execWidth /
>> 8, tmp_dst.subnr), GEN_TYPE_UD);
>> -    GenRegister s0l = src0.hstride == GEN_HORIZONTAL_STRIDE_0 ?
>> -      GenRegister::retype(src0, GEN_TYPE_UD) :
>> GenRegister::unpacked_ud(src0.nr, src0.subnr);
>> -    GenRegister s0h = src0.hstride == GEN_HORIZONTAL_STRIDE_0 ?
>> -      GenRegister::retype(GenRegister::offset(src0, 0, 4),
>> GEN_TYPE_UD) :
>> -      GenRegister::unpacked_ud(src0.nr, src0.subnr + 1);
>> -    GenRegister s1l = src1.hstride == GEN_HORIZONTAL_STRIDE_0 ?
>> -      GenRegister::retype(src1, GEN_TYPE_UD) :
>> GenRegister::unpacked_ud(src1.nr, src1.subnr);
>> -    GenRegister s1h = src1.hstride == GEN_HORIZONTAL_STRIDE_0 ?
>> -      GenRegister::retype(GenRegister::offset(src1, 0, 4),
>> GEN_TYPE_UD) :
>> -      GenRegister::unpacked_ud(src1.nr, src1.subnr + 1);
>> -
>> -    GenRegister acc0 = GenRegister::retype(GenRegister::acc(),
>> GEN_TYPE_D);
>> -    p->push();
>> -    p->curr.execWidth = 8;
>> -    p->ADDC(dl, s0l, s1l);
>> -    p->MOV(tmp0, acc0);
>> -    p->ADDC(dl, dl, GenRegister::immud(1));
>> -    p->MOV(tmp1, acc0);
>> -    p->ADD(tmp0, tmp0, tmp1);
>> -
>> -    p->ADDC(dh, s0h, s1h);
>> -    p->MOV(tmp1, acc0);
>> -    p->ADDC(dh, dh, tmp0);
>> -    p->MOV(tmp0, acc0);
>> -    p->ADD(tmp1, tmp0, tmp1);
>> -
>> -    if (execWidth == 16) {
>> -      p->curr.quarterControl = 1;
>> -      p->ADDC(GenRegister::Qn(dl, 1), GenRegister::Qn(s0l, 1),
>> GenRegister::Qn(s1l, 1));
>> -      p->MOV(GenRegister::Qn(tmp0, 1), acc0);
>> -      p->ADDC(GenRegister::Qn(dl, 1), GenRegister::Qn(dl, 1),
>> GenRegister::immud(1));
>> -      p->MOV(GenRegister::Qn(tmp1, 1), acc0);
>> -      p->ADD(GenRegister::Qn(tmp0, 1), GenRegister::Qn(tmp0, 1),
>> GenRegister::Qn(tmp1, 1));
>> -
>> -      p->ADDC(GenRegister::Qn(dh, 1), GenRegister::Qn(s0h, 1),
>> GenRegister::Qn(s1h, 1));
>> -      p->MOV(GenRegister::Qn(tmp1, 1), acc0);
>> -      p->ADDC(GenRegister::Qn(dh, 1), GenRegister::Qn(dh, 1),
>> GenRegister::Qn(tmp0, 1));
>> -      p->MOV(GenRegister::Qn(tmp0, 1), acc0);
>> -      p->ADD(GenRegister::Qn(tmp1, 1), GenRegister::Qn(tmp0, 1),
>> GenRegister::Qn(tmp1, 1));
>> -    }
>> -    p->pop();
>> -
>> -    packLongVec(GenRegister::retype(tmp_dst, GEN_TYPE_UD),
>> GenRegister::retype(dst, GEN_TYPE_UD), execWidth);
>> -
>> -    p->SHR(dst, dst, GenRegister::immud(1));
>> -    p->SHL(tmp_dst, tmp1, GenRegister::immud(63));
>> +    //rhadd = (src0>>1) + (src1>>1) + ((src0&0x1) | (src1&0x1))
>> +    p->AND(tmp0, src0, GenRegister::immud(1));
>> +    p->AND(tmp1, src1, GenRegister::immud(1));
>> +    p->OR(tmp_dst, tmp0, tmp1);
>> +    p->SHR(tmp0, src0, GenRegister::immud(1));
>> +    p->SHR(tmp1, src1, GenRegister::immud(1));
>> +    p->ADD(dst, tmp0, tmp1);
>>       p->ADD(dst, dst, tmp_dst);
>>     }
>>
>> diff --git a/backend/src/backend/gen_insn_selection.cpp
>> b/backend/src/backend/gen_insn_selection.cpp
>> index c240261..c00cda5 100644
>> --- a/backend/src/backend/gen_insn_selection.cpp
>> +++ b/backend/src/backend/gen_insn_selection.cpp
>> @@ -2516,8 +2516,8 @@ namespace gbe
>>                   tmp[i] = sel.selReg(sel.reg(FAMILY_DWORD));
>>                 sel.I64HADD(dst, src0, src1, tmp, 4);
>>               } else {
>> -              tmp[0] = sel.selReg(sel.reg(FAMILY_DWORD),
>> ir::TYPE_U64);
>> -              tmp[1] = sel.selReg(sel.reg(FAMILY_DWORD),
>> ir::TYPE_U64);
>> +              tmp[0] = sel.selReg(sel.reg(FAMILY_QWORD),
>> ir::TYPE_U64);
>> +              tmp[1] = sel.selReg(sel.reg(FAMILY_QWORD),
>> ir::TYPE_U64);
>>                 tmp[2] = sel.selReg(sel.reg(FAMILY_QWORD),
>> ir::TYPE_U64);
>>                 sel.I64HADD(dst, src0, src1, tmp, 3);
>>               }
>> @@ -2531,8 +2531,8 @@ namespace gbe
>>                   tmp[i] = sel.selReg(sel.reg(FAMILY_DWORD));
>>                 sel.I64RHADD(dst, src0, src1, tmp, 4);
>>               } else {
>> -              tmp[0] = sel.selReg(sel.reg(FAMILY_DWORD),
>> ir::TYPE_U64);
>> -              tmp[1] = sel.selReg(sel.reg(FAMILY_DWORD),
>> ir::TYPE_U64);
>> +              tmp[0] = sel.selReg(sel.reg(FAMILY_QWORD),
>> ir::TYPE_U64);
>> +              tmp[1] = sel.selReg(sel.reg(FAMILY_QWORD),
>> ir::TYPE_U64);
>>                 tmp[2] = sel.selReg(sel.reg(FAMILY_QWORD),
>> ir::TYPE_U64);
>>                 sel.I64RHADD(dst, src0, src1, tmp, 3);
>>               }
>> --
>> 2.1.0
>>
>> _______________________________________________
>> Beignet mailing list
>> Beignet at lists.freedesktop.org
>> http://lists.freedesktop.org/mailman/listinfo/beignet
> _______________________________________________
> Beignet mailing list
> Beignet at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/beignet


From junyan.he at inbox.com  Mon Mar 23 02:02:43 2015
From: junyan.he at inbox.com (junyan.he at inbox.com)
Date: Mon, 23 Mar 2015 17:02:43 +0800
Subject: [Beignet] =?utf-8?q?=5BPATCH_OpenCL_2=2E0=5D_libocl=3A_Add_the_mo?=
	=?utf-8?q?dule_for_work=5Fgroup_functions=2E?=
Message-ID: <1427101363-25623-1-git-send-email-junyan.he@inbox.com>

From: Junyan He <junyan.he at linux.intel.com>

Signed-off-by: Junyan He <junyan.he at linux.intel.com>
---
 backend/src/libocl/CMakeLists.txt           |    2 +-
 backend/src/libocl/include/ocl_work_group.h |  118 +++++++++++++++++++++++++
 backend/src/libocl/src/ocl_work_group.cl    |  126 +++++++++++++++++++++++++++
 3 files changed, 245 insertions(+), 1 deletion(-)
 create mode 100644 backend/src/libocl/include/ocl_work_group.h
 create mode 100644 backend/src/libocl/src/ocl_work_group.cl

diff --git a/backend/src/libocl/CMakeLists.txt b/backend/src/libocl/CMakeLists.txt
index 16f00ee..4601c54 100644
--- a/backend/src/libocl/CMakeLists.txt
+++ b/backend/src/libocl/CMakeLists.txt
@@ -52,7 +52,7 @@ FOREACH(M ${OCL_COPY_HEADERS})
     COPY_THE_HEADER(${M})
 ENDFOREACH(M) 
 
-SET (OCL_COPY_MODULES ocl_workitem ocl_atom ocl_async ocl_sync ocl_misc ocl_vload ocl_geometric ocl_image)
+SET (OCL_COPY_MODULES ocl_workitem ocl_atom ocl_async ocl_sync ocl_misc ocl_vload ocl_geometric ocl_image ocl_work_group)
 FOREACH(M ${OCL_COPY_MODULES})
     COPY_THE_HEADER(${M})
     COPY_THE_SOURCE(${M})
diff --git a/backend/src/libocl/include/ocl_work_group.h b/backend/src/libocl/include/ocl_work_group.h
new file mode 100644
index 0000000..ebd264f
--- /dev/null
+++ b/backend/src/libocl/include/ocl_work_group.h
@@ -0,0 +1,118 @@
+/*
+ * Copyright © 2012 - 2014 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+#ifndef __OCL_WORK_GROUP_H__
+#define __OCL_WORK_GROUP_H__
+#include "ocl_types.h"
+
+int work_group_all(int predicate);
+int work_group_any(int predicate);
+
+/* broadcast */
+OVERLOADABLE int work_group_broadcast(int a, size_t local_id);
+OVERLOADABLE uint work_group_broadcast(uint a, size_t local_id);
+OVERLOADABLE long work_group_broadcast(long a, size_t local_id);
+OVERLOADABLE ulong work_group_broadcast(ulong a, size_t local_id);
+OVERLOADABLE float work_group_broadcast(float a, size_t local_id);
+OVERLOADABLE double work_group_broadcast(double a, size_t local_id);
+
+OVERLOADABLE int work_group_broadcast(int a, size_t local_id_x, size_t local_id_y);
+OVERLOADABLE uint work_group_broadcast(uint a, size_t local_id_x, size_t local_id_y);
+OVERLOADABLE long work_group_broadcast(long a, size_t local_id_x, size_t local_id_y);
+OVERLOADABLE ulong work_group_broadcast(ulong a, size_t local_id_x, size_t local_id_y);
+OVERLOADABLE float work_group_broadcast(float a, size_t local_id_x, size_t local_id_y);
+OVERLOADABLE double work_group_broadcast(double a, size_t local_id_x, size_t local_id_y);
+
+OVERLOADABLE int work_group_broadcast(int a, size_t local_id_x, size_t local_id_y, size_t local_id_z);
+OVERLOADABLE uint work_group_broadcast(uint a, size_t local_id_x, size_t local_id_y, size_t local_id_z);
+OVERLOADABLE long work_group_broadcast(long a, size_t local_id_x, size_t local_id_y, size_t local_id_z);
+OVERLOADABLE ulong work_group_broadcast(ulong a, size_t local_id_x, size_t local_id_y, size_t local_id_z);
+OVERLOADABLE float work_group_broadcast(float a, size_t local_id_x, size_t local_id_y, size_t local_id_z);
+OVERLOADABLE double work_group_broadcast(double a, size_t local_id_x, size_t local_id_y, size_t local_id_z);
+
+/* reduce add */
+OVERLOADABLE int work_group_reduce_add(int x);
+OVERLOADABLE uint work_group_reduce_add(uint x);
+OVERLOADABLE long work_group_reduce_add(long x);
+OVERLOADABLE ulong work_group_reduce_add(ulong x);
+OVERLOADABLE float work_group_reduce_add(float x);
+OVERLOADABLE double work_group_reduce_add(double x);
+
+/* reduce min */
+OVERLOADABLE int work_group_reduce_min(int x);
+OVERLOADABLE uint work_group_reduce_min(uint x);
+OVERLOADABLE long work_group_reduce_min(long x);
+OVERLOADABLE ulong work_group_reduce_min(ulong x);
+OVERLOADABLE float work_group_reduce_min(float x);
+OVERLOADABLE double work_group_reduce_min(double x);
+
+/* reduce max */
+OVERLOADABLE int work_group_reduce_max(int x);
+OVERLOADABLE uint work_group_reduce_max(uint x);
+OVERLOADABLE long work_group_reduce_max(long x);
+OVERLOADABLE ulong work_group_reduce_max(ulong x);
+OVERLOADABLE float work_group_reduce_max(float x);
+OVERLOADABLE double work_group_reduce_max(double x);
+
+/* scan_inclusive add */
+OVERLOADABLE int work_group_scan_inclusive_add(int x);
+OVERLOADABLE uint work_group_scan_inclusive_add(uint x);
+OVERLOADABLE long work_group_scan_inclusive_add(long x);
+OVERLOADABLE ulong work_group_scan_inclusive_add(ulong x);
+OVERLOADABLE float work_group_scan_inclusive_add(float x);
+OVERLOADABLE double work_group_scan_inclusive_add(double x);
+
+/* scan_inclusive min */
+OVERLOADABLE int work_group_scan_inclusive_min(int x);
+OVERLOADABLE uint work_group_scan_inclusive_min(uint x);
+OVERLOADABLE long work_group_scan_inclusive_min(long x);
+OVERLOADABLE ulong work_group_scan_inclusive_min(ulong x);
+OVERLOADABLE float work_group_scan_inclusive_min(float x);
+OVERLOADABLE double work_group_scan_inclusive_min(double x);
+
+/* scan_inclusive max */
+OVERLOADABLE int work_group_scan_inclusive_max(int x);
+OVERLOADABLE uint work_group_scan_inclusive_max(uint x);
+OVERLOADABLE long work_group_scan_inclusive_max(long x);
+OVERLOADABLE ulong work_group_scan_inclusive_max(ulong x);
+OVERLOADABLE float work_group_scan_inclusive_max(float x);
+OVERLOADABLE double work_group_scan_inclusive_max(double x);
+
+/* scan_exclusive add */
+OVERLOADABLE int work_group_scan_exclusive_add(int x);
+OVERLOADABLE uint work_group_scan_exclusive_add(uint x);
+OVERLOADABLE long work_group_scan_exclusive_add(long x);
+OVERLOADABLE ulong work_group_scan_exclusive_add(ulong x);
+OVERLOADABLE float work_group_scan_exclusive_add(float x);
+OVERLOADABLE double work_group_scan_exclusive_add(double x);
+
+/* scan_exclusive min */
+OVERLOADABLE int work_group_scan_exclusive_min(int x);
+OVERLOADABLE uint work_group_scan_exclusive_min(uint x);
+OVERLOADABLE long work_group_scan_exclusive_min(long x);
+OVERLOADABLE ulong work_group_scan_exclusive_min(ulong x);
+OVERLOADABLE float work_group_scan_exclusive_min(float x);
+OVERLOADABLE double work_group_scan_exclusive_min(double x);
+
+/* scan_exclusive max */
+OVERLOADABLE int work_group_scan_exclusive_max(int x);
+OVERLOADABLE uint work_group_scan_exclusive_max(uint x);
+OVERLOADABLE long work_group_scan_exclusive_max(long x);
+OVERLOADABLE ulong work_group_scan_exclusive_max(ulong x);
+OVERLOADABLE float work_group_scan_exclusive_max(float x);
+OVERLOADABLE double work_group_scan_exclusive_max(double x);
+#endif  /* __OCL_WORK_GROUP_H__ */
diff --git a/backend/src/libocl/src/ocl_work_group.cl b/backend/src/libocl/src/ocl_work_group.cl
new file mode 100644
index 0000000..065b223
--- /dev/null
+++ b/backend/src/libocl/src/ocl_work_group.cl
@@ -0,0 +1,126 @@
+/*
+ * Copyright © 2012 - 2014 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+#include "ocl_work_group.h"
+
+int __gen_ocl_work_group_all(int predicate);
+int work_group_all(int predicate) {
+  return __gen_ocl_work_group_all(predicate);
+}
+
+int __gen_ocl_work_group_any(int predicate);
+int work_group_any(int predicate) {
+  return __gen_ocl_work_group_any(predicate);
+}
+
+/* broadcast */
+#define BROADCAST_IMPL(GEN_TYPE) \
+    OVERLOADABLE GEN_TYPE __gen_ocl_work_group_broadcast(GEN_TYPE a, size_t local_id); \
+    OVERLOADABLE GEN_TYPE work_group_broadcast(GEN_TYPE a, size_t local_id) { \
+      return __gen_ocl_work_group_broadcast(a, local_id); \
+    } \
+    OVERLOADABLE GEN_TYPE __gen_ocl_work_group_broadcast(GEN_TYPE a, size_t local_id_x, size_t local_id_y); \
+    OVERLOADABLE GEN_TYPE work_group_broadcast(GEN_TYPE a, size_t local_id_x, size_t local_id_y) { \
+      return __gen_ocl_work_group_broadcast(a, local_id_x, local_id_y);  \
+    } \
+    OVERLOADABLE GEN_TYPE __gen_ocl_work_group_broadcast(GEN_TYPE a, size_t local_id_x, size_t local_id_y, size_t local_id_z); \
+    OVERLOADABLE GEN_TYPE work_group_broadcast(GEN_TYPE a, size_t local_id_x, size_t local_id_y, size_t local_id_z) { \
+      return __gen_ocl_work_group_broadcast(a, local_id_x, local_id_y, local_id_z); \
+    }
+
+BROADCAST_IMPL(int)
+BROADCAST_IMPL(uint)
+BROADCAST_IMPL(long)
+BROADCAST_IMPL(ulong)
+BROADCAST_IMPL(float)
+BROADCAST_IMPL(double)
+#undef BROADCAST_IMPL
+
+
+#define RANGE_OP(RANGE, OP, GEN_TYPE) \
+    OVERLOADABLE GEN_TYPE __gen_ocl_work_group_##RANGE##_##OP(GEN_TYPE x); \
+    OVERLOADABLE GEN_TYPE work_group_##RANGE##_##OP(GEN_TYPE x) { \
+      return __gen_ocl_work_group_##RANGE##_##OP(x);  \
+    }
+
+/* reduce add */
+RANGE_OP(reduce, add, int)
+RANGE_OP(reduce, add, uint)
+RANGE_OP(reduce, add, long)
+RANGE_OP(reduce, add, ulong)
+RANGE_OP(reduce, add, float)
+RANGE_OP(reduce, add, double)
+/* reduce min */
+RANGE_OP(reduce, min, int)
+RANGE_OP(reduce, min, uint)
+RANGE_OP(reduce, min, long)
+RANGE_OP(reduce, min, ulong)
+RANGE_OP(reduce, min, float)
+RANGE_OP(reduce, min, double)
+/* reduce max */
+RANGE_OP(reduce, max, int)
+RANGE_OP(reduce, max, uint)
+RANGE_OP(reduce, max, long)
+RANGE_OP(reduce, max, ulong)
+RANGE_OP(reduce, max, float)
+RANGE_OP(reduce, max, double)
+
+/* scan_inclusive add */
+RANGE_OP(scan_inclusive, add, int)
+RANGE_OP(scan_inclusive, add, uint)
+RANGE_OP(scan_inclusive, add, long)
+RANGE_OP(scan_inclusive, add, ulong)
+RANGE_OP(scan_inclusive, add, float)
+RANGE_OP(scan_inclusive, add, double)
+/* scan_inclusive min */
+RANGE_OP(scan_inclusive, min, int)
+RANGE_OP(scan_inclusive, min, uint)
+RANGE_OP(scan_inclusive, min, long)
+RANGE_OP(scan_inclusive, min, ulong)
+RANGE_OP(scan_inclusive, min, float)
+RANGE_OP(scan_inclusive, min, double)
+/* scan_inclusive max */
+RANGE_OP(scan_inclusive, max, int)
+RANGE_OP(scan_inclusive, max, uint)
+RANGE_OP(scan_inclusive, max, long)
+RANGE_OP(scan_inclusive, max, ulong)
+RANGE_OP(scan_inclusive, max, float)
+RANGE_OP(scan_inclusive, max, double)
+
+/* scan_exclusive add */
+RANGE_OP(scan_exclusive, add, int)
+RANGE_OP(scan_exclusive, add, uint)
+RANGE_OP(scan_exclusive, add, long)
+RANGE_OP(scan_exclusive, add, ulong)
+RANGE_OP(scan_exclusive, add, float)
+RANGE_OP(scan_exclusive, add, double)
+/* scan_exclusive min */
+RANGE_OP(scan_exclusive, min, int)
+RANGE_OP(scan_exclusive, min, uint)
+RANGE_OP(scan_exclusive, min, long)
+RANGE_OP(scan_exclusive, min, ulong)
+RANGE_OP(scan_exclusive, min, float)
+RANGE_OP(scan_exclusive, min, double)
+/* scan_exclusive max */
+RANGE_OP(scan_exclusive, max, int)
+RANGE_OP(scan_exclusive, max, uint)
+RANGE_OP(scan_exclusive, max, long)
+RANGE_OP(scan_exclusive, max, ulong)
+RANGE_OP(scan_exclusive, max, float)
+RANGE_OP(scan_exclusive, max, double)
+
+#undef RANGE_OP
-- 
1.7.9.5


From rong.r.yang at intel.com  Mon Mar 23 02:09:44 2015
From: rong.r.yang at intel.com (Yang Rong)
Date: Mon, 23 Mar 2015 17:09:44 +0800
Subject: [Beignet] [Patch V2] BDW: Refine I64HADD and I64RHADD.
Message-ID: <1427101784-3152-1-git-send-email-rong.r.yang@intel.com>

HADD is equal to (src0>>1) + (src1>>1) + ((src0&0x1) & (src1&0x1)),
and RHADD is equal to (src0>>1) + (src1>>1) + ((src0&0x1) | (src1&0x1)).

V2: Save 1 register for both, and 1 AND for I64HADD.
Signed-off-by: Yang Rong <rong.r.yang at intel.com>
---
 backend/src/backend/gen8_context.cpp       | 123 ++++-------------------------
 backend/src/backend/gen_insn_selection.cpp |  14 ++--
 2 files changed, 23 insertions(+), 114 deletions(-)

diff --git a/backend/src/backend/gen8_context.cpp b/backend/src/backend/gen8_context.cpp
index 3f57cf6..3cdf62e 100644
--- a/backend/src/backend/gen8_context.cpp
+++ b/backend/src/backend/gen8_context.cpp
@@ -650,60 +650,19 @@ namespace gbe
     GenRegister dst = ra->genReg(insn.dst(0));
     GenRegister tmp0 = ra->genReg(insn.dst(1));
     GenRegister tmp1 = ra->genReg(insn.dst(2));
-    GenRegister tmp_dst = ra->genReg(insn.dst(3));
-    int execWidth = p->curr.execWidth;
 
     /* Src0 and Src1 are always unsigned long type.*/
     GBE_ASSERT(src0.type == GEN_TYPE_UL && src1.type == GEN_TYPE_UL);
     dst.type = src0.type;
-    tmp0.type = tmp1.type = GEN_TYPE_UD;
-    tmp_dst.type = GEN_TYPE_UL;
-
-    GBE_ASSERT(tmp_dst.subnr == 0);
-    GenRegister dl = tmp_dst.hstride == GEN_HORIZONTAL_STRIDE_0 ? GenRegister::retype(tmp_dst, GEN_TYPE_UD) :
-      GenRegister::retype(GenRegister::ud16grf(tmp_dst.nr, tmp_dst.subnr), GEN_TYPE_UD);
-    GenRegister dh = tmp_dst.hstride == GEN_HORIZONTAL_STRIDE_0 ?
-      GenRegister::retype(GenRegister::offset(tmp_dst, 0, 4), GEN_TYPE_UD) :
-      GenRegister::retype(GenRegister::ud16grf(tmp_dst.nr + execWidth / 8, tmp_dst.subnr), GEN_TYPE_UD);
-    GenRegister s0l = src0.hstride == GEN_HORIZONTAL_STRIDE_0 ?
-      GenRegister::retype(src0, GEN_TYPE_UD) : GenRegister::unpacked_ud(src0.nr, src0.subnr);
-    GenRegister s0h = src0.hstride == GEN_HORIZONTAL_STRIDE_0 ?
-      GenRegister::retype(GenRegister::offset(src0, 0, 4), GEN_TYPE_UD) :
-      GenRegister::unpacked_ud(src0.nr, src0.subnr + 1);
-    GenRegister s1l = src1.hstride == GEN_HORIZONTAL_STRIDE_0 ?
-      GenRegister::retype(src1, GEN_TYPE_UD) : GenRegister::unpacked_ud(src1.nr, src1.subnr);
-    GenRegister s1h = src1.hstride == GEN_HORIZONTAL_STRIDE_0 ?
-      GenRegister::retype(GenRegister::offset(src1, 0, 4), GEN_TYPE_UD) :
-      GenRegister::unpacked_ud(src1.nr, src1.subnr + 1);
-
-    GenRegister acc0 = GenRegister::retype(GenRegister::acc(), GEN_TYPE_D);
-    p->push();
-    p->curr.execWidth = 8;
-    p->ADDC(dl, s0l, s1l);
-    p->MOV(tmp0, acc0);
-    p->ADDC(dh, s0h, s1h);
-    p->MOV(tmp1, acc0);
-    p->ADDC(dh, dh, tmp0);
-    p->MOV(tmp0, acc0);
-    p->ADD(tmp1, tmp0, tmp1);
-
-    if (execWidth == 16) {
-      p->curr.quarterControl = 1;
-      p->ADDC(GenRegister::Qn(dl, 1), GenRegister::Qn(s0l, 1), GenRegister::Qn(s1l, 1));
-      p->MOV(GenRegister::Qn(tmp0, 1), acc0);
-      p->ADDC(GenRegister::Qn(dh, 1), GenRegister::Qn(s0h, 1), GenRegister::Qn(s1h, 1));
-      p->MOV(GenRegister::Qn(tmp1, 1), acc0);
-      p->ADDC(GenRegister::Qn(dh, 1), GenRegister::Qn(dh, 1), GenRegister::Qn(tmp0, 1));
-      p->MOV(GenRegister::Qn(tmp0, 1), acc0);
-      p->ADD(GenRegister::Qn(tmp1, 1), GenRegister::Qn(tmp0, 1), GenRegister::Qn(tmp1, 1));
-    }
-    p->pop();
-
-    packLongVec(GenRegister::retype(tmp_dst, GEN_TYPE_UD), GenRegister::retype(dst, GEN_TYPE_UD), execWidth);
+    tmp0.type = tmp1.type = GEN_TYPE_UL;
 
-    p->SHR(dst, dst, GenRegister::immud(1));
-    p->SHL(tmp_dst, tmp1, GenRegister::immud(63));
-    p->ADD(dst, dst, tmp_dst);
+    //hadd = (src0>>1) + (src1>>1) + ((src0&0x1) & (src1&0x1))
+    p->AND(tmp0, src0, GenRegister::immud(1));
+    p->AND(dst, src1, tmp0);
+    p->SHR(tmp0, src0, GenRegister::immud(1));
+    p->SHR(tmp1, src1, GenRegister::immud(1));
+    p->ADD(dst, dst, tmp0);
+    p->ADD(dst, dst, tmp1);
   }
 
   void Gen8Context::emitI64RHADDInstruction(const SelectionInstruction &insn)
@@ -713,68 +672,20 @@ namespace gbe
     GenRegister dst = ra->genReg(insn.dst(0));
     GenRegister tmp0 = ra->genReg(insn.dst(1));
     GenRegister tmp1 = ra->genReg(insn.dst(2));
-    GenRegister tmp_dst = ra->genReg(insn.dst(3));
-    int execWidth = p->curr.execWidth;
 
     /* Src0 and Src1 are always unsigned long type.*/
     GBE_ASSERT(src0.type == GEN_TYPE_UL && src1.type == GEN_TYPE_UL);
     dst.type = src0.type;
-    tmp0.type = tmp1.type = GEN_TYPE_UD;
-    tmp_dst.type = GEN_TYPE_UL;
-
-    GBE_ASSERT(tmp_dst.subnr == 0);
-    GenRegister dl = tmp_dst.hstride == GEN_HORIZONTAL_STRIDE_0 ? GenRegister::retype(tmp_dst, GEN_TYPE_UD) :
-      GenRegister::retype(GenRegister::ud16grf(tmp_dst.nr, tmp_dst.subnr), GEN_TYPE_UD);
-    GenRegister dh = tmp_dst.hstride == GEN_HORIZONTAL_STRIDE_0 ?
-      GenRegister::retype(GenRegister::offset(tmp_dst, 0, 4), GEN_TYPE_UD) :
-      GenRegister::retype(GenRegister::ud16grf(tmp_dst.nr + execWidth / 8, tmp_dst.subnr), GEN_TYPE_UD);
-    GenRegister s0l = src0.hstride == GEN_HORIZONTAL_STRIDE_0 ?
-      GenRegister::retype(src0, GEN_TYPE_UD) : GenRegister::unpacked_ud(src0.nr, src0.subnr);
-    GenRegister s0h = src0.hstride == GEN_HORIZONTAL_STRIDE_0 ?
-      GenRegister::retype(GenRegister::offset(src0, 0, 4), GEN_TYPE_UD) :
-      GenRegister::unpacked_ud(src0.nr, src0.subnr + 1);
-    GenRegister s1l = src1.hstride == GEN_HORIZONTAL_STRIDE_0 ?
-      GenRegister::retype(src1, GEN_TYPE_UD) : GenRegister::unpacked_ud(src1.nr, src1.subnr);
-    GenRegister s1h = src1.hstride == GEN_HORIZONTAL_STRIDE_0 ?
-      GenRegister::retype(GenRegister::offset(src1, 0, 4), GEN_TYPE_UD) :
-      GenRegister::unpacked_ud(src1.nr, src1.subnr + 1);
-
-    GenRegister acc0 = GenRegister::retype(GenRegister::acc(), GEN_TYPE_D);
-    p->push();
-    p->curr.execWidth = 8;
-    p->ADDC(dl, s0l, s1l);
-    p->MOV(tmp0, acc0);
-    p->ADDC(dl, dl, GenRegister::immud(1));
-    p->MOV(tmp1, acc0);
-    p->ADD(tmp0, tmp0, tmp1);
-
-    p->ADDC(dh, s0h, s1h);
-    p->MOV(tmp1, acc0);
-    p->ADDC(dh, dh, tmp0);
-    p->MOV(tmp0, acc0);
-    p->ADD(tmp1, tmp0, tmp1);
-
-    if (execWidth == 16) {
-      p->curr.quarterControl = 1;
-      p->ADDC(GenRegister::Qn(dl, 1), GenRegister::Qn(s0l, 1), GenRegister::Qn(s1l, 1));
-      p->MOV(GenRegister::Qn(tmp0, 1), acc0);
-      p->ADDC(GenRegister::Qn(dl, 1), GenRegister::Qn(dl, 1), GenRegister::immud(1));
-      p->MOV(GenRegister::Qn(tmp1, 1), acc0);
-      p->ADD(GenRegister::Qn(tmp0, 1), GenRegister::Qn(tmp0, 1), GenRegister::Qn(tmp1, 1));
-
-      p->ADDC(GenRegister::Qn(dh, 1), GenRegister::Qn(s0h, 1), GenRegister::Qn(s1h, 1));
-      p->MOV(GenRegister::Qn(tmp1, 1), acc0);
-      p->ADDC(GenRegister::Qn(dh, 1), GenRegister::Qn(dh, 1), GenRegister::Qn(tmp0, 1));
-      p->MOV(GenRegister::Qn(tmp0, 1), acc0);
-      p->ADD(GenRegister::Qn(tmp1, 1), GenRegister::Qn(tmp0, 1), GenRegister::Qn(tmp1, 1));
-    }
-    p->pop();
-
-    packLongVec(GenRegister::retype(tmp_dst, GEN_TYPE_UD), GenRegister::retype(dst, GEN_TYPE_UD), execWidth);
+    tmp0.type = tmp1.type = GEN_TYPE_UL;
 
-    p->SHR(dst, dst, GenRegister::immud(1));
-    p->SHL(tmp_dst, tmp1, GenRegister::immud(63));
-    p->ADD(dst, dst, tmp_dst);
+    //rhadd = (src0>>1) + (src1>>1) + ((src0&0x1) | (src1&0x1))
+    p->AND(tmp0, src0, GenRegister::immud(1));
+    p->AND(tmp1, src1, GenRegister::immud(1));
+    p->OR(dst, tmp0, tmp1);
+    p->SHR(tmp0, src0, GenRegister::immud(1));
+    p->SHR(tmp1, src1, GenRegister::immud(1));
+    p->ADD(dst, dst, tmp0);
+    p->ADD(dst, dst, tmp1);
   }
 
   void Gen8Context::emitI64DIVREMInstruction(const SelectionInstruction &cnst_insn)
diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp
index c240261..7f9c95a 100644
--- a/backend/src/backend/gen_insn_selection.cpp
+++ b/backend/src/backend/gen_insn_selection.cpp
@@ -2516,10 +2516,9 @@ namespace gbe
                 tmp[i] = sel.selReg(sel.reg(FAMILY_DWORD));
               sel.I64HADD(dst, src0, src1, tmp, 4);
             } else {
-              tmp[0] = sel.selReg(sel.reg(FAMILY_DWORD), ir::TYPE_U64);
-              tmp[1] = sel.selReg(sel.reg(FAMILY_DWORD), ir::TYPE_U64);
-              tmp[2] = sel.selReg(sel.reg(FAMILY_QWORD), ir::TYPE_U64);
-              sel.I64HADD(dst, src0, src1, tmp, 3);
+              tmp[0] = sel.selReg(sel.reg(FAMILY_QWORD), ir::TYPE_U64);
+              tmp[1] = sel.selReg(sel.reg(FAMILY_QWORD), ir::TYPE_U64);
+              sel.I64HADD(dst, src0, src1, tmp, 2);
             }
             break;
           }
@@ -2531,10 +2530,9 @@ namespace gbe
                 tmp[i] = sel.selReg(sel.reg(FAMILY_DWORD));
               sel.I64RHADD(dst, src0, src1, tmp, 4);
             } else {
-              tmp[0] = sel.selReg(sel.reg(FAMILY_DWORD), ir::TYPE_U64);
-              tmp[1] = sel.selReg(sel.reg(FAMILY_DWORD), ir::TYPE_U64);
-              tmp[2] = sel.selReg(sel.reg(FAMILY_QWORD), ir::TYPE_U64);
-              sel.I64RHADD(dst, src0, src1, tmp, 3);
+              tmp[0] = sel.selReg(sel.reg(FAMILY_QWORD), ir::TYPE_U64);
+              tmp[1] = sel.selReg(sel.reg(FAMILY_QWORD), ir::TYPE_U64);
+              sel.I64RHADD(dst, src0, src1, tmp, 2);
             }
             break;
           }
-- 
2.1.0


From Junyan.he at inbox.com  Mon Mar 23 02:46:34 2015
From: Junyan.he at inbox.com (He Junyan)
Date: Mon, 23 Mar 2015 17:46:34 +0800
Subject: [Beignet] [Patch V2] BDW: Refine I64HADD and I64RHADD.
In-Reply-To: <1427101784-3152-1-git-send-email-rong.r.yang@intel.com>
References: <1427101784-3152-1-git-send-email-rong.r.yang@intel.com>
Message-ID: <550FE0FA.4020301@inbox.com>

That's OK


On 2015年03月23日 17:09, Yang Rong wrote:
> HADD is equal to (src0>>1) + (src1>>1) + ((src0&0x1) & (src1&0x1)),
> and RHADD is equal to (src0>>1) + (src1>>1) + ((src0&0x1) | (src1&0x1)).
>
> V2: Save 1 register for both, and 1 AND for I64HADD.
> Signed-off-by: Yang Rong <rong.r.yang at intel.com>
> ---
>   backend/src/backend/gen8_context.cpp       | 123 ++++-------------------------
>   backend/src/backend/gen_insn_selection.cpp |  14 ++--
>   2 files changed, 23 insertions(+), 114 deletions(-)
>
> diff --git a/backend/src/backend/gen8_context.cpp b/backend/src/backend/gen8_context.cpp
> index 3f57cf6..3cdf62e 100644
> --- a/backend/src/backend/gen8_context.cpp
> +++ b/backend/src/backend/gen8_context.cpp
> @@ -650,60 +650,19 @@ namespace gbe
>       GenRegister dst = ra->genReg(insn.dst(0));
>       GenRegister tmp0 = ra->genReg(insn.dst(1));
>       GenRegister tmp1 = ra->genReg(insn.dst(2));
> -    GenRegister tmp_dst = ra->genReg(insn.dst(3));
> -    int execWidth = p->curr.execWidth;
>   
>       /* Src0 and Src1 are always unsigned long type.*/
>       GBE_ASSERT(src0.type == GEN_TYPE_UL && src1.type == GEN_TYPE_UL);
>       dst.type = src0.type;
> -    tmp0.type = tmp1.type = GEN_TYPE_UD;
> -    tmp_dst.type = GEN_TYPE_UL;
> -
> -    GBE_ASSERT(tmp_dst.subnr == 0);
> -    GenRegister dl = tmp_dst.hstride == GEN_HORIZONTAL_STRIDE_0 ? GenRegister::retype(tmp_dst, GEN_TYPE_UD) :
> -      GenRegister::retype(GenRegister::ud16grf(tmp_dst.nr, tmp_dst.subnr), GEN_TYPE_UD);
> -    GenRegister dh = tmp_dst.hstride == GEN_HORIZONTAL_STRIDE_0 ?
> -      GenRegister::retype(GenRegister::offset(tmp_dst, 0, 4), GEN_TYPE_UD) :
> -      GenRegister::retype(GenRegister::ud16grf(tmp_dst.nr + execWidth / 8, tmp_dst.subnr), GEN_TYPE_UD);
> -    GenRegister s0l = src0.hstride == GEN_HORIZONTAL_STRIDE_0 ?
> -      GenRegister::retype(src0, GEN_TYPE_UD) : GenRegister::unpacked_ud(src0.nr, src0.subnr);
> -    GenRegister s0h = src0.hstride == GEN_HORIZONTAL_STRIDE_0 ?
> -      GenRegister::retype(GenRegister::offset(src0, 0, 4), GEN_TYPE_UD) :
> -      GenRegister::unpacked_ud(src0.nr, src0.subnr + 1);
> -    GenRegister s1l = src1.hstride == GEN_HORIZONTAL_STRIDE_0 ?
> -      GenRegister::retype(src1, GEN_TYPE_UD) : GenRegister::unpacked_ud(src1.nr, src1.subnr);
> -    GenRegister s1h = src1.hstride == GEN_HORIZONTAL_STRIDE_0 ?
> -      GenRegister::retype(GenRegister::offset(src1, 0, 4), GEN_TYPE_UD) :
> -      GenRegister::unpacked_ud(src1.nr, src1.subnr + 1);
> -
> -    GenRegister acc0 = GenRegister::retype(GenRegister::acc(), GEN_TYPE_D);
> -    p->push();
> -    p->curr.execWidth = 8;
> -    p->ADDC(dl, s0l, s1l);
> -    p->MOV(tmp0, acc0);
> -    p->ADDC(dh, s0h, s1h);
> -    p->MOV(tmp1, acc0);
> -    p->ADDC(dh, dh, tmp0);
> -    p->MOV(tmp0, acc0);
> -    p->ADD(tmp1, tmp0, tmp1);
> -
> -    if (execWidth == 16) {
> -      p->curr.quarterControl = 1;
> -      p->ADDC(GenRegister::Qn(dl, 1), GenRegister::Qn(s0l, 1), GenRegister::Qn(s1l, 1));
> -      p->MOV(GenRegister::Qn(tmp0, 1), acc0);
> -      p->ADDC(GenRegister::Qn(dh, 1), GenRegister::Qn(s0h, 1), GenRegister::Qn(s1h, 1));
> -      p->MOV(GenRegister::Qn(tmp1, 1), acc0);
> -      p->ADDC(GenRegister::Qn(dh, 1), GenRegister::Qn(dh, 1), GenRegister::Qn(tmp0, 1));
> -      p->MOV(GenRegister::Qn(tmp0, 1), acc0);
> -      p->ADD(GenRegister::Qn(tmp1, 1), GenRegister::Qn(tmp0, 1), GenRegister::Qn(tmp1, 1));
> -    }
> -    p->pop();
> -
> -    packLongVec(GenRegister::retype(tmp_dst, GEN_TYPE_UD), GenRegister::retype(dst, GEN_TYPE_UD), execWidth);
> +    tmp0.type = tmp1.type = GEN_TYPE_UL;
>   
> -    p->SHR(dst, dst, GenRegister::immud(1));
> -    p->SHL(tmp_dst, tmp1, GenRegister::immud(63));
> -    p->ADD(dst, dst, tmp_dst);
> +    //hadd = (src0>>1) + (src1>>1) + ((src0&0x1) & (src1&0x1))
> +    p->AND(tmp0, src0, GenRegister::immud(1));
> +    p->AND(dst, src1, tmp0);
> +    p->SHR(tmp0, src0, GenRegister::immud(1));
> +    p->SHR(tmp1, src1, GenRegister::immud(1));
> +    p->ADD(dst, dst, tmp0);
> +    p->ADD(dst, dst, tmp1);
>     }
>   
>     void Gen8Context::emitI64RHADDInstruction(const SelectionInstruction &insn)
> @@ -713,68 +672,20 @@ namespace gbe
>       GenRegister dst = ra->genReg(insn.dst(0));
>       GenRegister tmp0 = ra->genReg(insn.dst(1));
>       GenRegister tmp1 = ra->genReg(insn.dst(2));
> -    GenRegister tmp_dst = ra->genReg(insn.dst(3));
> -    int execWidth = p->curr.execWidth;
>   
>       /* Src0 and Src1 are always unsigned long type.*/
>       GBE_ASSERT(src0.type == GEN_TYPE_UL && src1.type == GEN_TYPE_UL);
>       dst.type = src0.type;
> -    tmp0.type = tmp1.type = GEN_TYPE_UD;
> -    tmp_dst.type = GEN_TYPE_UL;
> -
> -    GBE_ASSERT(tmp_dst.subnr == 0);
> -    GenRegister dl = tmp_dst.hstride == GEN_HORIZONTAL_STRIDE_0 ? GenRegister::retype(tmp_dst, GEN_TYPE_UD) :
> -      GenRegister::retype(GenRegister::ud16grf(tmp_dst.nr, tmp_dst.subnr), GEN_TYPE_UD);
> -    GenRegister dh = tmp_dst.hstride == GEN_HORIZONTAL_STRIDE_0 ?
> -      GenRegister::retype(GenRegister::offset(tmp_dst, 0, 4), GEN_TYPE_UD) :
> -      GenRegister::retype(GenRegister::ud16grf(tmp_dst.nr + execWidth / 8, tmp_dst.subnr), GEN_TYPE_UD);
> -    GenRegister s0l = src0.hstride == GEN_HORIZONTAL_STRIDE_0 ?
> -      GenRegister::retype(src0, GEN_TYPE_UD) : GenRegister::unpacked_ud(src0.nr, src0.subnr);
> -    GenRegister s0h = src0.hstride == GEN_HORIZONTAL_STRIDE_0 ?
> -      GenRegister::retype(GenRegister::offset(src0, 0, 4), GEN_TYPE_UD) :
> -      GenRegister::unpacked_ud(src0.nr, src0.subnr + 1);
> -    GenRegister s1l = src1.hstride == GEN_HORIZONTAL_STRIDE_0 ?
> -      GenRegister::retype(src1, GEN_TYPE_UD) : GenRegister::unpacked_ud(src1.nr, src1.subnr);
> -    GenRegister s1h = src1.hstride == GEN_HORIZONTAL_STRIDE_0 ?
> -      GenRegister::retype(GenRegister::offset(src1, 0, 4), GEN_TYPE_UD) :
> -      GenRegister::unpacked_ud(src1.nr, src1.subnr + 1);
> -
> -    GenRegister acc0 = GenRegister::retype(GenRegister::acc(), GEN_TYPE_D);
> -    p->push();
> -    p->curr.execWidth = 8;
> -    p->ADDC(dl, s0l, s1l);
> -    p->MOV(tmp0, acc0);
> -    p->ADDC(dl, dl, GenRegister::immud(1));
> -    p->MOV(tmp1, acc0);
> -    p->ADD(tmp0, tmp0, tmp1);
> -
> -    p->ADDC(dh, s0h, s1h);
> -    p->MOV(tmp1, acc0);
> -    p->ADDC(dh, dh, tmp0);
> -    p->MOV(tmp0, acc0);
> -    p->ADD(tmp1, tmp0, tmp1);
> -
> -    if (execWidth == 16) {
> -      p->curr.quarterControl = 1;
> -      p->ADDC(GenRegister::Qn(dl, 1), GenRegister::Qn(s0l, 1), GenRegister::Qn(s1l, 1));
> -      p->MOV(GenRegister::Qn(tmp0, 1), acc0);
> -      p->ADDC(GenRegister::Qn(dl, 1), GenRegister::Qn(dl, 1), GenRegister::immud(1));
> -      p->MOV(GenRegister::Qn(tmp1, 1), acc0);
> -      p->ADD(GenRegister::Qn(tmp0, 1), GenRegister::Qn(tmp0, 1), GenRegister::Qn(tmp1, 1));
> -
> -      p->ADDC(GenRegister::Qn(dh, 1), GenRegister::Qn(s0h, 1), GenRegister::Qn(s1h, 1));
> -      p->MOV(GenRegister::Qn(tmp1, 1), acc0);
> -      p->ADDC(GenRegister::Qn(dh, 1), GenRegister::Qn(dh, 1), GenRegister::Qn(tmp0, 1));
> -      p->MOV(GenRegister::Qn(tmp0, 1), acc0);
> -      p->ADD(GenRegister::Qn(tmp1, 1), GenRegister::Qn(tmp0, 1), GenRegister::Qn(tmp1, 1));
> -    }
> -    p->pop();
> -
> -    packLongVec(GenRegister::retype(tmp_dst, GEN_TYPE_UD), GenRegister::retype(dst, GEN_TYPE_UD), execWidth);
> +    tmp0.type = tmp1.type = GEN_TYPE_UL;
>   
> -    p->SHR(dst, dst, GenRegister::immud(1));
> -    p->SHL(tmp_dst, tmp1, GenRegister::immud(63));
> -    p->ADD(dst, dst, tmp_dst);
> +    //rhadd = (src0>>1) + (src1>>1) + ((src0&0x1) | (src1&0x1))
> +    p->AND(tmp0, src0, GenRegister::immud(1));
> +    p->AND(tmp1, src1, GenRegister::immud(1));
> +    p->OR(dst, tmp0, tmp1);
> +    p->SHR(tmp0, src0, GenRegister::immud(1));
> +    p->SHR(tmp1, src1, GenRegister::immud(1));
> +    p->ADD(dst, dst, tmp0);
> +    p->ADD(dst, dst, tmp1);
>     }
>   
>     void Gen8Context::emitI64DIVREMInstruction(const SelectionInstruction &cnst_insn)
> diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp
> index c240261..7f9c95a 100644
> --- a/backend/src/backend/gen_insn_selection.cpp
> +++ b/backend/src/backend/gen_insn_selection.cpp
> @@ -2516,10 +2516,9 @@ namespace gbe
>                   tmp[i] = sel.selReg(sel.reg(FAMILY_DWORD));
>                 sel.I64HADD(dst, src0, src1, tmp, 4);
>               } else {
> -              tmp[0] = sel.selReg(sel.reg(FAMILY_DWORD), ir::TYPE_U64);
> -              tmp[1] = sel.selReg(sel.reg(FAMILY_DWORD), ir::TYPE_U64);
> -              tmp[2] = sel.selReg(sel.reg(FAMILY_QWORD), ir::TYPE_U64);
> -              sel.I64HADD(dst, src0, src1, tmp, 3);
> +              tmp[0] = sel.selReg(sel.reg(FAMILY_QWORD), ir::TYPE_U64);
> +              tmp[1] = sel.selReg(sel.reg(FAMILY_QWORD), ir::TYPE_U64);
> +              sel.I64HADD(dst, src0, src1, tmp, 2);
>               }
>               break;
>             }
> @@ -2531,10 +2530,9 @@ namespace gbe
>                   tmp[i] = sel.selReg(sel.reg(FAMILY_DWORD));
>                 sel.I64RHADD(dst, src0, src1, tmp, 4);
>               } else {
> -              tmp[0] = sel.selReg(sel.reg(FAMILY_DWORD), ir::TYPE_U64);
> -              tmp[1] = sel.selReg(sel.reg(FAMILY_DWORD), ir::TYPE_U64);
> -              tmp[2] = sel.selReg(sel.reg(FAMILY_QWORD), ir::TYPE_U64);
> -              sel.I64RHADD(dst, src0, src1, tmp, 3);
> +              tmp[0] = sel.selReg(sel.reg(FAMILY_QWORD), ir::TYPE_U64);
> +              tmp[1] = sel.selReg(sel.reg(FAMILY_QWORD), ir::TYPE_U64);
> +              sel.I64RHADD(dst, src0, src1, tmp, 2);
>               }
>               break;
>             }


From rong.r.yang at intel.com  Mon Mar 23 21:58:10 2015
From: rong.r.yang at intel.com (Yang, Rong R)
Date: Tue, 24 Mar 2015 04:58:10 +0000
Subject: [Beignet] [PATCH] Fix: Event callback that not executed when
 command already marked CL_COMPLETE
In-Reply-To: <550C8C4F.80001@polymtl.ca>
References: <550B67A5.6070207@polymtl.ca>
 <7597C9376C272A4AB2D29E91550B7B090141E537@shsmsx102.ccr.corp.intel.com>
 <550C8C4F.80001@polymtl.ca>
Message-ID: <7597C9376C272A4AB2D29E91550B7B090141E840@shsmsx102.ccr.corp.intel.com>

The patch looks good to me. Can you send this patch individually? 
Thanks.

> -----Original Message-----
> From: Beignet [mailto:beignet-bounces at lists.freedesktop.org] On Behalf Of
> David Couturier
> Sent: Saturday, March 21, 2015 05:09
> To: beignet at lists.freedesktop.org
> Subject: Re: [Beignet] [PATCH] Fix: Event callback that not executed when
> command already marked CL_COMPLETE
> 
> I modified the commit as suggested. Also, I noticed that the callback handling
> was not thread safe. I modified the general process to be thread safe.
> 
> # PATCH BEGINS HERE:
> 
> When trying to register a callback on the clEnqueueReadBuffer command,
> since it is processed synchronously all the time, the command was marked
> CL_COMPLETE every time. If the event returned by clEnqueueReadBuffer
> was then used to register a callback function, the callback registration
> did not check whether the callback needed to be executed.
> 
> Modified the handling of the callback registration in cl_set_event_callback to
> only call the callback being created if its status is already reached.
> 
> Added thread safety measures for pfn_notify calls since the status value can
> be changed while executing the callback.
> 
> Grouped the pfn_notify calls to a unified function cl_event_call_callback that
> handles thread safety: it queues callbacks in a node list while under the
> protection of pthread_mutex and then calls the callbacks outside of the
> pthread_mutex (this is required because the callback can deadlock if it calls a
> cl_api function that uses the mutex)
> 
> Signed-off-by: David Couturier <david.couturier at polymtl.ca>
> ---
>   src/cl_event.c | 77
> ++++++++++++++++++++++++++++++++++++++++++----------------
>   src/cl_event.h |  4 ++-
>   2 files changed, 59 insertions(+), 22 deletions(-)
> 
> diff --git a/src/cl_event.c b/src/cl_event.c index f70e531..eb5d54b 100644
> --- a/src/cl_event.c
> +++ b/src/cl_event.c
> @@ -119,16 +119,7 @@ void cl_event_delete(cl_event event)
>       event->queue->last_event = NULL;
> 
>     /* Call all user's callback if haven't execute */
> -  user_callback *cb = event->user_cb;
> -  while(event->user_cb) {
> -    cb = event->user_cb;
> -    if(cb->executed == CL_FALSE) {
> -      cb->executed = CL_TRUE;
> -      cb->pfn_notify(event, event->status, cb->user_data);
> -    }
> -    event->user_cb = cb->next;
> -    cl_free(cb);
> -  }
> +  cl_event_call_callback(event, CL_COMPLETE, CL_TRUE); // CL_COMPLETE
> status will force all callbacks that are not executed to run
> 
>     /* delete gpgpu event object */
>     if(event->gpgpu_event)
> @@ -180,8 +171,22 @@ cl_int cl_event_set_callback(cl_event event ,
>     cb->status      = command_exec_callback_type;
>     cb->executed    = CL_FALSE;
> 
> -  cb->next        = event->user_cb;
> -  event->user_cb  = cb;
> +
> +  // It is possible that the event enqueued is already completed.
> +  // clEnqueueReadBuffer can be synchronous and when the callback
> +  // is registered after, it still needs to get executed.
> +  pthread_mutex_lock(&event->ctx->event_lock); // Thread safety
> required: operations on the event->status can be made from many
> different threads
> +  if(event->status <= command_exec_callback_type) {
> +	  /* Call user callback */
> +	  pthread_mutex_unlock(&event->ctx->event_lock); // pfn_notify
> can
> call clFunctions that use the event_lock and from here it's not required
> +	  cb->pfn_notify(event, event->status, cb->user_data);
> +	  cl_free(cb);
> +  } else {
> +	  // Enqueue to callback list
> +	  cb->next        = event->user_cb;
> +	  event->user_cb  = cb;
> +	  pthread_mutex_unlock(&event->ctx->event_lock);
> +  }
> 
>   exit:
>     return err;
> @@ -388,9 +393,46 @@ error:
>     goto exit;
>   }
> 
> +void cl_event_call_callback(cl_event event, cl_int status, cl_bool
> free_cb) {
> +	user_callback *user_cb = NULL;
> +	user_callback *queue_cb = NULL; // For thread safety, we create a
> queue that holds user_callback's pfn_notify contents
> +	user_callback *temp_cb = NULL;
> +	user_cb = event->user_cb;
> +	pthread_mutex_lock(&event->ctx->event_lock);
> +	while(user_cb) {
> +		if(user_cb->status >= status
> +				&& user_cb->executed == CL_FALSE) { //
> Added check to not execute a
> callback when it was already handled
> +			user_cb->executed = CL_TRUE;
> +			temp_cb = cl_malloc(sizeof(user_callback));
> +			if(!temp_cb) {
> +				break; // Out of memory
> +			}
> +			temp_cb->pfn_notify = user_cb->pfn_notify; //
> Minor struct copy to
> call ppfn_notify out of the pthread_mutex
> +			temp_cb->user_data = user_cb->user_data;
> +			if(free_cb) {
> +				cl_free(user_cb);
> +			}
> +			if(!queue_cb) {
> +				queue_cb = temp_cb;
> +				queue_cb->next = NULL;
> +			} else { // Enqueue
> +				temp_cb->next = queue_cb;
> +				queue_cb->next = temp_cb;
> +			}
> +		}
> +		user_cb = user_cb->next;
> +	}
> +	pthread_mutex_unlock(&event->ctx->event_lock);
> +	// Calling the callbacks outside of the event_lock is required because
> the callback can call cl_api functions and get deadlocked
> +	while(queue_cb) { // For each callback queued, actually execute the
> callback
> +		queue_cb->pfn_notify(event, event->status, queue_cb-
> >user_data);
> +		temp_cb = queue_cb;
> +		queue_cb = queue_cb->next;
> +		cl_free(temp_cb);
> +	}
> +}
>   void cl_event_set_status(cl_event event, cl_int status)
>   {
> -  user_callback *user_cb;
>     cl_int ret, i;
>     cl_event evt;
> 
> @@ -437,14 +479,7 @@ void cl_event_set_status(cl_event event, cl_int
> status)
>     pthread_mutex_unlock(&event->ctx->event_lock);
> 
>     /* Call user callback */
> -  user_cb = event->user_cb;
> -  while(user_cb) {
> -    if(user_cb->status >= status) {
> -      user_cb->executed = CL_TRUE;
> -      user_cb->pfn_notify(event, event->status, user_cb->user_data);
> -    }
> -    user_cb = user_cb->next;
> -  }
> +  cl_event_call_callback(event, status, CL_FALSE);
> 
>     if(event->type == CL_COMMAND_USER) {
>       /* Check all defer enqueue */
> diff --git a/src/cl_event.h b/src/cl_event.h
> index 0730530..9bb2ac8 100644
> --- a/src/cl_event.h
> +++ b/src/cl_event.h
> @@ -78,8 +78,10 @@ cl_event cl_event_new(cl_context,
> cl_command_queue,
> cl_command_type, cl_bool);
>   void cl_event_delete(cl_event);
>   /* Add one more reference to this object */
>   void cl_event_add_ref(cl_event);
> -/* Rigister a user callback function for specific commond execution
> status */
> +/* Register a user callback function for specific commond execution
> status */
>   cl_int cl_event_set_callback(cl_event, cl_int, EVENT_NOTIFY, void *);
> +/* Execute the event's callback if the event's status supersedes the
> callback's status. Free the callback if specified */
> +void cl_event_call_callback(cl_event event, cl_int status, cl_bool
> free_cb);
>   /* Check events wait list for enqueue commonds */
>   cl_int cl_event_check_waitlist(cl_uint, const cl_event *, cl_event *,
> cl_context);
>   /* Wait the all events in wait list complete */
> --
> 1.9.1
> 
> > One comment. Thanks find and fix it.
> >
> >> -----Original Message-----
> >> From: Beignet [mailto:beignet-bounces at lists.freedesktop.org] On Behalf
> Of
> >> David Couturier
> >> Sent: Friday, March 20, 2015 08:20
> >> To: Zou, Nanhai
> >> Cc: beignet at lists.freedesktop.org
> >> Subject: [Beignet] [PATCH] Fix: Event callback that not executed when
> >> command already marked CL_COMPLETE
> >>
> >> When trying to register a callback on the clEnqueueReadBuffer command,
> >> since it is processed synchroniously all the time, the command was
> marked
> >> CL_COMPLETE every time. If the event returned by clEnqueueReadBuffer
> >> was then used to register a callback function, the callback function did no
> >> check to execute it if nessary.
> >>
> >> Fixed by adding a check at the end of the cl_event_set_callback function.
> >>
> >> All tests passed.
> >>
> >> Signed-off-by: David Couturier <david.couturier at polymtl.ca>
> >> ---
> >>    src/cl_event.c | 15 +++++++++++++++
> >>    1 file changed, 15 insertions(+)
> >>
> >> diff --git a/src/cl_event.c b/src/cl_event.c index f70e531..df4a5a5 100644
> >> --- a/src/cl_event.c
> >> +++ b/src/cl_event.c
> >> @@ -183,6 +183,21 @@ cl_int cl_event_set_callback(cl_event event ,
> >>      cb->next        = event->user_cb;
> >>      event->user_cb  = cb;
> >>
> >> +  // It is possible that the event enqueued is already completed.
> >> +  // clEnqueueReadBuffer can be synchronious and when the callback  //
> >> + is registered after, it still needs to get executed.
> >> +  if(event->status == CL_COMPLETE) {
> >> +         /* Call user callback */
> >> +         user_callback *user_cb = event->user_cb;
> >> +         while(user_cb) {
> >> +                 if(user_cb->status >= CL_COMPLETE) {
> >> +                         user_cb->executed = CL_TRUE;
> >> +                         user_cb->pfn_notify(event, event->status,
> >> user_cb->user_data);
> >> +                 }
> >> +                 user_cb = user_cb->next;
> >> +         }
> >
> > I think only the current callback should be called. Assume the scenario:
> > clEnqueueReadBuffer(......,ev);
> > clSetEventCallback(ev, CL_SUBMITTED, ...);
> > clSetEventCallback(ev, CL_COMPLETE, ....);
> > In the second clSetEventCallback, the first callback have been executed,
> only need execute the second callback.
> > So need execute current callback when the event's status <=
> command_exec_callback_type.
> >
> >> +  }
> >> +
> >>    exit:
> >>      return err;
> >>    error:
> >> --
> >> 1.9.1
> >> _______________________________________________
> >> Beignet mailing list
> >> Beignet at lists.freedesktop.org
> >> http://lists.freedesktop.org/mailman/listinfo/beignet
> > _______________________________________________
> > Beignet mailing list
> > Beignet at lists.freedesktop.org
> > http://lists.freedesktop.org/mailman/listinfo/beignet
> >
> _______________________________________________
> Beignet mailing list
> Beignet at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/beignet

From zhigang.gong at linux.intel.com  Mon Mar 23 21:14:23 2015
From: zhigang.gong at linux.intel.com (Zhigang Gong)
Date: Tue, 24 Mar 2015 12:14:23 +0800
Subject: [Beignet] [PATCH] Fix: Event callback that not executed when
 command already marked CL_COMPLETE
In-Reply-To: <7597C9376C272A4AB2D29E91550B7B090141E840@shsmsx102.ccr.corp.intel.com>
References: <550B67A5.6070207@polymtl.ca>
 <7597C9376C272A4AB2D29E91550B7B090141E537@shsmsx102.ccr.corp.intel.com>
 <550C8C4F.80001@polymtl.ca>
 <7597C9376C272A4AB2D29E91550B7B090141E840@shsmsx102.ccr.corp.intel.com>
Message-ID: <20150324041422.GP21732@ivb-gt2-rev4>

Right, the patch embedded in the previous email has a formatting problem.
Could you use "git send-email" to send the patch again?

Thanks for your contribution.

On Tue, Mar 24, 2015 at 04:58:10AM +0000, Yang, Rong R wrote:
> The patch looks good to me. Can you send this patch individually? 
> Thanks.
> 
> > -----Original Message-----
> > From: Beignet [mailto:beignet-bounces at lists.freedesktop.org] On Behalf Of
> > David Couturier
> > Sent: Saturday, March 21, 2015 05:09
> > To: beignet at lists.freedesktop.org
> > Subject: Re: [Beignet] [PATCH] Fix: Event callback that not executed when
> > command already marked CL_COMPLETE
> > 
> > I modified the commit as suggested. Also, I noticed that the callback handling
> > was not thread safe. I modified the general process to be thread safe.
> > 
> > # PATCH BEGINS HERE:
> > 
> > When trying to register a callback on the clEnqueueReadBuffer command,
> > since it is processed synchroniously all the time, the command was marked
> > CL_COMPLETE every time. If the event returned by clEnqueueReadBuffer
> > was then used to register a callback function, the callback function did no
> > check to execute it if nessary.
> > 
> > Modified the handling of the callback registration in cl_set_event_callback to
> > only call the callback being created if it's status is already reached.
> > 
> > Added thread safety measures for pfn_notify calls since the status value can
> > be changed while executing the callback.
> > 
> > Grouped the pfn_notify calls to a unified function cl_event_call_callback that
> > handles thread safety: it queues callbacks in a node list while under the
> > protection of pthread_mutex and then calls the callbacks outside of the
> > pthread_mutex (this is required because the callback can deadlock if it calls a
> > cl_api function that uses the mutex)
> > 
> > Signed-off-by: David Couturier <david.couturier at polymtl.ca>
> > ---
> >   src/cl_event.c | 77
> > ++++++++++++++++++++++++++++++++++++++++++----------------
> >   src/cl_event.h |  4 ++-
> >   2 files changed, 59 insertions(+), 22 deletions(-)
> > 
> > diff --git a/src/cl_event.c b/src/cl_event.c index f70e531..eb5d54b 100644
> > --- a/src/cl_event.c
> > +++ b/src/cl_event.c
> > @@ -119,16 +119,7 @@ void cl_event_delete(cl_event event)
> >       event->queue->last_event = NULL;
> > 
> >     /* Call all user's callback if haven't execute */
> > -  user_callback *cb = event->user_cb;
> > -  while(event->user_cb) {
> > -    cb = event->user_cb;
> > -    if(cb->executed == CL_FALSE) {
> > -      cb->executed = CL_TRUE;
> > -      cb->pfn_notify(event, event->status, cb->user_data);
> > -    }
> > -    event->user_cb = cb->next;
> > -    cl_free(cb);
> > -  }
> > +  cl_event_call_callback(event, CL_COMPLETE, CL_TRUE); // CL_COMPLETE
> > status will force all callbacks that are not executed to run
> > 
> >     /* delete gpgpu event object */
> >     if(event->gpgpu_event)
> > @@ -180,8 +171,22 @@ cl_int cl_event_set_callback(cl_event event ,
> >     cb->status      = command_exec_callback_type;
> >     cb->executed    = CL_FALSE;
> > 
> > -  cb->next        = event->user_cb;
> > -  event->user_cb  = cb;
> > +
> > +  // It is possible that the event enqueued is already completed.
> > +  // clEnqueueReadBuffer can be synchronous and when the callback
> > +  // is registered after, it still needs to get executed.
> > +  pthread_mutex_lock(&event->ctx->event_lock); // Thread safety
> > required: operations on the event->status can be made from many
> > different threads
> > +  if(event->status <= command_exec_callback_type) {
> > +	  /* Call user callback */
> > +	  pthread_mutex_unlock(&event->ctx->event_lock); // pfn_notify
> > can
> > call clFunctions that use the event_lock and from here it's not required
> > +	  cb->pfn_notify(event, event->status, cb->user_data);
> > +	  cl_free(cb);
> > +  } else {
> > +	  // Enqueue to callback list
> > +	  cb->next        = event->user_cb;
> > +	  event->user_cb  = cb;
> > +	  pthread_mutex_unlock(&event->ctx->event_lock);
> > +  }
> > 
> >   exit:
> >     return err;
> > @@ -388,9 +393,46 @@ error:
> >     goto exit;
> >   }
> > 
> > +void cl_event_call_callback(cl_event event, cl_int status, cl_bool
> > free_cb) {
> > +	user_callback *user_cb = NULL;
> > +	user_callback *queue_cb = NULL; // For thread safety, we create a
> > queue that holds user_callback's pfn_notify contents
> > +	user_callback *temp_cb = NULL;
> > +	user_cb = event->user_cb;
> > +	pthread_mutex_lock(&event->ctx->event_lock);
> > +	while(user_cb) {
> > +		if(user_cb->status >= status
> > +				&& user_cb->executed == CL_FALSE) { //
> > Added check to not execute a
> > callback when it was already handled
> > +			user_cb->executed = CL_TRUE;
> > +			temp_cb = cl_malloc(sizeof(user_callback));
> > +			if(!temp_cb) {
> > +				break; // Out of memory
> > +			}
> > +			temp_cb->pfn_notify = user_cb->pfn_notify; //
> > Minor struct copy to
> > call ppfn_notify out of the pthread_mutex
> > +			temp_cb->user_data = user_cb->user_data;
> > +			if(free_cb) {
> > +				cl_free(user_cb);
> > +			}
> > +			if(!queue_cb) {
> > +				queue_cb = temp_cb;
> > +				queue_cb->next = NULL;
> > +			} else { // Enqueue
> > +				temp_cb->next = queue_cb;
> > +				queue_cb->next = temp_cb;
> > +			}
> > +		}
> > +		user_cb = user_cb->next;
> > +	}
> > +	pthread_mutex_unlock(&event->ctx->event_lock);
> > +	// Calling the callbacks outside of the event_lock is required because
> > the callback can call cl_api functions and get deadlocked
> > +	while(queue_cb) { // For each callback queued, actually execute the
> > callback
> > +		queue_cb->pfn_notify(event, event->status, queue_cb-
> > >user_data);
> > +		temp_cb = queue_cb;
> > +		queue_cb = queue_cb->next;
> > +		cl_free(temp_cb);
> > +	}
> > +}
> >   void cl_event_set_status(cl_event event, cl_int status)
> >   {
> > -  user_callback *user_cb;
> >     cl_int ret, i;
> >     cl_event evt;
> > 
> > @@ -437,14 +479,7 @@ void cl_event_set_status(cl_event event, cl_int
> > status)
> >     pthread_mutex_unlock(&event->ctx->event_lock);
> > 
> >     /* Call user callback */
> > -  user_cb = event->user_cb;
> > -  while(user_cb) {
> > -    if(user_cb->status >= status) {
> > -      user_cb->executed = CL_TRUE;
> > -      user_cb->pfn_notify(event, event->status, user_cb->user_data);
> > -    }
> > -    user_cb = user_cb->next;
> > -  }
> > +  cl_event_call_callback(event, status, CL_FALSE);
> > 
> >     if(event->type == CL_COMMAND_USER) {
> >       /* Check all defer enqueue */
> > diff --git a/src/cl_event.h b/src/cl_event.h
> > index 0730530..9bb2ac8 100644
> > --- a/src/cl_event.h
> > +++ b/src/cl_event.h
> > @@ -78,8 +78,10 @@ cl_event cl_event_new(cl_context,
> > cl_command_queue,
> > cl_command_type, cl_bool);
> >   void cl_event_delete(cl_event);
> >   /* Add one more reference to this object */
> >   void cl_event_add_ref(cl_event);
> > -/* Rigister a user callback function for specific commond execution
> > status */
> > +/* Register a user callback function for specific commond execution
> > status */
> >   cl_int cl_event_set_callback(cl_event, cl_int, EVENT_NOTIFY, void *);
> > +/* Execute the event's callback if the event's status supersedes the
> > callback's status. Free the callback if specified */
> > +void cl_event_call_callback(cl_event event, cl_int status, cl_bool
> > free_cb);
> >   /* Check events wait list for enqueue commonds */
> >   cl_int cl_event_check_waitlist(cl_uint, const cl_event *, cl_event *,
> > cl_context);
> >   /* Wait the all events in wait list complete */
> > --
> > 1.9.1
> > 
> > > One comment. Thanks find and fix it.
> > >
> > >> -----Original Message-----
> > >> From: Beignet [mailto:beignet-bounces at lists.freedesktop.org] On Behalf
> > Of
> > >> David Couturier
> > >> Sent: Friday, March 20, 2015 08:20
> > >> To: Zou, Nanhai
> > >> Cc: beignet at lists.freedesktop.org
> > >> Subject: [Beignet] [PATCH] Fix: Event callback that not executed when
> > >> command already marked CL_COMPLETE
> > >>
> > >> When trying to register a callback on the clEnqueueReadBuffer command,
> > >> since it is processed synchroniously all the time, the command was
> > marked
> > >> CL_COMPLETE every time. If the event returned by clEnqueueReadBuffer
> > >> was then used to register a callback function, the callback function did no
> > >> check to execute it if nessary.
> > >>
> > >> Fixed by adding a check at the end of the cl_event_set_callback function.
> > >>
> > >> All tests passed.
> > >>
> > >> Signed-off-by: David Couturier <david.couturier at polymtl.ca>
> > >> ---
> > >>    src/cl_event.c | 15 +++++++++++++++
> > >>    1 file changed, 15 insertions(+)
> > >>
> > >> diff --git a/src/cl_event.c b/src/cl_event.c index f70e531..df4a5a5 100644
> > >> --- a/src/cl_event.c
> > >> +++ b/src/cl_event.c
> > >> @@ -183,6 +183,21 @@ cl_int cl_event_set_callback(cl_event event ,
> > >>      cb->next        = event->user_cb;
> > >>      event->user_cb  = cb;
> > >>
> > >> +  // It is possible that the event enqueued is already completed.
> > >> +  // clEnqueueReadBuffer can be synchronious and when the callback  //
> > >> + is registered after, it still needs to get executed.
> > >> +  if(event->status == CL_COMPLETE) {
> > >> +         /* Call user callback */
> > >> +         user_callback *user_cb = event->user_cb;
> > >> +         while(user_cb) {
> > >> +                 if(user_cb->status >= CL_COMPLETE) {
> > >> +                         user_cb->executed = CL_TRUE;
> > >> +                         user_cb->pfn_notify(event, event->status,
> > >> user_cb->user_data);
> > >> +                 }
> > >> +                 user_cb = user_cb->next;
> > >> +         }
> > >
> > > I think only the current callback should be called. Assume the scenario:
> > > clEnqueueReadBuffer(......,ev);
> > > clSetEventCallback(ev, CL_SUBMITTED, ...);
> > > clSetEventCallback(ev, CL_COMPLETE, ....);
> > > In the second clSetEventCallback, the first callback have been executed,
> > only need execute the second callback.
> > > So need execute current callback when the event's status <=
> > command_exec_callback_type.
> > >
> > >> +  }
> > >> +
> > >>    exit:
> > >>      return err;
> > >>    error:
> > >> --
> > >> 1.9.1
> > >> _______________________________________________
> > >> Beignet mailing list
> > >> Beignet at lists.freedesktop.org
> > >> http://lists.freedesktop.org/mailman/listinfo/beignet
> > > _______________________________________________
> > > Beignet mailing list
> > > Beignet at lists.freedesktop.org
> > > http://lists.freedesktop.org/mailman/listinfo/beignet
> > >
> > _______________________________________________
> > Beignet mailing list
> > Beignet at lists.freedesktop.org
> > http://lists.freedesktop.org/mailman/listinfo/beignet
> _______________________________________________
> Beignet mailing list
> Beignet at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/beignet

From david.couturier at polymtl.ca  Mon Mar 23 22:38:17 2015
From: david.couturier at polymtl.ca (David Couturier)
Date: Tue, 24 Mar 2015 01:38:17 -0400
Subject: [Beignet] [PATCH] Fix: Event callback that were not executed when
	command was already CL_COMPLETE + thread safety for callbacks
In-Reply-To: <david.couturier@polymtl.ca>
References: <david.couturier@polymtl.ca>
Message-ID: <1427175497-15874-1-git-send-email-david.couturier@polymtl.ca>

When trying to register a callback on the clEnqueueReadBuffer command, since it is processed
synchronously all the time, the command was marked CL_COMPLETE every time. If the event returned
by clEnqueueReadBuffer was then used to register a callback function, no check was made
to execute the callback if necessary.

Modified the handling of the callback registration in cl_event_set_callback to only call the callback being created if its status is already reached.

Added thread safety measures for pfn_notify calls since the status value can be changed while executing the callback.

Grouped the pfn_notify calls into a unified function cl_event_call_callback that handles thread safety: it queues callbacks in a node list while under the protection of pthread_mutex and then calls the callbacks outside of the pthread_mutex (this is required because the callback can deadlock if it calls a cl_api function that uses the mutex).

Signed-off-by: David Couturier <david.couturier at polymtl.ca>
---
 src/cl_event.c | 77 ++++++++++++++++++++++++++++++++++++++++++----------------
 src/cl_event.h |  4 ++-
 2 files changed, 59 insertions(+), 22 deletions(-)

diff --git a/src/cl_event.c b/src/cl_event.c
index f70e531..eb5d54b 100644
--- a/src/cl_event.c
+++ b/src/cl_event.c
@@ -119,16 +119,7 @@ void cl_event_delete(cl_event event)
     event->queue->last_event = NULL;
 
   /* Call all user's callback if haven't execute */
-  user_callback *cb = event->user_cb;
-  while(event->user_cb) {
-    cb = event->user_cb;
-    if(cb->executed == CL_FALSE) {
-      cb->executed = CL_TRUE;
-      cb->pfn_notify(event, event->status, cb->user_data);
-    }
-    event->user_cb = cb->next;
-    cl_free(cb);
-  }
+  cl_event_call_callback(event, CL_COMPLETE, CL_TRUE); // CL_COMPLETE status will force all callbacks that are not executed to run
 
   /* delete gpgpu event object */
   if(event->gpgpu_event)
@@ -180,8 +171,22 @@ cl_int cl_event_set_callback(cl_event event ,
   cb->status      = command_exec_callback_type;
   cb->executed    = CL_FALSE;
 
-  cb->next        = event->user_cb;
-  event->user_cb  = cb;
+
+  // It is possible that the event enqueued is already completed.
+  // clEnqueueReadBuffer can be synchronous and when the callback
+  // is registered after, it still needs to get executed.
+  pthread_mutex_lock(&event->ctx->event_lock); // Thread safety required: operations on the event->status can be made from many different threads
+  if(event->status <= command_exec_callback_type) {
+	  /* Call user callback */
+	  pthread_mutex_unlock(&event->ctx->event_lock); // pfn_notify can call clFunctions that use the event_lock and from here it's not required
+	  cb->pfn_notify(event, event->status, cb->user_data);
+	  cl_free(cb);
+  } else {
+	  // Enqueue to callback list
+	  cb->next        = event->user_cb;
+	  event->user_cb  = cb;
+	  pthread_mutex_unlock(&event->ctx->event_lock);
+  }
 
 exit:
   return err;
@@ -388,9 +393,46 @@ error:
   goto exit;
 }
 
+void cl_event_call_callback(cl_event event, cl_int status, cl_bool free_cb) {
+	user_callback *user_cb = NULL;
+	user_callback *queue_cb = NULL; // For thread safety, we create a queue that holds user_callback's pfn_notify contents
+	user_callback *temp_cb = NULL;
+	user_cb = event->user_cb;
+	pthread_mutex_lock(&event->ctx->event_lock);
+	while(user_cb) {
+		if(user_cb->status >= status
+				&& user_cb->executed == CL_FALSE) { // Added check to not execute a callback when it was already handled
+			user_cb->executed = CL_TRUE;
+			temp_cb = cl_malloc(sizeof(user_callback));
+			if(!temp_cb) {
+				break; // Out of memory
+			}
+			temp_cb->pfn_notify = user_cb->pfn_notify; // Minor struct copy to call ppfn_notify out of the pthread_mutex
+			temp_cb->user_data = user_cb->user_data;
+			if(free_cb) {
+				cl_free(user_cb);
+			}
+			if(!queue_cb) {
+				queue_cb = temp_cb;
+				queue_cb->next = NULL;
+			} else { // Enqueue
+				temp_cb->next = queue_cb;
+				queue_cb->next = temp_cb;
+			}
+		}
+		user_cb = user_cb->next;
+	}
+	pthread_mutex_unlock(&event->ctx->event_lock);
+	// Calling the callbacks outside of the event_lock is required because the callback can call cl_api functions and get deadlocked
+	while(queue_cb) { // For each callback queued, actually execute the callback
+		queue_cb->pfn_notify(event, event->status, queue_cb->user_data);
+		temp_cb = queue_cb;
+		queue_cb = queue_cb->next;
+		cl_free(temp_cb);
+	}
+}
 void cl_event_set_status(cl_event event, cl_int status)
 {
-  user_callback *user_cb;
   cl_int ret, i;
   cl_event evt;
 
@@ -437,14 +479,7 @@ void cl_event_set_status(cl_event event, cl_int status)
   pthread_mutex_unlock(&event->ctx->event_lock);
 
   /* Call user callback */
-  user_cb = event->user_cb;
-  while(user_cb) {
-    if(user_cb->status >= status) {
-      user_cb->executed = CL_TRUE;
-      user_cb->pfn_notify(event, event->status, user_cb->user_data);
-    }
-    user_cb = user_cb->next;
-  }
+  cl_event_call_callback(event, status, CL_FALSE);
 
   if(event->type == CL_COMMAND_USER) {
     /* Check all defer enqueue */
diff --git a/src/cl_event.h b/src/cl_event.h
index 0730530..9bb2ac8 100644
--- a/src/cl_event.h
+++ b/src/cl_event.h
@@ -78,8 +78,10 @@ cl_event cl_event_new(cl_context, cl_command_queue, cl_command_type, cl_bool);
 void cl_event_delete(cl_event);
 /* Add one more reference to this object */
 void cl_event_add_ref(cl_event);
-/* Rigister a user callback function for specific commond execution status */
+/* Register a user callback function for specific commond execution status */
 cl_int cl_event_set_callback(cl_event, cl_int, EVENT_NOTIFY, void *);
+/* Execute the event's callback if the event's status supersedes the callback's status. Free the callback if specified */
+void cl_event_call_callback(cl_event event, cl_int status, cl_bool free_cb);
 /* Check events wait list for enqueue commonds */
 cl_int cl_event_check_waitlist(cl_uint, const cl_event *, cl_event *, cl_context);
 /* Wait the all events in wait list complete */
-- 
1.9.1


From zhigang.gong at linux.intel.com  Mon Mar 23 22:12:10 2015
From: zhigang.gong at linux.intel.com (Zhigang Gong)
Date: Tue, 24 Mar 2015 13:12:10 +0800
Subject: [Beignet] [PATCH] Use matching versions of clang/llvm and
 libclang/libllvm
In-Reply-To: <550D1EFE.7030106@zoho.com>
References: <550D1EFE.7030106@zoho.com>
Message-ID: <20150324051209.GQ21732@ivb-gt2-rev4>

LGTM, will push later. Thanks.

On Sat, Mar 21, 2015 at 07:34:22AM +0000, Rebecca N. Palmer wrote:
> Compile the OpenCL standard library with the same version of clang
> as will compile OpenCL user code, not plain "clang" (i.e. the
> system default version, which may be different).
> 
> Signed-off-by: Rebecca Palmer <rebecca_palmer at zoho.com>
> 
> diff --git a/CMake/FindLLVM.cmake b/CMake/FindLLVM.cmake
> index e214437..fa13f1d 100644
> --- a/CMake/FindLLVM.cmake
> +++ b/CMake/FindLLVM.cmake
> @@ -23,13 +23,15 @@ else (LLVM_CONFIG_EXECUTABLE)
>    message(FATAL_ERROR "Could NOT find LLVM executable, please add -DLLVM_INSTALL_DIR=/path/to/llvm-config/ in cmake command")
>  endif (LLVM_CONFIG_EXECUTABLE)
> +execute_process(
> +  COMMAND ${LLVM_CONFIG_EXECUTABLE} --version
> +  OUTPUT_VARIABLE LLVM_VERSION
> +  OUTPUT_STRIP_TRAILING_WHITESPACE
> +)
> +string(REGEX REPLACE "([0-9])\\.([0-9]*).*" "\\1\\2" LLVM_VERSION_NODOT ${LLVM_VERSION})
> +string(REGEX REPLACE "([0-9])\\.([0-9]*).*" "\\1.\\2" LLVM_VERSION_NOPATCH ${LLVM_VERSION})
>  if (LLVM_FIND_VERSION_MAJOR AND LLVM_FIND_VERSION_MINOR)
>    SET(LLVM_FIND_VERSION_NODOT "${LLVM_FIND_VERSION_MAJOR}${LLVM_FIND_VERSION_MINOR}")
> -  execute_process(
> -    COMMAND ${LLVM_CONFIG_EXECUTABLE} --version
> -    OUTPUT_VARIABLE LLVM_VERSION
> -  )
> -  string(REGEX REPLACE "([0-9])\\.([0-9]*).*" "\\1\\2 " LLVM_VERSION_NODOT ${LLVM_VERSION})
>    if (LLVM_VERSION_NODOT VERSION_LESS LLVM_FIND_VERSION_NODOT)
>      message(FATAL_ERROR "imcompatible LLVM version ${LLVM_VERSION} required ${LLVM_FIND_VERSION}")
>    else (LLVM_VERSION_NODOT VERSION_LESS LLVM_FIND_VERSION_NODOT)
> @@ -42,6 +44,25 @@ if (LLVM_FIND_VERSION_MAJOR AND LLVM_FIND_VERSION_MINOR)
>    endif (LLVM_VERSION_NODOT VERSION_LESS LLVM_FIND_VERSION_NODOT)
>  endif (LLVM_FIND_VERSION_MAJOR AND LLVM_FIND_VERSION_MINOR)
> +if (LLVM_INSTALL_DIR)
> +  find_program(CLANG_EXECUTABLE
> +               NAMES clang-${LLVM_VERSION_NODOT} clang-${LLVM_VERSION_NOPATCH} clang
> +               PATHS ${LLVM_INSTALL_DIR} NO_DEFAULT_PATH)
> +  find_program(LLVM_AS_EXECUTABLE
> +               NAMES llvm-as-${LLVM_VERSION_NODOT} llvm-as-${LLVM_VERSION_NOPATCH} llvm-as
> +               PATHS ${LLVM_INSTALL_DIR} NO_DEFAULT_PATH)
> +  find_program(LLVM_LINK_EXECUTABLE
> +               NAMES llvm-link-${LLVM_VERSION_NODOT} llvm-link-${LLVM_VERSION_NOPATCH} llvm-link
> +               PATHS ${LLVM_INSTALL_DIR} NO_DEFAULT_PATH)
> +else (LLVM_INSTALL_DIR)
> +  find_program(CLANG_EXECUTABLE
> +               NAMES clang-${LLVM_VERSION_NODOT} clang-${LLVM_VERSION_NOPATCH} clang)
> +  find_program(LLVM_AS_EXECUTABLE
> +               NAMES llvm-as-${LLVM_VERSION_NODOT} llvm-as-${LLVM_VERSION_NOPATCH} llvm-as)
> +  find_program(LLVM_LINK_EXECUTABLE
> +               NAMES llvm-link-${LLVM_VERSION_NODOT} llvm-link-${LLVM_VERSION_NOPATCH} llvm-link)
> +endif (LLVM_INSTALL_DIR)
> +
>  execute_process(
>    COMMAND ${LLVM_CONFIG_EXECUTABLE} --includedir
>    OUTPUT_VARIABLE LLVM_INCLUDE_DIR
> diff --git a/backend/src/libocl/CMakeLists.txt b/backend/src/libocl/CMakeLists.txt
> index 16f00ee..6b825b0 100644
> --- a/backend/src/libocl/CMakeLists.txt
> +++ b/backend/src/libocl/CMakeLists.txt
> @@ -136,7 +136,7 @@ MACRO(ADD_CL_TO_BC_TARGET _file)
>      ADD_CUSTOM_COMMAND(OUTPUT ${output_name}
>  	COMMAND mkdir -p ${OCL_OBJECT_DIR}/
>  	#COMMAND echo ${LLVM_INSTALL_DIR}clang -cc1 ${CLANG_OCL_FLAGS} -I ${LIBOCL_BINARY_DIR}/include/ -emit-llvm-bc -triple spir -o ${output_name} -x cl ${_file}
> -	COMMAND ${LLVM_INSTALL_DIR}clang -cc1 ${CLANG_OCL_FLAGS} -I ${OCL_OBJECT_DIR}/include/ -emit-llvm-bc -triple spir -o ${output_name} -x cl ${_file}
> +	COMMAND ${CLANG_EXECUTABLE} -cc1 ${CLANG_OCL_FLAGS} -I ${OCL_OBJECT_DIR}/include/ -emit-llvm-bc -triple spir -o ${output_name} -x cl ${_file}
>  	DEPENDS ${_file} ${OCL_HEADER_FILES}
>  	COMMENT "Compiling ${_file}"
>  	)
> @@ -175,7 +175,7 @@ MACRO(ADD_LL_TO_BC_TARGET M)
>      ADD_CUSTOM_COMMAND(OUTPUT ${output_name}
>  	COMMAND mkdir -p ${OCL_OBJECT_DIR}/
>  	#COMMAND echo ${LLVM_INSTALL_DIR}llvm-as -o ${output_name} ${srcll_name}
> -	COMMAND ${LLVM_INSTALL_DIR}llvm-as -o ${output_name} ${srcll_name}
> +	COMMAND ${LLVM_AS_EXECUTABLE} -o ${output_name} ${srcll_name}
>  	DEPENDS ${srcll_name}
>  	COMMENT "Compiling ${srcll_name}"
>  	)
> @@ -193,21 +193,21 @@ ENDFOREACH(f)
>  ADD_CUSTOM_COMMAND(OUTPUT ${OCL_OBJECT_DIR}/beignet.bc
>      COMMAND mkdir -p ${LIBOCL_BINARY_DIR}/lib/
>      #COMMAND echo llvm-link -o ${LIBOCL_BINARY_DIR}/lib/beignet.bc ${OCL_BC_FILES}
> -    COMMAND ${LLVM_INSTALL_DIR}llvm-link -o ${OCL_OBJECT_DIR}/beignet.bc ${OCL_BC_FILES}
> +    COMMAND ${LLVM_LINK_EXECUTABLE} -o ${OCL_OBJECT_DIR}/beignet.bc ${OCL_BC_FILES}
>      DEPENDS ${OCL_BC_FILES}
>      COMMENT "Generate the bitcode file: ${OCL_OBJECT_DIR}/beignet.bc"
>      )
>  ADD_CUSTOM_COMMAND(OUTPUT ${OCL_OBJECT_DIR}/beignet.local.pch
>      COMMAND mkdir -p ${OCL_OBJECT_DIR}
> -    COMMAND ${LLVM_INSTALL_DIR}clang -cc1 ${CLANG_OCL_FLAGS} -triple spir -I ${OCL_OBJECT_DIR}/include/ -emit-pch -x cl ${OCL_OBJECT_DIR}/include/ocl.h -o ${OCL_OBJECT_DIR}/beignet.local.pch
> +    COMMAND ${CLANG_EXECUTABLE} -cc1 ${CLANG_OCL_FLAGS} -triple spir -I ${OCL_OBJECT_DIR}/include/ -emit-pch -x cl ${OCL_OBJECT_DIR}/include/ocl.h -o ${OCL_OBJECT_DIR}/beignet.local.pch
>      DEPENDS ${OCL_HEADER_FILES}
>      COMMENT "Generate the pch file: ${OCL_OBJECT_DIR}/beignet.local.pch"
>      )
>  ADD_CUSTOM_COMMAND(OUTPUT ${OCL_OBJECT_DIR}/beignet.pch
>      COMMAND mkdir -p ${OCL_OBJECT_DIR}
> -    COMMAND ${LLVM_INSTALL_DIR}clang -cc1 ${CLANG_OCL_FLAGS} -triple spir -I ${OCL_OBJECT_DIR}/include/ --relocatable-pch -emit-pch -isysroot ${LIBOCL_BINARY_DIR} -x cl ${OCL_OBJECT_DIR}/include/ocl.h -o ${OCL_OBJECT_DIR}/beignet.pch
> +    COMMAND ${CLANG_EXECUTABLE} -cc1 ${CLANG_OCL_FLAGS} -triple spir -I ${OCL_OBJECT_DIR}/include/ --relocatable-pch -emit-pch -isysroot ${LIBOCL_BINARY_DIR} -x cl ${OCL_OBJECT_DIR}/include/ocl.h -o ${OCL_OBJECT_DIR}/beignet.pch
>      DEPENDS ${OCL_HEADER_FILES}
>      COMMENT "Generate the pch file: ${OCL_OBJECT_DIR}/beignet.pch"
>      )
> 
> _______________________________________________
> Beignet mailing list
> Beignet at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/beignet

From zhigang.gong at linux.intel.com  Mon Mar 23 22:12:39 2015
From: zhigang.gong at linux.intel.com (Zhigang Gong)
Date: Tue, 24 Mar 2015 13:12:39 +0800
Subject: [Beignet] FindLLVM: allow LLVM/Clang 3.6
In-Reply-To: <550D1F5C.7030501@zoho.com>
References: <550D1F5C.7030501@zoho.com>
Message-ID: <20150324051238.GR21732@ivb-gt2-rev4>

LGTM, will push later, thanks.

On Sat, Mar 21, 2015 at 07:35:56AM +0000, Rebecca N. Palmer wrote:
> As beignet now works with LLVM/Clang 3.6, accept this version
> when searching for llvm-config.
> 
> Signed-off-by: Rebecca Palmer <rebecca_palmer at zoho.com>
> 
> diff --git a/CMake/FindLLVM.cmake b/CMake/FindLLVM.cmake
> index e214437..fa13f1d 100644
> --- a/CMake/FindLLVM.cmake
> +++ b/CMake/FindLLVM.cmake
> @@ -8,12 +8,12 @@
>  # LLVM_FOUND       - True if llvm found.
>  if (LLVM_INSTALL_DIR)
>    find_program(LLVM_CONFIG_EXECUTABLE
> -               NAMES llvm-config-35 llvm-config-3.5 llvm-config-33 llvm-config-3.3 llvm-config-34 llvm-config-3.4 llvm-config
> +               NAMES llvm-config-35 llvm-config-3.5 llvm-config-36 llvm-config-3.6 llvm-config-33 llvm-config-3.3 llvm-config-34 llvm-config-3.4 llvm-config
>                 DOC "llvm-config executable"
>                 PATHS ${LLVM_INSTALL_DIR} NO_DEFAULT_PATH)
>  else (LLVM_INSTALL_DIR)
>    find_program(LLVM_CONFIG_EXECUTABLE
> -               NAMES llvm-config-35 llvm-config-3.5 llvm-config-33 llvm-config-3.3 llvm-config-34 llvm-config-3.4 llvm-config
> +               NAMES llvm-config-35 llvm-config-3.5 llvm-config-36 llvm-config-3.6 llvm-config-33 llvm-config-3.3 llvm-config-34 llvm-config-3.4 llvm-config
>                 DOC "llvm-config executable")
>  endif (LLVM_INSTALL_DIR)
> 
> _______________________________________________
> Beignet mailing list
> Beignet at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/beignet

From zhigang.gong at linux.intel.com  Mon Mar 23 22:13:49 2015
From: zhigang.gong at linux.intel.com (Zhigang Gong)
Date: Tue, 24 Mar 2015 13:13:49 +0800
Subject: [Beignet] [PATCH] Don't crash if device inaccessible
In-Reply-To: <550D1F9C.3050209@zoho.com>
References: <550D1F9C.3050209@zoho.com>
Message-ID: <20150324051348.GS21732@ivb-gt2-rev4>

LGTM, will push later, thanks.

On Sat, Mar 21, 2015 at 07:37:00AM +0000, Rebecca N. Palmer wrote:
> If /dev/dri/cardX is inaccessible, return CL_DEVICE_NOT_FOUND,
> don't assert-fail.
> 
> Signed-off-by: Rebecca Palmer <rebecca_palmer at zoho.com>
> 
> diff --git a/src/x11/dricommon.c b/src/x11/dricommon.c
> index 03f542c..16f50e4 100644
> --- a/src/x11/dricommon.c
> +++ b/src/x11/dricommon.c
> @@ -284,7 +284,6 @@ getDRI2State(Display* dpy, int screen, char **driver_name)
>      goto err_out;
>    fd = open(device_name, O_RDWR);
> -  assert(fd >= 0);
>    if (fd < 0)
>      goto err_out;
> 
> _______________________________________________
> Beignet mailing list
> Beignet at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/beignet

From junyan.he at inbox.com  Mon Mar 23 23:40:01 2015
From: junyan.he at inbox.com (junyan.he at inbox.com)
Date: Tue, 24 Mar 2015 14:40:01 +0800
Subject: [Beignet] [PATCH OpenCL 2.0] Backend: Update the workgroup
	instructions for llvm backend to gen.
Message-ID: <1427179201-4467-1-git-send-email-junyan.he@inbox.com>

From: Junyan He <junyan.he at linux.intel.com>

Signed-off-by: Junyan He <junyan.he at linux.intel.com>
---
 backend/src/llvm/llvm_gen_ocl_function.hxx |   87 ++++++++++++++++++++++++++++
 1 file changed, 87 insertions(+)

diff --git a/backend/src/llvm/llvm_gen_ocl_function.hxx b/backend/src/llvm/llvm_gen_ocl_function.hxx
index 9536a3c..947fadc 100644
--- a/backend/src/llvm/llvm_gen_ocl_function.hxx
+++ b/backend/src/llvm/llvm_gen_ocl_function.hxx
@@ -160,3 +160,90 @@ DECL_LLVM_GEN_FUNCTION(REGION, __gen_ocl_region)
 
 // printf function
 DECL_LLVM_GEN_FUNCTION(PRINTF, __gen_ocl_printf)
+
+// work group function
+DECL_LLVM_GEN_FUNCTION(WORK_GROUP_BROADCAST_INT_1D, _Z30__gen_ocl_work_group_broadcastij)
+DECL_LLVM_GEN_FUNCTION(WORK_GROUP_BROADCAST_INT_2D, _Z30__gen_ocl_work_group_broadcastijj)
+DECL_LLVM_GEN_FUNCTION(WORK_GROUP_BROADCAST_INT_3D, _Z30__gen_ocl_work_group_broadcastijjj)
+DECL_LLVM_GEN_FUNCTION(WORK_GROUP_BROADCAST_UINT_1D, _Z30__gen_ocl_work_group_broadcastjj)
+DECL_LLVM_GEN_FUNCTION(WORK_GROUP_BROADCAST_UINT_2D, _Z30__gen_ocl_work_group_broadcastjjj)
+DECL_LLVM_GEN_FUNCTION(WORK_GROUP_BROADCAST_UINT_3D, _Z30__gen_ocl_work_group_broadcastjjjj)
+DECL_LLVM_GEN_FUNCTION(WORK_GROUP_BROADCAST_LONG_1D, _Z30__gen_ocl_work_group_broadcastlj)
+DECL_LLVM_GEN_FUNCTION(WORK_GROUP_BROADCAST_LONG_2D, _Z30__gen_ocl_work_group_broadcastljj)
+DECL_LLVM_GEN_FUNCTION(WORK_GROUP_BROADCAST_lONG_3D, _Z30__gen_ocl_work_group_broadcastljjj)
+DECL_LLVM_GEN_FUNCTION(WORK_GROUP_BROADCAST_ULONG_1D, _Z30__gen_ocl_work_group_broadcastmj)
+DECL_LLVM_GEN_FUNCTION(WORK_GROUP_BROADCAST_ULONG_2D, _Z30__gen_ocl_work_group_broadcastmjj)
+DECL_LLVM_GEN_FUNCTION(WORK_GROUP_BROADCAST_ULONG_3D, _Z30__gen_ocl_work_group_broadcastmjjj)
+DECL_LLVM_GEN_FUNCTION(WORK_GROUP_BROADCAST_FLOAT_1D, _Z30__gen_ocl_work_group_broadcastfj)
+DECL_LLVM_GEN_FUNCTION(WORK_GROUP_BROADCAST_FLOAT_2D, _Z30__gen_ocl_work_group_broadcastfjj)
+DECL_LLVM_GEN_FUNCTION(WORK_GROUP_BROADCAST_FLOAT_3D, _Z30__gen_ocl_work_group_broadcastfjjj)
+DECL_LLVM_GEN_FUNCTION(WORK_GROUP_BROADCAST_DOUBLE_1D, _Z30__gen_ocl_work_group_broadcastdj)
+DECL_LLVM_GEN_FUNCTION(WORK_GROUP_BROADCAST_DOUBLE_2D, _Z30__gen_ocl_work_group_broadcastdjj)
+DECL_LLVM_GEN_FUNCTION(WORK_GROUP_BROADCAST_DOUBLE_3D, _Z30__gen_ocl_work_group_broadcastdjjj)
+
+// work group reduce
+DECL_LLVM_GEN_FUNCTION(WORK_GROUP_REDUCE_ADD_DOUBLE, _Z31__gen_ocl_work_group_reduce_addd)
+DECL_LLVM_GEN_FUNCTION(WORK_GROUP_REDUCE_ADD_FLOAT, _Z31__gen_ocl_work_group_reduce_addf)
+DECL_LLVM_GEN_FUNCTION(WORK_GROUP_REDUCE_ADD_INT, _Z31__gen_ocl_work_group_reduce_addi)
+DECL_LLVM_GEN_FUNCTION(WORK_GROUP_REDUCE_ADD_UINT, _Z31__gen_ocl_work_group_reduce_addj)
+DECL_LLVM_GEN_FUNCTION(WORK_GROUP_REDUCE_ADD_LONG, _Z31__gen_ocl_work_group_reduce_addl)
+DECL_LLVM_GEN_FUNCTION(WORK_GROUP_REDUCE_ADD_ULONG, _Z31__gen_ocl_work_group_reduce_addm)
+DECL_LLVM_GEN_FUNCTION(WORK_GROUP_REDUCE_MAXD, _Z31__gen_ocl_work_group_reduce_maxd)
+DECL_LLVM_GEN_FUNCTION(WORK_GROUP_REDUCE_MAXF, _Z31__gen_ocl_work_group_reduce_maxf)
+DECL_LLVM_GEN_FUNCTION(WORK_GROUP_REDUCE_MAXI, _Z31__gen_ocl_work_group_reduce_maxi)
+DECL_LLVM_GEN_FUNCTION(WORK_GROUP_REDUCE_MAXJ, _Z31__gen_ocl_work_group_reduce_maxj)
+DECL_LLVM_GEN_FUNCTION(WORK_GROUP_REDUCE_MAXL, _Z31__gen_ocl_work_group_reduce_maxl)
+DECL_LLVM_GEN_FUNCTION(WORK_GROUP_REDUCE_MAXM, _Z31__gen_ocl_work_group_reduce_maxm)
+DECL_LLVM_GEN_FUNCTION(WORK_GROUP_REDUCE_MIND, _Z31__gen_ocl_work_group_reduce_mind)
+DECL_LLVM_GEN_FUNCTION(WORK_GROUP_REDUCE_MINF, _Z31__gen_ocl_work_group_reduce_minf)
+DECL_LLVM_GEN_FUNCTION(WORK_GROUP_REDUCE_MINI, _Z31__gen_ocl_work_group_reduce_mini)
+DECL_LLVM_GEN_FUNCTION(WORK_GROUP_REDUCE_MINJ, _Z31__gen_ocl_work_group_reduce_minj)
+DECL_LLVM_GEN_FUNCTION(WORK_GROUP_REDUCE_MINL, _Z31__gen_ocl_work_group_reduce_minl)
+DECL_LLVM_GEN_FUNCTION(WORK_GROUP_REDUCE_MINM, _Z31__gen_ocl_work_group_reduce_minm)
+
+// work group scan_exclusive
+DECL_LLVM_GEN_FUNCTION(WORK_GROUP_SCAN_EXCLUSIVE_ADD_DOUBLE, _Z39__gen_ocl_work_group_scan_exclusive_addd)
+DECL_LLVM_GEN_FUNCTION(WORK_GROUP_SCAN_EXCLUSIVE_ADD_FLOAT, _Z39__gen_ocl_work_group_scan_exclusive_addf)
+DECL_LLVM_GEN_FUNCTION(WORK_GROUP_SCAN_EXCLUSIVE_ADD_INT, _Z39__gen_ocl_work_group_scan_exclusive_addi)
+DECL_LLVM_GEN_FUNCTION(WORK_GROUP_SCAN_EXCLUSIVE_ADD_UINT, _Z39__gen_ocl_work_group_scan_exclusive_addj)
+DECL_LLVM_GEN_FUNCTION(WORK_GROUP_SCAN_EXCLUSIVE_ADD_LONG, _Z39__gen_ocl_work_group_scan_exclusive_addl)
+DECL_LLVM_GEN_FUNCTION(WORK_GROUP_SCAN_EXCLUSIVE_ADD_ULONG, _Z39__gen_ocl_work_group_scan_exclusive_addm)
+
+DECL_LLVM_GEN_FUNCTION(WORK_GROUP_SCAN_EXCLUSIVE_MAX_DOUBLE, _Z39__gen_ocl_work_group_scan_exclusive_maxd)
+DECL_LLVM_GEN_FUNCTION(WORK_GROUP_SCAN_EXCLUSIVE_MAX_FLOAT, _Z39__gen_ocl_work_group_scan_exclusive_maxf)
+DECL_LLVM_GEN_FUNCTION(WORK_GROUP_SCAN_EXCLUSIVE_MAX_INT, _Z39__gen_ocl_work_group_scan_exclusive_maxi)
+DECL_LLVM_GEN_FUNCTION(WORK_GROUP_SCAN_EXCLUSIVE_MAX_UINT, _Z39__gen_ocl_work_group_scan_exclusive_maxj)
+DECL_LLVM_GEN_FUNCTION(WORK_GROUP_SCAN_EXCLUSIVE_MAX_LONG, _Z39__gen_ocl_work_group_scan_exclusive_maxl)
+DECL_LLVM_GEN_FUNCTION(WORK_GROUP_SCAN_EXCLUSIVE_MAXM_ULONG, _Z39__gen_ocl_work_group_scan_exclusive_maxm)
+
+DECL_LLVM_GEN_FUNCTION(WORK_GROUP_SCAN_EXCLUSIVE_MIN_DOUBLE, _Z39__gen_ocl_work_group_scan_exclusive_mind)
+DECL_LLVM_GEN_FUNCTION(WORK_GROUP_SCAN_EXCLUSIVE_MIN_FLOAT, _Z39__gen_ocl_work_group_scan_exclusive_minf)
+DECL_LLVM_GEN_FUNCTION(WORK_GROUP_SCAN_EXCLUSIVE_MIN_INT, _Z39__gen_ocl_work_group_scan_exclusive_mini)
+DECL_LLVM_GEN_FUNCTION(WORK_GROUP_SCAN_EXCLUSIVE_MIN_UINT, _Z39__gen_ocl_work_group_scan_exclusive_minj)
+DECL_LLVM_GEN_FUNCTION(WORK_GROUP_SCAN_EXCLUSIVE_MIN_LONG, _Z39__gen_ocl_work_group_scan_exclusive_minl)
+DECL_LLVM_GEN_FUNCTION(WORK_GROUP_SCAN_EXCLUSIVE_MIN_ULONG, _Z39__gen_ocl_work_group_scan_exclusive_minm)
+
+// work group scan_inclusive
+DECL_LLVM_GEN_FUNCTION(WORK_GROUP_SCAN_INCLUSIVE_ADD_DOUBLE, _Z39__gen_ocl_work_group_scan_inclusive_addd)
+DECL_LLVM_GEN_FUNCTION(WORK_GROUP_SCAN_INCLUSIVE_ADD_FLOAT, _Z39__gen_ocl_work_group_scan_inclusive_addf)
+DECL_LLVM_GEN_FUNCTION(WORK_GROUP_SCAN_INCLUSIVE_ADD_INT, _Z39__gen_ocl_work_group_scan_inclusive_addi)
+DECL_LLVM_GEN_FUNCTION(WORK_GROUP_SCAN_INCLUSIVE_ADD_UINT, _Z39__gen_ocl_work_group_scan_inclusive_addj)
+DECL_LLVM_GEN_FUNCTION(WORK_GROUP_SCAN_INCLUSIVE_ADD_LONG, _Z39__gen_ocl_work_group_scan_inclusive_addl)
+DECL_LLVM_GEN_FUNCTION(WORK_GROUP_SCAN_INCLUSIVE_ADD_ULONG, _Z39__gen_ocl_work_group_scan_inclusive_addm)
+
+DECL_LLVM_GEN_FUNCTION(WORK_GROUP_SCAN_INCLUSIVE_MAX_DOUBLE, _Z39__gen_ocl_work_group_scan_inclusive_maxd)
+DECL_LLVM_GEN_FUNCTION(WORK_GROUP_SCAN_INCLUSIVE_MAX_FLOAT, _Z39__gen_ocl_work_group_scan_inclusive_maxf)
+DECL_LLVM_GEN_FUNCTION(WORK_GROUP_SCAN_INCLUSIVE_MAX_INT, _Z39__gen_ocl_work_group_scan_inclusive_maxi)
+DECL_LLVM_GEN_FUNCTION(WORK_GROUP_SCAN_INCLUSIVE_MAX_UINT, _Z39__gen_ocl_work_group_scan_inclusive_maxj)
+DECL_LLVM_GEN_FUNCTION(WORK_GROUP_SCAN_INCLUSIVE_MAX_LONG, _Z39__gen_ocl_work_group_scan_inclusive_maxl)
+DECL_LLVM_GEN_FUNCTION(WORK_GROUP_SCAN_INCLUSIVE_MAX_ULONG, _Z39__gen_ocl_work_group_scan_inclusive_maxm)
+
+DECL_LLVM_GEN_FUNCTION(WORK_GROUP_SCAN_INCLUSIVE_MIN_DOUBLE, _Z39__gen_ocl_work_group_scan_inclusive_mind)
+DECL_LLVM_GEN_FUNCTION(WORK_GROUP_SCAN_INCLUSIVE_MIN_FLOAT, _Z39__gen_ocl_work_group_scan_inclusive_minf)
+DECL_LLVM_GEN_FUNCTION(WORK_GROUP_SCAN_INCLUSIVE_MIN_INT, _Z39__gen_ocl_work_group_scan_inclusive_mini)
+DECL_LLVM_GEN_FUNCTION(WORK_GROUP_SCAN_INCLUSIVE_MIN_UINT, _Z39__gen_ocl_work_group_scan_inclusive_minj)
+DECL_LLVM_GEN_FUNCTION(WORK_GROUP_SCAN_INCLUSIVE_MIN_LONG, _Z39__gen_ocl_work_group_scan_inclusive_minl)
+DECL_LLVM_GEN_FUNCTION(WORK_GROUP_SCAN_INCLUSIVE_MIN_ULONG, _Z39__gen_ocl_work_group_scan_inclusive_minm)
+
+DECL_LLVM_GEN_FUNCTION(WORK_GROUP_ALL, __gen_ocl_work_group_all)
+DECL_LLVM_GEN_FUNCTION(WORK_GROUP_ANY, __gen_ocl_work_group_any)
-- 
1.7.9.5


From rong.r.yang at intel.com  Tue Mar 24 00:28:24 2015
From: rong.r.yang at intel.com (Yang, Rong R)
Date: Tue, 24 Mar 2015 07:28:24 +0000
Subject: [Beignet] [PATCH] Fix: Event callback that were not executed
 when	command was already CL_COMPLETE + thread safety for callbacks
In-Reply-To: <1427175497-15874-1-git-send-email-david.couturier@polymtl.ca>
References: <david.couturier@polymtl.ca>
 <1427175497-15874-1-git-send-email-david.couturier@polymtl.ca>
Message-ID: <7597C9376C272A4AB2D29E91550B7B090141E8AD@shsmsx102.ccr.corp.intel.com>

Found one issue; see the inline comment below.
Also, beignet uses two spaces for indentation; can you convert the tabs to spaces? Thanks.

> -----Original Message-----
> From: Beignet [mailto:beignet-bounces at lists.freedesktop.org] On Behalf Of
> David Couturier
> Sent: Tuesday, March 24, 2015 13:38
> To: beignet at lists.freedesktop.org
> Cc: David Couturier
> Subject: [Beignet] [PATCH] Fix: Event callback that were not executed when
> command was already CL_COMPLETE + thread safety for callbacks
> 
> When trying to register a callback on the clEnqueueReadBuffer command,
> since it is processed synchroniously all the time, the command was marked
> CL_COMPLETE every time. If the event returned by clEnqueueReadBuffer
> was then used to register a callback function, the callback function did no
> check to execute it if nessary.
> 
> Modified the handling of the callback registration in cl_set_event_callback to
> only call the callback being created if it's status is already reached.
> 
> Added thread safety measures for pfn_notify calls since the status value can
> be changed while executing the callback.
> 
> Grouped the pfn_notify calls to a unified function cl_event_call_callback that
> handles thread safety: it queues callbacks in a node list while under the
> protection of pthread_mutex and then calls the callbacks outside of the
> pthread_mutex (this is required because the callback can deadlock if it calls a
> cl_api function that uses the mutex)
> 
> Signed-off-by: David Couturier <david.couturier at polymtl.ca>
> ---
>  src/cl_event.c | 77 ++++++++++++++++++++++++++++++++++++++++++---
> -------------
>  src/cl_event.h |  4 ++-
>  2 files changed, 59 insertions(+), 22 deletions(-)
> 
> diff --git a/src/cl_event.c b/src/cl_event.c index f70e531..eb5d54b 100644
> --- a/src/cl_event.c
> +++ b/src/cl_event.c
> @@ -119,16 +119,7 @@ void cl_event_delete(cl_event event)
>      event->queue->last_event = NULL;
> 
>    /* Call all user's callback if haven't execute */
> -  user_callback *cb = event->user_cb;
> -  while(event->user_cb) {
> -    cb = event->user_cb;
> -    if(cb->executed == CL_FALSE) {
> -      cb->executed = CL_TRUE;
> -      cb->pfn_notify(event, event->status, cb->user_data);
> -    }
> -    event->user_cb = cb->next;
> -    cl_free(cb);
> -  }
> +  cl_event_call_callback(event, CL_COMPLETE, CL_TRUE); // CL_COMPLETE
> + status will force all callbacks that are not executed to run
> 
>    /* delete gpgpu event object */
>    if(event->gpgpu_event)
> @@ -180,8 +171,22 @@ cl_int cl_event_set_callback(cl_event event ,
>    cb->status      = command_exec_callback_type;
>    cb->executed    = CL_FALSE;
> 
> -  cb->next        = event->user_cb;
> -  event->user_cb  = cb;
> +
> +  // It is possible that the event enqueued is already completed.
> +  // clEnqueueReadBuffer can be synchronous and when the callback
> +  // is registered after, it still needs to get executed.
> +  pthread_mutex_lock(&event->ctx->event_lock); // Thread safety
> +required: operations on the event->status can be made from many
> +different threads
> +  if(event->status <= command_exec_callback_type) {
> +	  /* Call user callback */
> +	  pthread_mutex_unlock(&event->ctx->event_lock); // pfn_notify
> can call clFunctions that use the event_lock and from here it's not required
> +	  cb->pfn_notify(event, event->status, cb->user_data);
> +	  cl_free(cb);
> +  } else {
> +	  // Enqueue to callback list
> +	  cb->next        = event->user_cb;
> +	  event->user_cb  = cb;
> +	  pthread_mutex_unlock(&event->ctx->event_lock);
> +  }
> 
>  exit:
>    return err;
> @@ -388,9 +393,46 @@ error:
>    goto exit;
>  }
> 
> +void cl_event_call_callback(cl_event event, cl_int status, cl_bool free_cb) {
> +	user_callback *user_cb = NULL;
> +	user_callback *queue_cb = NULL; // For thread safety, we create a
> queue that holds user_callback's pfn_notify contents
> +	user_callback *temp_cb = NULL;
> +	user_cb = event->user_cb;
> +	pthread_mutex_lock(&event->ctx->event_lock);
> +	while(user_cb) {
> +		if(user_cb->status >= status
> +				&& user_cb->executed == CL_FALSE) { //
> Added check to not execute a callback when it was already handled
> +			user_cb->executed = CL_TRUE;
> +			temp_cb = cl_malloc(sizeof(user_callback));
> +			if(!temp_cb) {
> +				break; // Out of memory
> +			}
> +			temp_cb->pfn_notify = user_cb->pfn_notify; //
> Minor struct copy to call ppfn_notify out of the pthread_mutex
> +			temp_cb->user_data = user_cb->user_data;
> +			if(free_cb) {
> +				cl_free(user_cb);
> +			}
> +			if(!queue_cb) {
> +				queue_cb = temp_cb;
> +				queue_cb->next = NULL;
> +			} else { // Enqueue
> +				temp_cb->next = queue_cb;
Should be temp_cb->next = queue_cb->next here.

> +				queue_cb->next = temp_cb;
> +			}
> +		}
> +		user_cb = user_cb->next;
> +	}
> +	pthread_mutex_unlock(&event->ctx->event_lock);
> +	// Calling the callbacks outside of the event_lock is required because
> the callback can call cl_api functions and get deadlocked
> +	while(queue_cb) { // For each callback queued, actually execute the
> callback
> +		queue_cb->pfn_notify(event, event->status, queue_cb-
> >user_data);
> +		temp_cb = queue_cb;
> +		queue_cb = queue_cb->next;
> +		cl_free(temp_cb);
> +	}
> +}
>  void cl_event_set_status(cl_event event, cl_int status)  {
> -  user_callback *user_cb;
>    cl_int ret, i;
>    cl_event evt;
> 
> @@ -437,14 +479,7 @@ void cl_event_set_status(cl_event event, cl_int
> status)
>    pthread_mutex_unlock(&event->ctx->event_lock);
> 
>    /* Call user callback */
> -  user_cb = event->user_cb;
> -  while(user_cb) {
> -    if(user_cb->status >= status) {
> -      user_cb->executed = CL_TRUE;
> -      user_cb->pfn_notify(event, event->status, user_cb->user_data);
> -    }
> -    user_cb = user_cb->next;
> -  }
> +  cl_event_call_callback(event, status, CL_FALSE);
> 
>    if(event->type == CL_COMMAND_USER) {
>      /* Check all defer enqueue */
> diff --git a/src/cl_event.h b/src/cl_event.h index 0730530..9bb2ac8 100644
> --- a/src/cl_event.h
> +++ b/src/cl_event.h
> @@ -78,8 +78,10 @@ cl_event cl_event_new(cl_context,
> cl_command_queue, cl_command_type, cl_bool);  void
> cl_event_delete(cl_event);
>  /* Add one more reference to this object */  void
> cl_event_add_ref(cl_event);
> -/* Rigister a user callback function for specific commond execution status */
> +/* Register a user callback function for specific commond execution
> +status */
>  cl_int cl_event_set_callback(cl_event, cl_int, EVENT_NOTIFY, void *);
> +/* Execute the event's callback if the event's status supersedes the
> +callback's status. Free the callback if specified */ void
> +cl_event_call_callback(cl_event event, cl_int status, cl_bool free_cb);
>  /* Check events wait list for enqueue commonds */  cl_int
> cl_event_check_waitlist(cl_uint, const cl_event *, cl_event *, cl_context);
>  /* Wait the all events in wait list complete */
> --
> 1.9.1
> 
> _______________________________________________
> Beignet mailing list
> Beignet at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/beignet

From rong.r.yang at intel.com  Tue Mar 24 00:39:03 2015
From: rong.r.yang at intel.com (Yang, Rong R)
Date: Tue, 24 Mar 2015 07:39:03 +0000
Subject: [Beignet] [PATCH OpenCL 2.0] Backend: Update the
	workgroup	instructions for llvm backend to gen.
In-Reply-To: <1427179201-4467-1-git-send-email-junyan.he@inbox.com>
References: <1427179201-4467-1-git-send-email-junyan.he@inbox.com>
Message-ID: <7597C9376C272A4AB2D29E91550B7B090141E8CE@shsmsx102.ccr.corp.intel.com>

Zhigang has added the function OCLIntrinsicMap.find to handle overloaded function names, so only one DECL_LLVM_GEN_FUNCTION is needed for each group of overloaded functions; the corresponding argument type is then obtained in GenWriter::emitCallInst. This reduces the number of DECL_LLVM_GEN_FUNCTION entries significantly. Can you also use this method?

> -----Original Message-----
> From: Beignet [mailto:beignet-bounces at lists.freedesktop.org] On Behalf Of
> junyan.he at inbox.com
> Sent: Tuesday, March 24, 2015 14:40
> To: beignet at lists.freedesktop.org
> Cc: Junyan He
> Subject: [Beignet] [PATCH OpenCL 2.0] Backend: Update the workgroup
> instructions for llvm backend to gen.
> 
> From: Junyan He <junyan.he at linux.intel.com>
> 
> Signed-off-by: Junyan He <junyan.he at linux.intel.com>
> ---
>  backend/src/llvm/llvm_gen_ocl_function.hxx |   87
> ++++++++++++++++++++++++++++
>  1 file changed, 87 insertions(+)
> 
> diff --git a/backend/src/llvm/llvm_gen_ocl_function.hxx
> b/backend/src/llvm/llvm_gen_ocl_function.hxx
> index 9536a3c..947fadc 100644
> --- a/backend/src/llvm/llvm_gen_ocl_function.hxx
> +++ b/backend/src/llvm/llvm_gen_ocl_function.hxx
> @@ -160,3 +160,90 @@ DECL_LLVM_GEN_FUNCTION(REGION,
> __gen_ocl_region)
> 
>  // printf function
>  DECL_LLVM_GEN_FUNCTION(PRINTF, __gen_ocl_printf)
> +
> +// work group function
> +DECL_LLVM_GEN_FUNCTION(WORK_GROUP_BROADCAST_INT_1D,
> +_Z30__gen_ocl_work_group_broadcastij)
> +DECL_LLVM_GEN_FUNCTION(WORK_GROUP_BROADCAST_INT_2D,
> +_Z30__gen_ocl_work_group_broadcastijj)
> +DECL_LLVM_GEN_FUNCTION(WORK_GROUP_BROADCAST_INT_3D,
> +_Z30__gen_ocl_work_group_broadcastijjj)
> +DECL_LLVM_GEN_FUNCTION(WORK_GROUP_BROADCAST_UINT_1D,
> +_Z30__gen_ocl_work_group_broadcastjj)
> +DECL_LLVM_GEN_FUNCTION(WORK_GROUP_BROADCAST_UINT_2D,
> +_Z30__gen_ocl_work_group_broadcastjjj)
> +DECL_LLVM_GEN_FUNCTION(WORK_GROUP_BROADCAST_UINT_3D,
> +_Z30__gen_ocl_work_group_broadcastjjjj)
> +DECL_LLVM_GEN_FUNCTION(WORK_GROUP_BROADCAST_LONG_1D,
> +_Z30__gen_ocl_work_group_broadcastlj)
> +DECL_LLVM_GEN_FUNCTION(WORK_GROUP_BROADCAST_LONG_2D,
> +_Z30__gen_ocl_work_group_broadcastljj)
> +DECL_LLVM_GEN_FUNCTION(WORK_GROUP_BROADCAST_lONG_3D,
> +_Z30__gen_ocl_work_group_broadcastljjj)
> +DECL_LLVM_GEN_FUNCTION(WORK_GROUP_BROADCAST_ULONG_1D,
> +_Z30__gen_ocl_work_group_broadcastmj)
> +DECL_LLVM_GEN_FUNCTION(WORK_GROUP_BROADCAST_ULONG_2D,
> +_Z30__gen_ocl_work_group_broadcastmjj)
> +DECL_LLVM_GEN_FUNCTION(WORK_GROUP_BROADCAST_ULONG_3D,
> +_Z30__gen_ocl_work_group_broadcastmjjj)
> +DECL_LLVM_GEN_FUNCTION(WORK_GROUP_BROADCAST_FLOAT_1D,
> +_Z30__gen_ocl_work_group_broadcastfj)
> +DECL_LLVM_GEN_FUNCTION(WORK_GROUP_BROADCAST_FLOAT_2D,
> +_Z30__gen_ocl_work_group_broadcastfjj)
> +DECL_LLVM_GEN_FUNCTION(WORK_GROUP_BROADCAST_FLOAT_3D,
> +_Z30__gen_ocl_work_group_broadcastfjjj)
> +DECL_LLVM_GEN_FUNCTION(WORK_GROUP_BROADCAST_DOUBLE_1D,
> +_Z30__gen_ocl_work_group_broadcastdj)
> +DECL_LLVM_GEN_FUNCTION(WORK_GROUP_BROADCAST_DOUBLE_2D,
> +_Z30__gen_ocl_work_group_broadcastdjj)
> +DECL_LLVM_GEN_FUNCTION(WORK_GROUP_BROADCAST_DOUBLE_3D,
> +_Z30__gen_ocl_work_group_broadcastdjjj)
> +
> +// work group reduce
> +DECL_LLVM_GEN_FUNCTION(WORK_GROUP_REDUCE_ADD_DOUBLE,
> +_Z31__gen_ocl_work_group_reduce_addd)
> +DECL_LLVM_GEN_FUNCTION(WORK_GROUP_REDUCE_ADD_FLOAT,
> +_Z31__gen_ocl_work_group_reduce_addf)
> +DECL_LLVM_GEN_FUNCTION(WORK_GROUP_REDUCE_ADD_INT,
> +_Z31__gen_ocl_work_group_reduce_addi)
> +DECL_LLVM_GEN_FUNCTION(WORK_GROUP_REDUCE_ADD_UINT,
> +_Z31__gen_ocl_work_group_reduce_addj)
> +DECL_LLVM_GEN_FUNCTION(WORK_GROUP_REDUCE_ADD_LONG,
> +_Z31__gen_ocl_work_group_reduce_addl)
> +DECL_LLVM_GEN_FUNCTION(WORK_GROUP_REDUCE_ADD_ULONG,
> +_Z31__gen_ocl_work_group_reduce_addm)
> +DECL_LLVM_GEN_FUNCTION(WORK_GROUP_REDUCE_MAXD,
> +_Z31__gen_ocl_work_group_reduce_maxd)
> +DECL_LLVM_GEN_FUNCTION(WORK_GROUP_REDUCE_MAXF,
> +_Z31__gen_ocl_work_group_reduce_maxf)
> +DECL_LLVM_GEN_FUNCTION(WORK_GROUP_REDUCE_MAXI,
> +_Z31__gen_ocl_work_group_reduce_maxi)
> +DECL_LLVM_GEN_FUNCTION(WORK_GROUP_REDUCE_MAXJ,
> +_Z31__gen_ocl_work_group_reduce_maxj)
> +DECL_LLVM_GEN_FUNCTION(WORK_GROUP_REDUCE_MAXL,
> +_Z31__gen_ocl_work_group_reduce_maxl)
> +DECL_LLVM_GEN_FUNCTION(WORK_GROUP_REDUCE_MAXM,
> +_Z31__gen_ocl_work_group_reduce_maxm)
> +DECL_LLVM_GEN_FUNCTION(WORK_GROUP_REDUCE_MIND,
> +_Z31__gen_ocl_work_group_reduce_mind)
> +DECL_LLVM_GEN_FUNCTION(WORK_GROUP_REDUCE_MINF,
> +_Z31__gen_ocl_work_group_reduce_minf)
> +DECL_LLVM_GEN_FUNCTION(WORK_GROUP_REDUCE_MINI,
> +_Z31__gen_ocl_work_group_reduce_mini)
> +DECL_LLVM_GEN_FUNCTION(WORK_GROUP_REDUCE_MINJ,
> +_Z31__gen_ocl_work_group_reduce_minj)
> +DECL_LLVM_GEN_FUNCTION(WORK_GROUP_REDUCE_MINL,
> +_Z31__gen_ocl_work_group_reduce_minl)
> +DECL_LLVM_GEN_FUNCTION(WORK_GROUP_REDUCE_MINM,
> +_Z31__gen_ocl_work_group_reduce_minm)
> +
> +// work group scan_exclusive
> +DECL_LLVM_GEN_FUNCTION(WORK_GROUP_SCAN_EXCLUSIVE_ADD_DO
> UBLE,
> +_Z39__gen_ocl_work_group_scan_exclusive_addd)
> +DECL_LLVM_GEN_FUNCTION(WORK_GROUP_SCAN_EXCLUSIVE_ADD_FLO
> AT,
> +_Z39__gen_ocl_work_group_scan_exclusive_addf)
> +DECL_LLVM_GEN_FUNCTION(WORK_GROUP_SCAN_EXCLUSIVE_ADD_INT,
> +_Z39__gen_ocl_work_group_scan_exclusive_addi)
> +DECL_LLVM_GEN_FUNCTION(WORK_GROUP_SCAN_EXCLUSIVE_ADD_UIN
> T,
> +_Z39__gen_ocl_work_group_scan_exclusive_addj)
> +DECL_LLVM_GEN_FUNCTION(WORK_GROUP_SCAN_EXCLUSIVE_ADD_LO
> NG,
> +_Z39__gen_ocl_work_group_scan_exclusive_addl)
> +DECL_LLVM_GEN_FUNCTION(WORK_GROUP_SCAN_EXCLUSIVE_ADD_UL
> ONG,
> +_Z39__gen_ocl_work_group_scan_exclusive_addm)
> +
> +DECL_LLVM_GEN_FUNCTION(WORK_GROUP_SCAN_EXCLUSIVE_MAX_DO
> UBLE,
> +_Z39__gen_ocl_work_group_scan_exclusive_maxd)
> +DECL_LLVM_GEN_FUNCTION(WORK_GROUP_SCAN_EXCLUSIVE_MAX_FL
> OAT,
> +_Z39__gen_ocl_work_group_scan_exclusive_maxf)
> +DECL_LLVM_GEN_FUNCTION(WORK_GROUP_SCAN_EXCLUSIVE_MAX_INT,
> +_Z39__gen_ocl_work_group_scan_exclusive_maxi)
> +DECL_LLVM_GEN_FUNCTION(WORK_GROUP_SCAN_EXCLUSIVE_MAX_UI
> NT,
> +_Z39__gen_ocl_work_group_scan_exclusive_maxj)
> +DECL_LLVM_GEN_FUNCTION(WORK_GROUP_SCAN_EXCLUSIVE_MAX_LO
> NG,
> +_Z39__gen_ocl_work_group_scan_exclusive_maxl)
> +DECL_LLVM_GEN_FUNCTION(WORK_GROUP_SCAN_EXCLUSIVE_MAXM_
> ULONG,
> +_Z39__gen_ocl_work_group_scan_exclusive_maxm)
> +
> +DECL_LLVM_GEN_FUNCTION(WORK_GROUP_SCAN_EXCLUSIVE_MIN_DO
> UBLE,
> +_Z39__gen_ocl_work_group_scan_exclusive_mind)
> +DECL_LLVM_GEN_FUNCTION(WORK_GROUP_SCAN_EXCLUSIVE_MIN_FLO
> AT,
> +_Z39__gen_ocl_work_group_scan_exclusive_minf)
> +DECL_LLVM_GEN_FUNCTION(WORK_GROUP_SCAN_EXCLUSIVE_MIN_INT,
> +_Z39__gen_ocl_work_group_scan_exclusive_mini)
> +DECL_LLVM_GEN_FUNCTION(WORK_GROUP_SCAN_EXCLUSIVE_MIN_UIN
> T,
> +_Z39__gen_ocl_work_group_scan_exclusive_minj)
> +DECL_LLVM_GEN_FUNCTION(WORK_GROUP_SCAN_EXCLUSIVE_MIN_LO
> NG,
> +_Z39__gen_ocl_work_group_scan_exclusive_minl)
> +DECL_LLVM_GEN_FUNCTION(WORK_GROUP_SCAN_EXCLUSIVE_MIN_UL
> ONG,
> +_Z39__gen_ocl_work_group_scan_exclusive_minm)
> +
> +// work group scan_inclusive
> +DECL_LLVM_GEN_FUNCTION(WORK_GROUP_SCAN_INCLUSIVE_ADD_DO
> UBLE,
> +_Z39__gen_ocl_work_group_scan_inclusive_addd)
> +DECL_LLVM_GEN_FUNCTION(WORK_GROUP_SCAN_INCLUSIVE_ADD_FLO
> AT,
> +_Z39__gen_ocl_work_group_scan_inclusive_addf)
> +DECL_LLVM_GEN_FUNCTION(WORK_GROUP_SCAN_INCLUSIVE_ADD_INT,
> +_Z39__gen_ocl_work_group_scan_inclusive_addi)
> +DECL_LLVM_GEN_FUNCTION(WORK_GROUP_SCAN_INCLUSIVE_ADD_UIN
> T,
> +_Z39__gen_ocl_work_group_scan_inclusive_addj)
> +DECL_LLVM_GEN_FUNCTION(WORK_GROUP_SCAN_INCLUSIVE_ADD_LO
> NG,
> +_Z39__gen_ocl_work_group_scan_inclusive_addl)
> +DECL_LLVM_GEN_FUNCTION(WORK_GROUP_SCAN_INCLUSIVE_ADD_UL
> ONG,
> +_Z39__gen_ocl_work_group_scan_inclusive_addm)
> +
> +DECL_LLVM_GEN_FUNCTION(WORK_GROUP_SCAN_INCLUSIVE_MAX_DO
> UBLE,
> +_Z39__gen_ocl_work_group_scan_inclusive_maxd)
> +DECL_LLVM_GEN_FUNCTION(WORK_GROUP_SCAN_INCLUSIVE_MAX_FL
> OAT,
> +_Z39__gen_ocl_work_group_scan_inclusive_maxf)
> +DECL_LLVM_GEN_FUNCTION(WORK_GROUP_SCAN_INCLUSIVE_MAX_INT,
> +_Z39__gen_ocl_work_group_scan_inclusive_maxi)
> +DECL_LLVM_GEN_FUNCTION(WORK_GROUP_SCAN_INCLUSIVE_MAX_UI
> NT,
> +_Z39__gen_ocl_work_group_scan_inclusive_maxj)
> +DECL_LLVM_GEN_FUNCTION(WORK_GROUP_SCAN_INCLUSIVE_MAX_LO
> NG,
> +_Z39__gen_ocl_work_group_scan_inclusive_maxl)
> +DECL_LLVM_GEN_FUNCTION(WORK_GROUP_SCAN_INCLUSIVE_MAX_UL
> ONG,
> +_Z39__gen_ocl_work_group_scan_inclusive_maxm)
> +
> +DECL_LLVM_GEN_FUNCTION(WORK_GROUP_SCAN_INCLUSIVE_MIN_DO
> UBLE,
> +_Z39__gen_ocl_work_group_scan_inclusive_mind)
> +DECL_LLVM_GEN_FUNCTION(WORK_GROUP_SCAN_INCLUSIVE_MIN_FLO
> AT,
> +_Z39__gen_ocl_work_group_scan_inclusive_minf)
> +DECL_LLVM_GEN_FUNCTION(WORK_GROUP_SCAN_INCLUSIVE_MIN_INT,
> +_Z39__gen_ocl_work_group_scan_inclusive_mini)
> +DECL_LLVM_GEN_FUNCTION(WORK_GROUP_SCAN_INCLUSIVE_MIN_UIN
> T,
> +_Z39__gen_ocl_work_group_scan_inclusive_minj)
> +DECL_LLVM_GEN_FUNCTION(WORK_GROUP_SCAN_INCLUSIVE_MIN_LO
> NG,
> +_Z39__gen_ocl_work_group_scan_inclusive_minl)
> +DECL_LLVM_GEN_FUNCTION(WORK_GROUP_SCAN_INCLUSIVE_MIN_UL
> ONG,
> +_Z39__gen_ocl_work_group_scan_inclusive_minm)
> +
> +DECL_LLVM_GEN_FUNCTION(WORK_GROUP_ALL,
> __gen_ocl_work_group_all)
> +DECL_LLVM_GEN_FUNCTION(WORK_GROUP_ANY,
> __gen_ocl_work_group_any)
> --
> 1.7.9.5
> 
> _______________________________________________
> Beignet mailing list
> Beignet at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/beignet

From david.couturier at polymtl.ca  Tue Mar 24 10:22:12 2015
From: david.couturier at polymtl.ca (David Couturier)
Date: Tue, 24 Mar 2015 13:22:12 -0400
Subject: [Beignet] [PATCH] Fix: (v3) Event callback that were not executed
	when command was already CL_COMPLETE + thread safety for callbacks
In-Reply-To: <david.couturier@polymtl.ca>
References: <david.couturier@polymtl.ca>
Message-ID: <1427217732-22745-1-git-send-email-david.couturier@polymtl.ca>

When trying to register a callback on the clEnqueueReadBuffer command, since it is always processed
synchronously, the command was marked CL_COMPLETE every time. If the event returned
by clEnqueueReadBuffer was then used to register a callback function, no check was made
to execute the callback if necessary.

Modified the handling of the callback registration in cl_event_set_callback to call the callback being registered immediately, but only if its status has already been reached.

Added thread safety measures for pfn_notify calls since the status value can be changed while executing the callback.

Grouped the pfn_notify calls to a unified function cl_event_call_callback that handles thread safety: it queues callbacks in a node list while under the protection of pthread_mutex and then calls the callbacks outside of the pthread_mutex (this is required because the callback can deadlock if it calls a cl_api function that uses the mutex)

Signed-off-by: David Couturier <david.couturier at polymtl.ca>
---
 src/cl_event.c | 79 ++++++++++++++++++++++++++++++++++++++++++----------------
 src/cl_event.h |  4 ++-
 2 files changed, 61 insertions(+), 22 deletions(-)

diff --git a/src/cl_event.c b/src/cl_event.c
index f70e531..bba14ba 100644
--- a/src/cl_event.c
+++ b/src/cl_event.c
@@ -119,16 +119,7 @@ void cl_event_delete(cl_event event)
     event->queue->last_event = NULL;
 
   /* Call all user's callback if haven't execute */
-  user_callback *cb = event->user_cb;
-  while(event->user_cb) {
-    cb = event->user_cb;
-    if(cb->executed == CL_FALSE) {
-      cb->executed = CL_TRUE;
-      cb->pfn_notify(event, event->status, cb->user_data);
-    }
-    event->user_cb = cb->next;
-    cl_free(cb);
-  }
+  cl_event_call_callback(event, CL_COMPLETE, CL_TRUE); // CL_COMPLETE status will force all callbacks that are not executed to run
 
   /* delete gpgpu event object */
   if(event->gpgpu_event)
@@ -180,8 +171,22 @@ cl_int cl_event_set_callback(cl_event event ,
   cb->status      = command_exec_callback_type;
   cb->executed    = CL_FALSE;
 
-  cb->next        = event->user_cb;
-  event->user_cb  = cb;
+
+  // It is possible that the event enqueued is already completed.
+  // clEnqueueReadBuffer can be synchronous and when the callback
+  // is registered after, it still needs to get executed.
+  pthread_mutex_lock(&event->ctx->event_lock); // Thread safety required: operations on the event->status can be made from many different threads
+  if(event->status <= command_exec_callback_type) {
+    /* Call user callback */
+    pthread_mutex_unlock(&event->ctx->event_lock); // pfn_notify can call clFunctions that use the event_lock and from here it's not required
+    cb->pfn_notify(event, event->status, cb->user_data);
+    cl_free(cb);
+  } else {
+    // Enqueue to callback list
+    cb->next        = event->user_cb;
+    event->user_cb  = cb;
+    pthread_mutex_unlock(&event->ctx->event_lock);
+  }
 
 exit:
   return err;
@@ -388,9 +393,48 @@ error:
   goto exit;
 }
 
+void cl_event_call_callback(cl_event event, cl_int status, cl_bool free_cb) {
+  user_callback *user_cb = NULL;
+  user_callback *queue_cb = NULL; // For thread safety, we create a queue that holds user_callback's pfn_notify contents
+  user_callback *temp_cb = NULL;
+  user_cb = event->user_cb;
+  pthread_mutex_lock(&event->ctx->event_lock);
+  while(user_cb) {
+    if(user_cb->status >= status
+        && user_cb->executed == CL_FALSE) { // Added check to not execute a callback when it was already handled
+      user_cb->executed = CL_TRUE;
+      temp_cb = cl_malloc(sizeof(user_callback));
+      if(!temp_cb) {
+        break; // Out of memory
+      }
+      temp_cb->pfn_notify = user_cb->pfn_notify; // Minor struct copy to call ppfn_notify out of the pthread_mutex
+      temp_cb->user_data = user_cb->user_data;
+      if(free_cb) {
+        cl_free(user_cb);
+      }
+      if(!queue_cb) {
+        queue_cb = temp_cb;
+        queue_cb->next = NULL;
+      } else { // Enqueue First
+        temp_cb->next = queue_cb;
+        queue_cb = temp_cb;
+      }
+    }
+    user_cb = user_cb->next;
+  }
+  pthread_mutex_unlock(&event->ctx->event_lock);
+
+  // Calling the callbacks outside of the event_lock is required because the callback can call cl_api functions and get deadlocked
+  while(queue_cb) { // For each callback queued, actually execute the callback
+    queue_cb->pfn_notify(event, event->status, queue_cb->user_data);
+    temp_cb = queue_cb;
+    queue_cb = queue_cb->next;
+    cl_free(temp_cb);
+  }
+}
+
 void cl_event_set_status(cl_event event, cl_int status)
 {
-  user_callback *user_cb;
   cl_int ret, i;
   cl_event evt;
 
@@ -437,14 +481,7 @@ void cl_event_set_status(cl_event event, cl_int status)
   pthread_mutex_unlock(&event->ctx->event_lock);
 
   /* Call user callback */
-  user_cb = event->user_cb;
-  while(user_cb) {
-    if(user_cb->status >= status) {
-      user_cb->executed = CL_TRUE;
-      user_cb->pfn_notify(event, event->status, user_cb->user_data);
-    }
-    user_cb = user_cb->next;
-  }
+  cl_event_call_callback(event, status, CL_FALSE);
 
   if(event->type == CL_COMMAND_USER) {
     /* Check all defer enqueue */
diff --git a/src/cl_event.h b/src/cl_event.h
index 0730530..9bb2ac8 100644
--- a/src/cl_event.h
+++ b/src/cl_event.h
@@ -78,8 +78,10 @@ cl_event cl_event_new(cl_context, cl_command_queue, cl_command_type, cl_bool);
 void cl_event_delete(cl_event);
 /* Add one more reference to this object */
 void cl_event_add_ref(cl_event);
-/* Rigister a user callback function for specific commond execution status */
+/* Register a user callback function for specific commond execution status */
 cl_int cl_event_set_callback(cl_event, cl_int, EVENT_NOTIFY, void *);
+/* Execute the event's callback if the event's status supersedes the callback's status. Free the callback if specified */
+void cl_event_call_callback(cl_event event, cl_int status, cl_bool free_cb);
 /* Check events wait list for enqueue commonds */
 cl_int cl_event_check_waitlist(cl_uint, const cl_event *, cl_event *, cl_context);
 /* Wait the all events in wait list complete */
-- 
1.9.1


From rong.r.yang at intel.com  Wed Mar 25 04:49:46 2015
From: rong.r.yang at intel.com (Yang, Rong R)
Date: Wed, 25 Mar 2015 11:49:46 +0000
Subject: [Beignet] [PATCH] Fix: (v3) Event callback that were not
 executed	when command was already CL_COMPLETE + thread safety for callbacks
In-Reply-To: <1427217732-22745-1-git-send-email-david.couturier@polymtl.ca>
References: <david.couturier@polymtl.ca>
 <1427217732-22745-1-git-send-email-david.couturier@polymtl.ca>
Message-ID: <7597C9376C272A4AB2D29E91550B7B090141EABA@shsmsx102.ccr.corp.intel.com>

LGTM, thanks.

> -----Original Message-----
> From: Beignet [mailto:beignet-bounces at lists.freedesktop.org] On Behalf Of
> David Couturier
> Sent: Wednesday, March 25, 2015 01:22
> To: beignet at lists.freedesktop.org
> Cc: David Couturier
> Subject: [Beignet] [PATCH] Fix: (v3) Event callback that were not executed
> when command was already CL_COMPLETE + thread safety for callbacks
> 
> When trying to register a callback on the clEnqueueReadBuffer command,
> since it is processed synchronously all the time, the command was marked
> CL_COMPLETE every time. If the event returned by clEnqueueReadBuffer
> was then used to register a callback function, no check was made to
> execute the callback function if necessary.
> 
> Modified the handling of the callback registration in cl_event_set_callback to
> only call the callback being created if its status is already reached.
> 
> Added thread safety measures for pfn_notify calls since the status value can
> be changed while executing the callback.
> 
> Grouped the pfn_notify calls to a unified function cl_event_call_callback that
> handles thread safety: it queues callbacks in a node list while under the
> protection of pthread_mutex and then calls the callbacks outside of the
> pthread_mutex (this is required because the callback can deadlock if it calls a
> cl_api function that uses the mutex)
> 
> Signed-off-by: David Couturier <david.couturier at polymtl.ca>
> ---
>  src/cl_event.c | 79 ++++++++++++++++++++++++++++++++++++++++++---
> -------------
>  src/cl_event.h |  4 ++-
>  2 files changed, 61 insertions(+), 22 deletions(-)
> 
> diff --git a/src/cl_event.c b/src/cl_event.c index f70e531..bba14ba 100644
> --- a/src/cl_event.c
> +++ b/src/cl_event.c
> @@ -119,16 +119,7 @@ void cl_event_delete(cl_event event)
>      event->queue->last_event = NULL;
> 
>    /* Call all user's callback if haven't execute */
> -  user_callback *cb = event->user_cb;
> -  while(event->user_cb) {
> -    cb = event->user_cb;
> -    if(cb->executed == CL_FALSE) {
> -      cb->executed = CL_TRUE;
> -      cb->pfn_notify(event, event->status, cb->user_data);
> -    }
> -    event->user_cb = cb->next;
> -    cl_free(cb);
> -  }
> +  cl_event_call_callback(event, CL_COMPLETE, CL_TRUE); // CL_COMPLETE
> + status will force all callbacks that are not executed to run
> 
>    /* delete gpgpu event object */
>    if(event->gpgpu_event)
> @@ -180,8 +171,22 @@ cl_int cl_event_set_callback(cl_event event ,
>    cb->status      = command_exec_callback_type;
>    cb->executed    = CL_FALSE;
> 
> -  cb->next        = event->user_cb;
> -  event->user_cb  = cb;
> +
> +  // It is possible that the event enqueued is already completed.
> +  // clEnqueueReadBuffer can be synchronous and when the callback  //
> + is registered after, it still needs to get executed.
> +  pthread_mutex_lock(&event->ctx->event_lock); // Thread safety
> + required: operations on the event->status can be made from many
> different threads  if(event->status <= command_exec_callback_type) {
> +    /* Call user callback */
> +    pthread_mutex_unlock(&event->ctx->event_lock); // pfn_notify can call
> clFunctions that use the event_lock and from here it's not required
> +    cb->pfn_notify(event, event->status, cb->user_data);
> +    cl_free(cb);
> +  } else {
> +    // Enqueue to callback list
> +    cb->next        = event->user_cb;
> +    event->user_cb  = cb;
> +    pthread_mutex_unlock(&event->ctx->event_lock);
> +  }
> 
>  exit:
>    return err;
> @@ -388,9 +393,48 @@ error:
>    goto exit;
>  }
> 
> +void cl_event_call_callback(cl_event event, cl_int status, cl_bool
> +free_cb) {
> +  user_callback *user_cb = NULL;
> +  user_callback *queue_cb = NULL; // For thread safety, we create a
> +queue that holds user_callback's pfn_notify contents
> +  user_callback *temp_cb = NULL;
> +  user_cb = event->user_cb;
> +  pthread_mutex_lock(&event->ctx->event_lock);
> +  while(user_cb) {
> +    if(user_cb->status >= status
> +        && user_cb->executed == CL_FALSE) { // Added check to not execute a
> callback when it was already handled
> +      user_cb->executed = CL_TRUE;
> +      temp_cb = cl_malloc(sizeof(user_callback));
> +      if(!temp_cb) {
> +        break; // Out of memory
> +      }
> +      temp_cb->pfn_notify = user_cb->pfn_notify; // Minor struct copy to call
> ppfn_notify out of the pthread_mutex
> +      temp_cb->user_data = user_cb->user_data;
> +      if(free_cb) {
> +        cl_free(user_cb);
> +      }
> +      if(!queue_cb) {
> +        queue_cb = temp_cb;
> +        queue_cb->next = NULL;
> +      } else { // Enqueue First
> +        temp_cb->next = queue_cb;
> +        queue_cb = temp_cb;
> +      }
> +    }
> +    user_cb = user_cb->next;
> +  }
> +  pthread_mutex_unlock(&event->ctx->event_lock);
> +
> +  // Calling the callbacks outside of the event_lock is required
> +because the callback can call cl_api functions and get deadlocked
> +  while(queue_cb) { // For each callback queued, actually execute the
> callback
> +    queue_cb->pfn_notify(event, event->status, queue_cb->user_data);
> +    temp_cb = queue_cb;
> +    queue_cb = queue_cb->next;
> +    cl_free(temp_cb);
> +  }
> +}
> +
>  void cl_event_set_status(cl_event event, cl_int status)  {
> -  user_callback *user_cb;
>    cl_int ret, i;
>    cl_event evt;
> 
> @@ -437,14 +481,7 @@ void cl_event_set_status(cl_event event, cl_int
> status)
>    pthread_mutex_unlock(&event->ctx->event_lock);
> 
>    /* Call user callback */
> -  user_cb = event->user_cb;
> -  while(user_cb) {
> -    if(user_cb->status >= status) {
> -      user_cb->executed = CL_TRUE;
> -      user_cb->pfn_notify(event, event->status, user_cb->user_data);
> -    }
> -    user_cb = user_cb->next;
> -  }
> +  cl_event_call_callback(event, status, CL_FALSE);
> 
>    if(event->type == CL_COMMAND_USER) {
>      /* Check all defer enqueue */
> diff --git a/src/cl_event.h b/src/cl_event.h index 0730530..9bb2ac8 100644
> --- a/src/cl_event.h
> +++ b/src/cl_event.h
> @@ -78,8 +78,10 @@ cl_event cl_event_new(cl_context,
> cl_command_queue, cl_command_type, cl_bool);  void
> cl_event_delete(cl_event);
>  /* Add one more reference to this object */  void
> cl_event_add_ref(cl_event);
> -/* Rigister a user callback function for specific commond execution status */
> +/* Register a user callback function for specific commond execution
> +status */
>  cl_int cl_event_set_callback(cl_event, cl_int, EVENT_NOTIFY, void *);
> +/* Execute the event's callback if the event's status supersedes the
> +callback's status. Free the callback if specified */ void
> +cl_event_call_callback(cl_event event, cl_int status, cl_bool free_cb);
>  /* Check events wait list for enqueue commonds */  cl_int
> cl_event_check_waitlist(cl_uint, const cl_event *, cl_event *, cl_context);
>  /* Wait the all events in wait list complete */
> --
> 1.9.1
> 
> _______________________________________________
> Beignet mailing list
> Beignet at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/beignet

From yejun.guo at intel.com  Thu Mar 26 23:16:42 2015
From: yejun.guo at intel.com (Guo, Yejun)
Date: Fri, 27 Mar 2015 06:16:42 +0000
Subject: [Beignet] [PATCH 1/2] add 3 simd level built-in functions:
 shuffle, simdsize and simdid
In-Reply-To: <1426831053-8431-1-git-send-email-yejun.guo@intel.com>
References: <1426831053-8431-1-git-send-email-yejun.guo@intel.com>
Message-ID: <854E8DBA9F41904AB047E03BB6963AE501BF2463@SHSMSX101.ccr.corp.intel.com>

Ask for review, thanks.

yejun

-----Original Message-----
From: Guo, Yejun 
Sent: Friday, March 20, 2015 1:58 PM
To: beignet at lists.freedesktop.org
Cc: Guo, Yejun
Subject: [PATCH 1/2] add 3 simd level built-in functions: shuffle, simdsize and simdid

uint __gen_ocl_get_simd_size();
returns 8 if SIMD8, returns 16 if SIMD16

uint __gen_ocl_get_simd_id();
return value ranges from 0 to simdsize - 1

floatN __gen_ocl_simd_shuffle(floatN x, uint c);
intN   __gen_ocl_simd_shuffle(intN x, uint c);
uintN  __gen_ocl_simd_shuffle(uintN x, uint c);
The value of x in the c-th channel of the SIMD is returned for all SIMD channels; the behavior is undefined if c is larger than simdsize - 1.

Signed-off-by: Guo Yejun <yejun.guo at intel.com>
---
 backend/src/backend/gen8_context.cpp               |  29 ++++-
 backend/src/backend/gen_context.cpp                | 127 +++++++++++++++------
 backend/src/backend/gen_context.hpp                |   1 +
 .../src/backend/gen_insn_gen7_schedule_info.hxx    |   1 +
 backend/src/backend/gen_insn_selection.cpp         |  60 ++++++++++
 backend/src/backend/gen_insn_selection.hxx         |   2 +
 backend/src/backend/program.h                      |   1 +
 backend/src/ir/context.hpp                         |   6 +
 backend/src/ir/instruction.cpp                     |  32 ++++++
 backend/src/ir/instruction.hpp                     |  17 +++
 backend/src/ir/instruction.hxx                     |   3 +
 backend/src/ir/liveness.cpp                        |   5 +
 backend/src/ir/profile.cpp                         |   2 +
 backend/src/ir/profile.hpp                         |   5 +-
 backend/src/libocl/CMakeLists.txt                  |   2 +-
 backend/src/libocl/include/ocl.h                   |   1 +
 backend/src/libocl/include/ocl_misc.h              |   8 --
 backend/src/libocl/script/ocl_simd.def             |   4 +
 backend/src/libocl/tmpl/ocl_simd.tmpl.cl           |  19 +++
 backend/src/libocl/tmpl/ocl_simd.tmpl.h            |  34 ++++++
 backend/src/llvm/llvm_gen_backend.cpp              |  27 +++++
 backend/src/llvm/llvm_gen_ocl_function.hxx         |   4 +
 src/cl_command_queue_gen7.c                        |   8 ++
 23 files changed, 351 insertions(+), 47 deletions(-)
 create mode 100644 backend/src/libocl/script/ocl_simd.def
 create mode 100644 backend/src/libocl/tmpl/ocl_simd.tmpl.cl
 create mode 100644 backend/src/libocl/tmpl/ocl_simd.tmpl.h

diff --git a/backend/src/backend/gen8_context.cpp b/backend/src/backend/gen8_context.cpp
index 3f57cf6..144fd00 100644
--- a/backend/src/backend/gen8_context.cpp
+++ b/backend/src/backend/gen8_context.cpp
@@ -240,6 +240,9 @@ namespace gbe
   }
 
   void Gen8Context::emitBinaryInstruction(const SelectionInstruction &insn) {
+    const GenRegister dst = ra->genReg(insn.dst(0));
+    const GenRegister src0 = ra->genReg(insn.src(0));
+    const GenRegister src1 = ra->genReg(insn.src(1));
     switch (insn.opcode) {
       case SEL_OP_SEL_INT64:
       case SEL_OP_I64AND:
@@ -250,14 +253,34 @@ namespace gbe
         break;
       case SEL_OP_UPSAMPLE_LONG:
       {
-        const GenRegister dst = ra->genReg(insn.dst(0));
-        const GenRegister src0 = ra->genReg(insn.src(0));
-        const GenRegister src1 = ra->genReg(insn.src(1));
         p->MOV(dst, src0);
         p->SHL(dst, dst, GenRegister::immud(32));
         p->ADD(dst, dst, src1);
         break;
       }
+      case SEL_OP_SIMD_SHUFFLE:
+      {
+        uint32_t simd = p->curr.execWidth;
+        if (src1.file == GEN_IMMEDIATE_VALUE) {
+          uint32_t offset = src1.value.ud % simd;
+          uint32_t nr = src0.nr;
+          uint32_t subnr = src0.subnr;
+          subnr = subnr + offset;
+          if (subnr > 8) {
+            nr = nr + 1;
+            subnr = subnr - 8;
+          }
+          p->MOV(dst, GenRegister::ud1grf(nr, subnr));
+        } else {
+          uint32_t base = src0.nr * 32 + src0.subnr * 4;
+          GenRegister baseReg = GenRegister::immuw(base);
+          const GenRegister a0 = GenRegister::addr8(0);
+          p->ADD(a0, GenRegister::unpacked_uw(src1.nr, src1.subnr / typeSize(GEN_TYPE_UW)), baseReg);
+          GenRegister indirect = GenRegister::to_indirect1xN(src0, 0, 0);
+          p->MOV(dst, indirect);
+        }
+        break;
+      }
       default:
         GenContext::emitBinaryInstruction(insn);
     }
diff --git a/backend/src/backend/gen_context.cpp b/backend/src/backend/gen_context.cpp
index cdf581c..25c7a5a 100644
--- a/backend/src/backend/gen_context.cpp
+++ b/backend/src/backend/gen_context.cpp
@@ -198,6 +198,22 @@ namespace gbe
     this->labelPos.insert(std::make_pair(label, p->store.size()));
   }
 
+  void GenContext::emitNullaryInstruction(const SelectionInstruction &insn) {
+    const GenRegister dst = ra->genReg(insn.dst(0));
+    switch (insn.opcode) {
+      case SEL_OP_SIMD_ID:
+        {
+          const GenRegister selLaneID = this->simdWidth == 8 ?
+                                GenRegister::ud8grf(ir::ocl::laneid) :
+                                GenRegister::ud16grf(ir::ocl::laneid);
+          const GenRegister laneID = ra->genReg(selLaneID);
+          p->MOV(dst, laneID);
+        }
+        break;
+      default: NOT_IMPLEMENTED;
+    }
+  }
+
   void GenContext::emitUnaryInstruction(const SelectionInstruction &insn) {
     const GenRegister dst = ra->genReg(insn.dst(0));
     const GenRegister src = ra->genReg(insn.src(0)); @@ -583,6 +599,46 @@ namespace gbe
           p->MOV(xdst.bottom_half(), xsrc1.bottom_half());
         }
         break;
+      case SEL_OP_SIMD_SHUFFLE:
+        {
+          uint32_t simd = p->curr.execWidth;
+          if (src1.file == GEN_IMMEDIATE_VALUE) {
+            uint32_t offset = src1.value.ud % simd;
+            uint32_t nr = src0.nr;
+            uint32_t subnr = src0.subnr;
+            subnr = subnr + offset;
+            if (subnr > 8) {
+              nr = nr + 1;
+              subnr = subnr - 8;
+            }
+            p->MOV(dst, GenRegister::ud1grf(nr, subnr));
+          } else {
+            uint32_t base = src0.nr * 32 + src0.subnr * 4;
+            GenRegister baseReg = GenRegister::immuw(base);
+            const GenRegister a0 = GenRegister::addr8(0);
+
+            p->push();
+              if (simd == 8) {
+                p->ADD(a0, GenRegister::unpacked_uw(src1.nr, src1.subnr / typeSize(GEN_TYPE_UW)), baseReg);
+                GenRegister indirect = GenRegister::to_indirect1xN(src0, 0, 0);
+                p->MOV(dst, indirect);
+              }
+              else if (simd == 16) {
+                p->curr.execWidth = 8;
+                p->ADD(a0, GenRegister::unpacked_uw(src1.nr, src1.subnr / typeSize(GEN_TYPE_UW)), baseReg);
+                GenRegister indirect = GenRegister::to_indirect1xN(src0, 0, 0);
+                p->MOV(dst, indirect);
+
+                p->curr.quarterControl = 1;
+                p->ADD(a0, GenRegister::unpacked_uw(src1.nr+1, src1.subnr / typeSize(GEN_TYPE_UW)), baseReg);
+                p->MOV(GenRegister::offset(dst, 1, 0), indirect);
+              }
+              else
+                NOT_IMPLEMENTED;
+            p->pop();
+          }
+        }
+        break;
       default: NOT_IMPLEMENTED;
     }
   }
@@ -2023,41 +2079,46 @@ namespace gbe
     } else
   
     fn.foreachInstruction([&](ir::Instruction &insn) {
-      const uint32_t srcNum = insn.getSrcNum();
-      for (uint32_t srcID = 0; srcID < srcNum; ++srcID) {
-        const ir::Register reg = insn.getSrc(srcID);
-        if (insn.getOpcode() == ir::OP_GET_IMAGE_INFO) {
-          if (srcID != 0) continue;
-          const unsigned char bti = ir::cast<ir::GetImageInfoInstruction>(insn).getImageIndex();
-          const unsigned char type =  ir::cast<ir::GetImageInfoInstruction>(insn).getInfoType();;
-          ir::ImageInfoKey key(bti, type);
-          const ir::Register imageInfo = insn.getSrc(0);
-          if (curbeRegs.find(imageInfo) == curbeRegs.end()) {
-            uint32_t offset = this->getImageInfoCurbeOffset(key, 4);
-            insertCurbeReg(imageInfo, offset);
+      if (insn.getOpcode() == ir::OP_SIMD_ID) {
+        if (curbeRegs.find(laneid) == curbeRegs.end())
+          allocCurbeReg(laneid, GBE_CURBE_LANE_ID);
+      } else {
+        const uint32_t srcNum = insn.getSrcNum();
+        for (uint32_t srcID = 0; srcID < srcNum; ++srcID) {
+          const ir::Register reg = insn.getSrc(srcID);
+          if (insn.getOpcode() == ir::OP_GET_IMAGE_INFO) {
+            if (srcID != 0) continue;
+            const unsigned char bti = ir::cast<ir::GetImageInfoInstruction>(insn).getImageIndex();
+            const unsigned char type =  ir::cast<ir::GetImageInfoInstruction>(insn).getInfoType();;
+            ir::ImageInfoKey key(bti, type);
+            const ir::Register imageInfo = insn.getSrc(0);
+            if (curbeRegs.find(imageInfo) == curbeRegs.end()) {
+              uint32_t offset = this->getImageInfoCurbeOffset(key, 4);
+              insertCurbeReg(imageInfo, offset);
+            }
+            continue;
           }
-          continue;
+          if (fn.isSpecialReg(reg) == false) continue;
+          if (curbeRegs.find(reg) != curbeRegs.end()) continue;
+          if (reg == ir::ocl::stackptr) GBE_ASSERT(stackUse.size() > 0);
+          INSERT_REG(lsize0, LOCAL_SIZE_X)
+          INSERT_REG(lsize1, LOCAL_SIZE_Y)
+          INSERT_REG(lsize2, LOCAL_SIZE_Z)
+          INSERT_REG(gsize0, GLOBAL_SIZE_X)
+          INSERT_REG(gsize1, GLOBAL_SIZE_Y)
+          INSERT_REG(gsize2, GLOBAL_SIZE_Z)
+          INSERT_REG(goffset0, GLOBAL_OFFSET_X)
+          INSERT_REG(goffset1, GLOBAL_OFFSET_Y)
+          INSERT_REG(goffset2, GLOBAL_OFFSET_Z)
+          INSERT_REG(workdim, WORK_DIM)
+          INSERT_REG(numgroup0, GROUP_NUM_X)
+          INSERT_REG(numgroup1, GROUP_NUM_Y)
+          INSERT_REG(numgroup2, GROUP_NUM_Z)
+          INSERT_REG(stackptr, STACK_POINTER)
+          INSERT_REG(printfbptr, PRINTF_BUF_POINTER)
+          INSERT_REG(printfiptr, PRINTF_INDEX_POINTER)
+          do {} while(0);
         }
-        if (fn.isSpecialReg(reg) == false) continue;
-        if (curbeRegs.find(reg) != curbeRegs.end()) continue;
-        if (reg == ir::ocl::stackptr) GBE_ASSERT(stackUse.size() > 0);
-        INSERT_REG(lsize0, LOCAL_SIZE_X)
-        INSERT_REG(lsize1, LOCAL_SIZE_Y)
-        INSERT_REG(lsize2, LOCAL_SIZE_Z)
-        INSERT_REG(gsize0, GLOBAL_SIZE_X)
-        INSERT_REG(gsize1, GLOBAL_SIZE_Y)
-        INSERT_REG(gsize2, GLOBAL_SIZE_Z)
-        INSERT_REG(goffset0, GLOBAL_OFFSET_X)
-        INSERT_REG(goffset1, GLOBAL_OFFSET_Y)
-        INSERT_REG(goffset2, GLOBAL_OFFSET_Z)
-        INSERT_REG(workdim, WORK_DIM)
-        INSERT_REG(numgroup0, GROUP_NUM_X)
-        INSERT_REG(numgroup1, GROUP_NUM_Y)
-        INSERT_REG(numgroup2, GROUP_NUM_Z)
-        INSERT_REG(stackptr, STACK_POINTER)
-        INSERT_REG(printfbptr, PRINTF_BUF_POINTER)
-        INSERT_REG(printfiptr, PRINTF_INDEX_POINTER)
-        do {} while(0);
       }
     });
 #undef INSERT_REG
diff --git a/backend/src/backend/gen_context.hpp b/backend/src/backend/gen_context.hpp
index 6ca88db..3ac675e 100644
--- a/backend/src/backend/gen_context.hpp
+++ b/backend/src/backend/gen_context.hpp
@@ -124,6 +124,7 @@ namespace gbe
 
     /*! Final Gen ISA emission helper functions */
     void emitLabelInstruction(const SelectionInstruction &insn);
+    virtual void emitNullaryInstruction(const SelectionInstruction 
+ &insn);
     virtual void emitUnaryInstruction(const SelectionInstruction &insn);
     virtual void emitUnaryWithTempInstruction(const SelectionInstruction &insn);
     virtual void emitBinaryInstruction(const SelectionInstruction &insn); diff --git a/backend/src/backend/gen_insn_gen7_schedule_info.hxx b/backend/src/backend/gen_insn_gen7_schedule_info.hxx
index d054820..fd7e1a4 100644
--- a/backend/src/backend/gen_insn_gen7_schedule_info.hxx
+++ b/backend/src/backend/gen_insn_gen7_schedule_info.hxx
@@ -1,5 +1,6 @@
 //                 Family     Latency     SIMD16     SIMD8
 DECL_GEN7_SCHEDULE(Label,           0,         0,        0)
+DECL_GEN7_SCHEDULE(Nullary,         20,        4,        2)
 DECL_GEN7_SCHEDULE(Unary,           20,        4,        2)
 DECL_GEN7_SCHEDULE(UnaryWithTemp,   20,        40,      20)
 DECL_GEN7_SCHEDULE(Binary,          20,        4,        2)
diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp
index c240261..1586098 100644
--- a/backend/src/backend/gen_insn_selection.cpp
+++ b/backend/src/backend/gen_insn_selection.cpp
@@ -477,6 +477,8 @@ namespace gbe
     /*! To make function prototypes more readable */
     typedef const GenRegister &Reg;
 
+#define ALU0(OP) \
+  INLINE void OP(Reg dst) { ALU0(SEL_OP_##OP, dst); }
 #define ALU1(OP) \
   INLINE void OP(Reg dst, Reg src) { ALU1(SEL_OP_##OP, dst, src); }  #define ALU1WithTemp(OP) \ @@ -530,12 +532,15 @@ namespace gbe
     ALU2WithTemp(HADD)
     ALU2WithTemp(RHADD)
     ALU2(UPSAMPLE_LONG)
+    ALU2(SIMD_SHUFFLE)
+    ALU0(SIMD_ID)
     ALU1WithTemp(CONVI_TO_I64)
     ALU1WithTemp(CONVF_TO_I64)
     ALU1(CONVI64_TO_I)
     I64Shift(I64SHL)
     I64Shift(I64SHR)
     I64Shift(I64ASR)
+#undef ALU0
 #undef ALU1
 #undef ALU1WithTemp
 #undef ALU2
@@ -622,6 +627,8 @@ namespace gbe
     void MATH(Reg dst, uint32_t function, Reg src0, Reg src1);
     /*! Extended math function (1 argument) */
     void MATH(Reg dst, uint32_t function, Reg src);
+    /*! Encode nullary instructions */
+    void ALU0(SelectionOpcode opcode, Reg dst);
     /*! Encode unary instructions */
     void ALU1(SelectionOpcode opcode, Reg dst, Reg src);
     /*! Encode unary with temp reg instructions */ @@ -1435,6 +1442,11 @@ namespace gbe
       insn->dst(i + 1) = tmp[i];
   }
 
+  void Selection::Opaque::ALU0(SelectionOpcode opcode, Reg dst) {
+    SelectionInstruction *insn = this->appendInsn(opcode, 1, 0);
+    insn->dst(0) = dst;
+  }
+
   void Selection::Opaque::ALU1(SelectionOpcode opcode, Reg dst, Reg src) {
     SelectionInstruction *insn = this->appendInsn(opcode, 1, 1);
     insn->dst(0) = dst;
@@ -2054,6 +2066,42 @@ namespace gbe
 #define DECL_CTOR(FAMILY, INSN_NUM, COST) \
   FAMILY##Pattern(void) : OneToManyPattern<FAMILY##Pattern, ir::FAMILY>(INSN_NUM, COST) {}
 
+  /*! Nullary instruction patterns */
+  class NullaryInstructionPattern : public SelectionPattern  {
+  public:
+    NullaryInstructionPattern(void) : SelectionPattern(1,1) {
+      for (uint32_t op = 0; op < ir::OP_INVALID; ++op)
+        if (ir::isOpcodeFrom<ir::NullaryInstruction>(ir::Opcode(op)) == true)
+          this->opcodes.push_back(ir::Opcode(op));
+    }
+
+    INLINE bool emit(Selection::Opaque &sel, SelectionDAG &dag) const {
+      using namespace ir;
+      const ir::NullaryInstruction &insn = cast<NullaryInstruction>(dag.insn);
+      const Opcode opcode = insn.getOpcode();
+      const Type type = insn.getType();
+      GenRegister dst = sel.selReg(insn.getDst(0), type);
+
+      sel.push();
+      switch (opcode) {
+        case ir::OP_SIMD_SIZE:
+          {
+            const GenRegister src = GenRegister::immud(sel.curr.execWidth);
+            sel.curr.execWidth = 1;
+            sel.MOV(dst, src);
+          }
+          break;
+        case ir::OP_SIMD_ID:
+          sel.SIMD_ID(dst);
+          break;
+        default: NOT_SUPPORTED;
+      }
+      sel.pop();
+      return true;
+    }
+  };
+
   /*! Unary instruction patterns */
   DECL_PATTERN(UnaryInstruction)
   {
@@ -2563,6 +2611,17 @@ namespace gbe
         case OP_UPSAMPLE_LONG:
           sel.UPSAMPLE_LONG(dst, src0, src1);
           break;
+        case OP_SIMD_SHUFFLE:
+          {
+            if (src1.file == GEN_IMMEDIATE_VALUE) {
+              sel.SIMD_SHUFFLE(dst, src0, src1);
+            } else {
+              GenRegister shiftL = GenRegister::udxgrf(sel.curr.execWidth, sel.reg(FAMILY_DWORD));
+              sel.SHL(shiftL, src1, GenRegister::immud(0x2));
+              sel.SIMD_SHUFFLE(dst, src0, shiftL);
+            }
+          }
+          break;
         default: NOT_IMPLEMENTED;
       }
       sel.pop();
@@ -4789,6 +4848,7 @@ namespace gbe
     this->insert<GetImageInfoInstructionPattern>();
     this->insert<ReadARFInstructionPattern>();
     this->insert<RegionInstructionPattern>();
+    this->insert<NullaryInstructionPattern>();
 
     // Sort all the patterns with the number of instructions they output
     for (uint32_t op = 0; op < ir::OP_INVALID; ++op) diff --git a/backend/src/backend/gen_insn_selection.hxx b/backend/src/backend/gen_insn_selection.hxx
index 09f5aaf..87ccee3 100644
--- a/backend/src/backend/gen_insn_selection.hxx
+++ b/backend/src/backend/gen_insn_selection.hxx
@@ -77,6 +77,8 @@ DECL_SELECTION_IR(RHADD, BinaryWithTempInstruction)  DECL_SELECTION_IR(I64HADD, I64HADDInstruction)  DECL_SELECTION_IR(I64RHADD, I64RHADDInstruction)  DECL_SELECTION_IR(UPSAMPLE_LONG, BinaryInstruction)
+DECL_SELECTION_IR(SIMD_SHUFFLE, BinaryInstruction) 
+DECL_SELECTION_IR(SIMD_ID, NullaryInstruction)
 DECL_SELECTION_IR(CONVI_TO_I64, UnaryWithTempInstruction)  DECL_SELECTION_IR(CONVI64_TO_I, UnaryInstruction)  DECL_SELECTION_IR(CONVI64_TO_F, I64ToFloatInstruction) diff --git a/backend/src/backend/program.h b/backend/src/backend/program.h index dc5662f..c4023ec 100644
--- a/backend/src/backend/program.h
+++ b/backend/src/backend/program.h
@@ -99,6 +99,7 @@ enum gbe_curbe_type {
   GBE_CURBE_THREAD_NUM,
   GBE_CURBE_ZERO,
   GBE_CURBE_ONE,
+  GBE_CURBE_LANE_ID,
   GBE_CURBE_SLM_OFFSET,
 };
 
diff --git a/backend/src/ir/context.hpp b/backend/src/ir/context.hpp index cf5109d..af65ff3 100644
--- a/backend/src/ir/context.hpp
+++ b/backend/src/ir/context.hpp
@@ -176,6 +176,12 @@ namespace ir {
     DECL_THREE_SRC_INSN(MAD);
 #undef DECL_THREE_SRC_INSN
 
+    /*! For all nullary functions */
+    void ALU0(Opcode opcode, Type type, Register dst) {
+      const Instruction insn = gbe::ir::ALU0(opcode, type, dst);
+      this->append(insn);
+    }
+
     /*! For all unary functions */
     void ALU1(Opcode opcode, Type type, Register dst, Register src) {
       const Instruction insn = gbe::ir::ALU1(opcode, type, dst, src); diff --git a/backend/src/ir/instruction.cpp b/backend/src/ir/instruction.cpp index 797552f..9c3331b 100644
--- a/backend/src/ir/instruction.cpp
+++ b/backend/src/ir/instruction.cpp
@@ -131,6 +131,17 @@ namespace ir {
       Register src[srcNum]; //!< Indices of the sources
     };
 
+    /*! All 0-source arithmetic instructions */
+    class ALIGNED_INSTRUCTION NullaryInstruction : public NaryInstruction<0>
+    {
+    public:
+      NullaryInstruction(Opcode opcode, Type type, Register dst) {
+        this->opcode = opcode;
+        this->type = type;
+        this->dst[0] = dst;
+      }
+    };
+
     /*! All 1-source arithmetic instructions */
     class ALIGNED_INSTRUCTION UnaryInstruction : public NaryInstruction<1>
     {
@@ -1305,6 +1316,10 @@ namespace ir {
     }; \
   }
 
+START_INTROSPECTION(NullaryInstruction)
+#include "ir/instruction.hxx"
+END_INTROSPECTION(NullaryInstruction)
+
 START_INTROSPECTION(UnaryInstruction)
 #include "ir/instruction.hxx"
 END_INTROSPECTION(UnaryInstruction)
@@ -1532,6 +1547,7 @@ END_FUNCTION(Instruction, Register)
     return reinterpret_cast<const internal::CLASS*>(this)->CALL; \
   }
 
+DECL_MEM_FN(NullaryInstruction, Type, getType(void), getType())
 DECL_MEM_FN(UnaryInstruction, Type, getType(void), getType())  DECL_MEM_FN(BinaryInstruction, Type, getType(void), getType())  DECL_MEM_FN(BinaryInstruction, bool, commutes(void), commutes()) @@ -1586,6 +1602,21 @@ DECL_MEM_FN(GetImageInfoInstruction, uint8_t, getImageIndex(void), getImageIndex
   // Implements the emission functions
   ///////////////////////////////////////////////////////////////////////////
 
+  // For all nullary functions with given opcode  Instruction 
+ ALU0(Opcode opcode, Type type, Register dst) {
+    return internal::NullaryInstruction(opcode, type, dst).convert();  
+ }
+
+  // All unary functions
+#define DECL_EMIT_FUNCTION(NAME) \
+  Instruction NAME(Type type, Register dst) { \
+    return ALU0(OP_##NAME, type, dst);\
+  }
+
+  DECL_EMIT_FUNCTION(SIMD_SIZE)
+
+#undef DECL_EMIT_FUNCTION
+
   // For all unary functions with given opcode
   Instruction ALU1(Opcode opcode, Type type, Register dst, Register src) {
     return internal::UnaryInstruction(opcode, type, dst, src).convert(); @@ -1645,6 +1676,7 @@ DECL_MEM_FN(GetImageInfoInstruction, uint8_t, getImageIndex(void), getImageIndex
   DECL_EMIT_FUNCTION(RHADD)
   DECL_EMIT_FUNCTION(I64HADD)
   DECL_EMIT_FUNCTION(I64RHADD)
+  DECL_EMIT_FUNCTION(SIMD_SHUFFLE)
 
 #undef DECL_EMIT_FUNCTION
 
diff --git a/backend/src/ir/instruction.hpp b/backend/src/ir/instruction.hpp index 24d27aa..6dd3e81 100644
--- a/backend/src/ir/instruction.hpp
+++ b/backend/src/ir/instruction.hpp
@@ -198,6 +198,15 @@ namespace ir {
   /*! Output the instruction string in the given stream */
   std::ostream &operator<< (std::ostream &out, const Instruction &proxy);
 
+  /*! Nullary instruction instructions are typed. */  class 
+ NullaryInstruction : public Instruction {
+  public:
+    /*! Get the type manipulated by the instruction */
+    Type getType(void) const;
+    /*! Return true if the given instruction is an instance of this class */
+    static bool isClassOf(const Instruction &insn);  };
+
   /*! Unary instructions are typed. dst and sources share the same type */
   class UnaryInstruction : public Instruction {
   public:
@@ -558,6 +567,12 @@ namespace ir {
   /// All emission functions
   ///////////////////////////////////////////////////////////////////////////
 
+  /*! alu0.type dst */
+  Instruction ALU0(Opcode opcode, Type type, Register dst);  /*! 
+ simd_size.type dst */  Instruction SIMD_SIZE(Type type, Register dst);  
+ /*! simd_id.type dst */  Instruction SIMD_ID(Type type, Register dst);
   /*! alu1.type dst src */
   Instruction ALU1(Opcode opcode, Type type, Register dst, Register src);
   /*! mov.type dst src */
@@ -670,6 +685,8 @@ namespace ir {
   Instruction GT(Type type, Register dst, Register src0, Register src1);
   /*! ord.type dst src0 src1 */
   Instruction ORD(Type type, Register dst, Register src0, Register src1);
+  /*! simd_shuffle.type dst src0 src1 */  Instruction SIMD_SHUFFLE(Type 
+ type, Register dst, Register src0, Register src1);
   /*! BITCAST.{dstType <- srcType} dst src */
   Instruction BITCAST(Type dstType, Type srcType, Tuple dst, Tuple src, uint8_t dstNum, uint8_t srcNum);
   /*! cvt.{dstType <- srcType} dst src */ diff --git a/backend/src/ir/instruction.hxx b/backend/src/ir/instruction.hxx index de4abfb..76269bd 100644
--- a/backend/src/ir/instruction.hxx
+++ b/backend/src/ir/instruction.hxx
@@ -25,6 +25,8 @@
  * \file instruction.hxx
  * \author Benjamin Segovia <benjamin.segovia at intel.com>
  */
+DECL_INSN(SIMD_SIZE, NullaryInstruction) DECL_INSN(SIMD_ID, 
+NullaryInstruction)
 DECL_INSN(MOV, UnaryInstruction)
 DECL_INSN(COS, UnaryInstruction)
 DECL_INSN(SIN, UnaryInstruction)
@@ -57,6 +59,7 @@ DECL_INSN(BSB, BinaryInstruction)  DECL_INSN(OR, BinaryInstruction)  DECL_INSN(XOR, BinaryInstruction)  DECL_INSN(AND, BinaryInstruction)
+DECL_INSN(SIMD_SHUFFLE, BinaryInstruction)
 DECL_INSN(SEL, SelectInstruction)
 DECL_INSN(EQ, CompareInstruction)
 DECL_INSN(NE, CompareInstruction)
diff --git a/backend/src/ir/liveness.cpp b/backend/src/ir/liveness.cpp index 2b1ffdb..26c4129 100644
--- a/backend/src/ir/liveness.cpp
+++ b/backend/src/ir/liveness.cpp
@@ -66,6 +66,11 @@ namespace ir {
         const uint32_t srcNum = insn.getSrcNum();
         const uint32_t dstNum = insn.getDstNum();
         bool uniform = true;
+
+        //have no way to decide the dst uniform if there is no source
+        if (srcNum == 0)
+          uniform = false;
+
         for (uint32_t srcID = 0; srcID < srcNum; ++srcID) {
           const Register reg = insn.getSrc(srcID);
           if (!fn.isUniformRegister(reg)) diff --git a/backend/src/ir/profile.cpp b/backend/src/ir/profile.cpp index 4c272bd..55aedb4 100644
--- a/backend/src/ir/profile.cpp
+++ b/backend/src/ir/profile.cpp
@@ -43,6 +43,7 @@ namespace ir {
         "zero", "one",
         "retVal", "slm_offset",
         "printf_buffer_pointer", "printf_index_buffer_pointer",
+        "lane_id",
         "invalid"
     };
 
@@ -86,6 +87,7 @@ namespace ir {
       DECL_NEW_REG(FAMILY_DWORD, slmoffset, 1);
       DECL_NEW_REG(FAMILY_DWORD, printfbptr, 1);
       DECL_NEW_REG(FAMILY_DWORD, printfiptr, 1);
+      DECL_NEW_REG(FAMILY_DWORD, laneid, 0);
       DECL_NEW_REG(FAMILY_DWORD, invalid, 1);
     }
 #undef DECL_NEW_REG
diff --git a/backend/src/ir/profile.hpp b/backend/src/ir/profile.hpp index 7259d9f..d310128 100644
--- a/backend/src/ir/profile.hpp
+++ b/backend/src/ir/profile.hpp
@@ -71,8 +71,9 @@ namespace ir {
     static const Register slmoffset = Register(27);  // Group's SLM offset in total 64K SLM
     static const Register printfbptr = Register(28); // printf buffer address .
     static const Register printfiptr = Register(29); // printf index buffer address.
-    static const Register invalid = Register(30);  // used for valid comparation.
-    static const uint32_t regNum = 31;             // number of special registers
+    static const Register laneid = Register(30); // printf index buffer address.
+    static const Register invalid = Register(31);  // used for valid comparation.
+    static const uint32_t regNum = 32;             // number of special registers
     extern const char *specialRegMean[];           // special register name.
   } /* namespace ocl */
 
diff --git a/backend/src/libocl/CMakeLists.txt b/backend/src/libocl/CMakeLists.txt
index 16f00ee..623affc 100644
--- a/backend/src/libocl/CMakeLists.txt
+++ b/backend/src/libocl/CMakeLists.txt
@@ -90,7 +90,7 @@ MACRO(GENERATE_SOURCE_PY _mod)
 	)
 ENDMACRO(GENERATE_SOURCE_PY)
 
-SET (OCL_PY_GENERATED_MODULES ocl_common ocl_relational ocl_integer ocl_math)
+SET (OCL_PY_GENERATED_MODULES ocl_common ocl_relational ocl_integer 
+ocl_math ocl_simd)
 FOREACH(M ${OCL_PY_GENERATED_MODULES})
     GENERATE_HEADER_PY(${M})
     GENERATE_SOURCE_PY(${M})
diff --git a/backend/src/libocl/include/ocl.h b/backend/src/libocl/include/ocl.h
index e886670..a53f4c0 100644
--- a/backend/src/libocl/include/ocl.h
+++ b/backend/src/libocl/include/ocl.h
@@ -30,6 +30,7 @@
 #include "ocl_image.h"
 #include "ocl_integer.h"
 #include "ocl_math.h"
+#include "ocl_simd.h"
 #include "ocl_misc.h"
 #include "ocl_printf.h"
 #include "ocl_relational.h"
diff --git a/backend/src/libocl/include/ocl_misc.h b/backend/src/libocl/include/ocl_misc.h
index aa3f504..359025b 100644
--- a/backend/src/libocl/include/ocl_misc.h
+++ b/backend/src/libocl/include/ocl_misc.h
@@ -128,14 +128,6 @@ DEF(ulong)
 #undef DEC16
 #undef DEC16X
 
-
-/* Temp to add the SIMD functions here. */ -/////////////////////////////////////////////////////////////////////////////
-// SIMD level function
-/////////////////////////////////////////////////////////////////////////////
-short __gen_ocl_simd_any(short);
-short __gen_ocl_simd_all(short);
-
 struct time_stamp {
   // time tick
   ulong tick;
diff --git a/backend/src/libocl/script/ocl_simd.def b/backend/src/libocl/script/ocl_simd.def
new file mode 100644
index 0000000..ccda619
--- /dev/null
+++ b/backend/src/libocl/script/ocl_simd.def
@@ -0,0 +1,4 @@
+##simd level functions
+floatn __gen_ocl_simd_shuffle(floatn x, uint c) intn 
+__gen_ocl_simd_shuffle(intn x, uint c) uintn 
+__gen_ocl_simd_shuffle(uintn x, uint c)
diff --git a/backend/src/libocl/tmpl/ocl_simd.tmpl.cl b/backend/src/libocl/tmpl/ocl_simd.tmpl.cl
new file mode 100644
index 0000000..b9da5e2
--- /dev/null
+++ b/backend/src/libocl/tmpl/ocl_simd.tmpl.cl
@@ -0,0 +1,19 @@
+/*
+ * Copyright @ 2015 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#include "ocl_simd.h"
diff --git a/backend/src/libocl/tmpl/ocl_simd.tmpl.h b/backend/src/libocl/tmpl/ocl_simd.tmpl.h
new file mode 100644
index 0000000..42afc7b
--- /dev/null
+++ b/backend/src/libocl/tmpl/ocl_simd.tmpl.h
@@ -0,0 +1,34 @@
+/*
+ * Copyright © 2015 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+#ifndef __OCL_SIMD_H__
+#define __OCL_SIMD_H__
+
+#include "ocl_types.h"
+
+///////////////////////////////////////////////////////////////////////
+//////
+// SIMD level function
+///////////////////////////////////////////////////////////////////////
+//////
+short __gen_ocl_simd_any(short);
+short __gen_ocl_simd_all(short);
+
+uint __gen_ocl_get_simd_size(void);
+uint __gen_ocl_get_simd_id(void);
+
+OVERLOADABLE float __gen_ocl_simd_shuffle(float x, uint c); 
+OVERLOADABLE int __gen_ocl_simd_shuffle(int x, uint c); OVERLOADABLE 
+uint __gen_ocl_simd_shuffle(uint x, uint c);
diff --git a/backend/src/llvm/llvm_gen_backend.cpp b/backend/src/llvm/llvm_gen_backend.cpp
index bf03a13..4fcb8bb 100644
--- a/backend/src/llvm/llvm_gen_backend.cpp
+++ b/backend/src/llvm/llvm_gen_backend.cpp
@@ -2790,10 +2790,17 @@ namespace gbe
       case GEN_OCL_CONV_F32_TO_F16:
       case GEN_OCL_SIMD_ANY:
       case GEN_OCL_SIMD_ALL:
+      case GEN_OCL_SIMD_SHUFFLE:
       case GEN_OCL_READ_TM:
       case GEN_OCL_REGION:
         this->newRegister(&I);
         break;
+      case GEN_OCL_SIMD_SIZE:
+        this->newRegister(&I, NULL, true);
+        break;
+      case GEN_OCL_SIMD_ID:
+        this->newRegister(&I, NULL, false);
+        break;
       case GEN_OCL_PRINTF:
         break;
       default:
@@ -3053,6 +3060,26 @@ namespace gbe
             ctx.ALU1(ir::OP_SIMD_ANY, ir::TYPE_S16, dst, src);
             break;
           }
+          case GEN_OCL_SIMD_SIZE:
+          {
+            const ir::Register dst = this->getRegister(&I);
+            ctx.ALU0(ir::OP_SIMD_SIZE, getType(ctx, I.getType()), dst);
+            break;
+          }
+          case GEN_OCL_SIMD_ID:
+          {
+            const ir::Register dst = this->getRegister(&I);
+            ctx.ALU0(ir::OP_SIMD_ID, getType(ctx, I.getType()), dst);
+            break;
+          }
+          case GEN_OCL_SIMD_SHUFFLE:
+          {
+            const ir::Register src0 = this->getRegister(*AI); ++AI;
+            const ir::Register src1 = this->getRegister(*AI); ++AI;
+            const ir::Register dst = this->getRegister(&I);
+            ctx.SIMD_SHUFFLE(getType(ctx, I.getType()), dst, src0, src1);
+            break;
+          }
           case GEN_OCL_READ_TM:
           {
             const ir::Register dst = this->getRegister(&I); diff --git a/backend/src/llvm/llvm_gen_ocl_function.hxx b/backend/src/llvm/llvm_gen_ocl_function.hxx
index 9536a3c..714a293 100644
--- a/backend/src/llvm/llvm_gen_ocl_function.hxx
+++ b/backend/src/llvm/llvm_gen_ocl_function.hxx
@@ -155,6 +155,10 @@ DECL_LLVM_GEN_FUNCTION(CONV_F32_TO_F16, __gen_ocl_f32to16)  DECL_LLVM_GEN_FUNCTION(SIMD_ANY, __gen_ocl_simd_any)  DECL_LLVM_GEN_FUNCTION(SIMD_ALL, __gen_ocl_simd_all)
 
+DECL_LLVM_GEN_FUNCTION(SIMD_SIZE, __gen_ocl_get_simd_size) 
+DECL_LLVM_GEN_FUNCTION(SIMD_ID, __gen_ocl_get_simd_id) 
+DECL_LLVM_GEN_FUNCTION(SIMD_SHUFFLE, __gen_ocl_simd_shuffle)
+
 DECL_LLVM_GEN_FUNCTION(READ_TM, __gen_ocl_read_tm)  DECL_LLVM_GEN_FUNCTION(REGION, __gen_ocl_region)
 
diff --git a/src/cl_command_queue_gen7.c b/src/cl_command_queue_gen7.c index 253c4f2..3f73de0 100644
--- a/src/cl_command_queue_gen7.c
+++ b/src/cl_command_queue_gen7.c
@@ -202,6 +202,14 @@ cl_curbe_fill(cl_kernel ker,
   UPLOAD(GBE_CURBE_WORK_DIM, work_dim);  #undef UPLOAD
 
+  /* __gen_ocl_get_simd_id needs it */
+  if ((offset = interp_kernel_get_curbe_offset(ker->opaque, GBE_CURBE_LANE_ID, 0)) >= 0) {
+    const uint32_t simd_sz = interp_kernel_get_simd_width(ker->opaque);
+    uint32_t *laneid = (uint32_t *) (ker->curbe + offset);
+    int32_t i;
+    for (i = 0; i < (int32_t) simd_sz; ++i) laneid[i] = i;  }
+
   /* Write identity for the stack pointer. This is required by the stack pointer
    * computation in the kernel
    */
--
1.9.1


From chuanbo.weng at intel.com  Fri Mar 27 09:33:50 2015
From: chuanbo.weng at intel.com (Chuanbo Weng)
Date: Sat, 28 Mar 2015 00:33:50 +0800
Subject: [Beignet] [PATCH v2 0/3] Patchset of v4l2 buffer sharing.
Message-ID: <1427474030-9127-1-git-send-email-chuanbo.weng@intel.com>

Compared to version 1 of the v4l2 buffer sharing patch, this version adds the
extension clCloseMemObjectFdIntel, makes the corresponding change in the v4l2
buffer sharing example, and also adds documentation.

Chuanbo Weng (3):
  Add extension clCloseMemObjectFdIntel().
  Add example to show v4l2 buffer sharing with extension    
    clGetMemObjectFdIntel and clCloseMemObjectFdIntel.
  Add document to describe the detials of v4l2 buffer sharing.

 CMakeLists.txt                                     |  35 +-
 docs/Beignet.mdwn                                  |   1 +
 docs/howto/v4l2-buffer-sharing-howto.mdwn          |  67 +++
 examples/CMakeLists.txt                            |  29 +-
 .../v4l2_buffer_sharing/v4l2_buffer_sharing.cpp    | 593 +++++++++++++++++++++
 include/CL/cl_intel.h                              |  11 +-
 kernels/runtime_yuy2_processing.cl                 |  15 +
 src/cl_api.c                                       |  15 +
 src/cl_driver.h                                    |   3 +
 src/cl_driver_defs.c                               |   1 +
 src/cl_mem.c                                       |  32 +-
 src/cl_mem.h                                       |   4 +
 src/intel/intel_driver.c                           |   1 +
 13 files changed, 780 insertions(+), 27 deletions(-)
 create mode 100644 docs/howto/v4l2-buffer-sharing-howto.mdwn
 create mode 100644 examples/v4l2_buffer_sharing/v4l2_buffer_sharing.cpp
 create mode 100644 kernels/runtime_yuy2_processing.cl

-- 
1.9.1


From chuanbo.weng at intel.com  Fri Mar 27 09:34:01 2015
From: chuanbo.weng at intel.com (Chuanbo Weng)
Date: Sat, 28 Mar 2015 00:34:01 +0800
Subject: [Beignet] [PATCH v2 1/3] Add extension clCloseMemObjectFdIntel().
Message-ID: <1427474041-9171-1-git-send-email-chuanbo.weng@intel.com>

We have added the extension clGetMemObjectFdIntel to export the fd of a memory
object, so we have to add a corresponding extension to close the fd.

Signed-off-by: Chuanbo Weng <chuanbo.weng at intel.com>
---
 include/CL/cl_intel.h    | 11 ++++++++++-
 src/cl_api.c             | 15 +++++++++++++++
 src/cl_driver.h          |  3 +++
 src/cl_driver_defs.c     |  1 +
 src/cl_mem.c             | 32 ++++++++++++++++++++++++++++++--
 src/cl_mem.h             |  4 ++++
 src/intel/intel_driver.c |  1 +
 7 files changed, 64 insertions(+), 3 deletions(-)

diff --git a/include/CL/cl_intel.h b/include/CL/cl_intel.h
index 28bcb62..2ab79ad 100644
--- a/include/CL/cl_intel.h
+++ b/include/CL/cl_intel.h
@@ -122,7 +122,7 @@ typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateImageFromLibvaIntel_fn)(
                              const cl_libva_image * /* info */,
                              cl_int *               /* errcode_ret */);
 
-/* Create buffer from libva's buffer object */
+/*Export memory object's fd*/
 extern CL_API_ENTRY cl_int CL_API_CALL
 clGetMemObjectFdIntel(cl_context   /* context */,
                       cl_mem       /* Memory Obejct */,
@@ -133,6 +133,15 @@ typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetMemObjectFdIntel_fn)(
                              cl_mem       /* Memory Obejct */,
                              int*         /* returned fd */);
 
+/*Close memory object's fd*/
+extern CL_API_ENTRY cl_int CL_API_CALL
+clCloseMemObjectFdIntel(cl_context   /* context */,
+                        cl_mem       /* Memory Obejct */);
+
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clCloseMemObjectFdIntel_fn)(
+                             cl_context   /* context */,
+                             cl_mem       /* Memory Obejct */);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/src/cl_api.c b/src/cl_api.c
index 3e72deb..f9efac7 100644
--- a/src/cl_api.c
+++ b/src/cl_api.c
@@ -3180,6 +3180,7 @@ internal_clGetExtensionFunctionAddress(const char *func_name)
   EXTFUNC(clCreateBufferFromLibvaIntel)
   EXTFUNC(clCreateImageFromLibvaIntel)
   EXTFUNC(clGetMemObjectFdIntel)
+  EXTFUNC(clCloseMemObjectFdIntel)
   return NULL;
 }
 
@@ -3348,3 +3349,17 @@ clGetMemObjectFdIntel(cl_context context,
 error:
   return err;
 }
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clCloseMemObjectFdIntel(cl_context context,
+                        cl_mem memobj)
+{
+  cl_int err = CL_SUCCESS;
+  CHECK_CONTEXT (context);
+  CHECK_MEM (memobj);
+
+  err = cl_mem_close_fd(memobj);
+
+error:
+  return err;
+}
diff --git a/src/cl_driver.h b/src/cl_driver.h
index 3f54a27..dd028c0 100644
--- a/src/cl_driver.h
+++ b/src/cl_driver.h
@@ -369,6 +369,9 @@ extern cl_buffer_wait_rendering_cb *cl_buffer_wait_rendering;
 typedef int (cl_buffer_get_fd_cb)(cl_buffer, int *fd);
 extern cl_buffer_get_fd_cb *cl_buffer_get_fd;
 
+typedef int (cl_buffer_close_fd_cb)(int fd);
+extern cl_buffer_close_fd_cb *cl_buffer_close_fd;
+
 typedef int (cl_buffer_get_tiling_align_cb)(cl_context ctx, uint32_t tiling_mode, uint32_t dim);
 extern cl_buffer_get_tiling_align_cb *cl_buffer_get_tiling_align;
 
diff --git a/src/cl_driver_defs.c b/src/cl_driver_defs.c
index 9a47210..8018747 100644
--- a/src/cl_driver_defs.c
+++ b/src/cl_driver_defs.c
@@ -51,6 +51,7 @@ LOCAL cl_buffer_wait_rendering_cb *cl_buffer_wait_rendering = NULL;
 LOCAL cl_buffer_get_buffer_from_libva_cb *cl_buffer_get_buffer_from_libva = NULL;
 LOCAL cl_buffer_get_image_from_libva_cb *cl_buffer_get_image_from_libva = NULL;
 LOCAL cl_buffer_get_fd_cb *cl_buffer_get_fd = NULL;
+LOCAL cl_buffer_close_fd_cb *cl_buffer_close_fd = NULL;
 LOCAL cl_buffer_get_tiling_align_cb *cl_buffer_get_tiling_align = NULL;
 
 /* cl_khr_gl_sharing */
diff --git a/src/cl_mem.c b/src/cl_mem.c
index b41ec14..08fb239 100644
--- a/src/cl_mem.c
+++ b/src/cl_mem.c
@@ -267,6 +267,8 @@ cl_mem_allocate(enum cl_mem_type type,
   mem->flags = flags;
   mem->is_userptr = 0;
   mem->offset = 0;
+  mem->export_ref = 0;
+  pthread_mutex_init(&mem->export_lock, NULL);
 
   if (sz != 0) {
     /* Pinning will require stricter alignment rules */
@@ -2051,7 +2053,33 @@ cl_mem_get_fd(cl_mem mem,
               int* fd)
 {
   cl_int err = CL_SUCCESS;
-  if(cl_buffer_get_fd(mem->bo, fd))
-	err = CL_INVALID_OPERATION;
+
+  pthread_mutex_lock(&mem->export_lock);
+  if(mem->export_ref == 0){
+    if(cl_buffer_get_fd(mem->bo, fd))
+      err = CL_INVALID_OPERATION;
+    mem->export_fd = *fd;
+  }
+  else{
+    *fd = mem->export_fd;
+  }
+  mem->export_ref++;
+  pthread_mutex_unlock(&mem->export_lock);
+
+  return err;
+}
+
+LOCAL cl_int
+cl_mem_close_fd(cl_mem mem)
+{
+  cl_int err = CL_SUCCESS;
+
+  if(mem->export_ref == 0)
+    return CL_INVALID_MEM_OBJECT;
+  if (atomic_dec(&mem->export_ref) > 1)
+    return CL_SUCCESS;
+  if(cl_buffer_close_fd(mem->export_fd))
+    err = CL_INVALID_OPERATION;
+
   return err;
 }
diff --git a/src/cl_mem.h b/src/cl_mem.h
index e027f15..aea2ade 100644
--- a/src/cl_mem.h
+++ b/src/cl_mem.h
@@ -95,6 +95,9 @@ typedef  struct _cl_mem {
   cl_mem_dstr_cb *dstr_cb;  /* The destroy callback. */
   uint8_t is_userptr;       /* CL_MEM_USE_HOST_PTR is enabled*/
   size_t offset;            /* offset of host_ptr to the page beginning, only for CL_MEM_USE_HOST_PTR*/
+  pthread_mutex_t export_lock; /* To export fd */
+  volatile int export_ref;  /* The exported count */
+  int export_fd;            /* The exported fd of this memory object */
 } _cl_mem;
 
 struct _cl_mem_image {
@@ -294,6 +297,7 @@ extern cl_mem cl_mem_new_libva_image(cl_context ctx,
                                      size_t row_pitch,
                                      cl_int *errcode);
 extern cl_int cl_mem_get_fd(cl_mem mem, int* fd);
+extern cl_int cl_mem_close_fd(cl_mem mem);
 
 
 #endif /* __CL_MEM_H__ */
diff --git a/src/intel/intel_driver.c b/src/intel/intel_driver.c
index 755ab6b..37fb262 100644
--- a/src/intel/intel_driver.c
+++ b/src/intel/intel_driver.c
@@ -840,6 +840,7 @@ intel_setup_callbacks(void)
   cl_buffer_get_subdata = (cl_buffer_get_subdata_cb *) drm_intel_bo_get_subdata;
   cl_buffer_wait_rendering = (cl_buffer_wait_rendering_cb *) drm_intel_bo_wait_rendering;
   cl_buffer_get_fd = (cl_buffer_get_fd_cb *) drm_intel_bo_gem_export_to_prime;
+  cl_buffer_close_fd = (cl_buffer_close_fd_cb *) close;
   cl_buffer_get_tiling_align = (cl_buffer_get_tiling_align_cb *)intel_buffer_get_tiling_align;
   intel_set_gpgpu_callbacks(intel_get_device_id());
 }
-- 
1.9.1


From chuanbo.weng at intel.com  Fri Mar 27 09:34:08 2015
From: chuanbo.weng at intel.com (Chuanbo Weng)
Date: Sat, 28 Mar 2015 00:34:08 +0800
Subject: [Beignet] [PATCH v2 2/3] Add example to show v4l2 buffer sharing
	with extension clGetMemObjectFdIntel and clCloseMemObjectFdIntel.
Message-ID: <1427474048-9215-1-git-send-email-chuanbo.weng@intel.com>

This example captures YUY2 frames directly into a cl buffer object by way
of DMA; the frames are processed by an OpenCL kernel, then converted to
NV12 format and shown by libva.

v2:
Close the cl buffer's fd with clCloseMemObjectFdIntel instead of the close
function.

Signed-off-by: Chuanbo Weng <chuanbo.weng at intel.com>
---
 CMakeLists.txt                                     |  35 +-
 examples/CMakeLists.txt                            |  29 +-
 .../v4l2_buffer_sharing/v4l2_buffer_sharing.cpp    | 593 +++++++++++++++++++++
 kernels/runtime_yuy2_processing.cl                 |  15 +
 4 files changed, 648 insertions(+), 24 deletions(-)
 create mode 100644 examples/v4l2_buffer_sharing/v4l2_buffer_sharing.cpp
 create mode 100644 kernels/runtime_yuy2_processing.cl

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 5474447..4f627cf 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -216,23 +216,30 @@ IF(BUILD_EXAMPLES)
 IF(NOT X11_FOUND)
   MESSAGE(FATAL_ERROR "XLib is necessary for examples - not found")
 ENDIF(NOT X11_FOUND)
-# libva
-pkg_check_modules(LIBVA REQUIRED libva>=0.36.0)
-IF(LIBVA_FOUND)
+# libva & libva-x11
+#pkg_check_modules(LIBVA REQUIRED libva>=0.36.0)
+pkg_check_modules(LIBVA REQUIRED libva)
+pkg_check_modules(LIBVA-X11 REQUIRED libva-x11)
+set(LIBVA_BUF_SH_DEP false)
+set(V4L2_BUF_SH_DEP false)
+IF(LIBVA_FOUND AND LIBVA-X11_FOUND)
   MESSAGE(STATUS "Looking for LIBVA - found at ${LIBVA_PREFIX} ${LIBVA_VERSION}")
-  INCLUDE_DIRECTORIES(${LIBVA_INCLUDE_DIRS})
-ELSE(LIBVA_FOUND)
-  MESSAGE(STATUS "Looking for LIBVA (>= 0.36.0) - not found")
-ENDIF(LIBVA_FOUND)
-
-# libva-x11
-pkg_check_modules(LIBVA-X11 REQUIRED libva-x11>=0.36.0)
-IF(LIBVA-X11_FOUND)
   MESSAGE(STATUS "Looking for LIBVA-X11 - found at ${LIBVA-X11_PREFIX} ${LIBVA-X11_VERSION}")
+  INCLUDE_DIRECTORIES(${LIBVA_INCLUDE_DIRS})
   INCLUDE_DIRECTORIES(${LIBVA-X11_INCLUDE_DIRS})
-ELSE(LIBVA-X11_FOUND)
-  MESSAGE(STATUS "Looking for LIBVA-X11 (>= 0.36.0) - not found")
-ENDIF(LIBVA-X11_FOUND)
+  set(V4L2_BUF_SH_DEP true)
+  IF(LIBVA_VERSION VERSION_LESS "0.36.0" OR LIBVA-X11_VERSION VERSION_LESS "0.36.0")
+    IF(LIBVA_VERSION VERSION_LESS "0.36.0")
+      MESSAGE(STATUS "Looking for LIBVA (>= 0.36.0) - not found")
+    ENDIF(LIBVA_VERSION VERSION_LESS "0.36.0")
+    IF(LIBVA-X11_VERSION VERSION_LESS "0.36.0")
+      MESSAGE(STATUS "Looking for LIBVA-X11 (>= 0.36.0) - not found")
+    ENDIF(LIBVA-X11_VERSION VERSION_LESS "0.36.0")
+    MESSAGE(STATUS "Example libva_buffer_sharing will not be built")
+  ELSE(LIBVA_VERSION VERSION_LESS "0.36.0" OR LIBVA-X11_VERSION VERSION_LESS "0.36.0")
+    set(LIBVA_BUF_SH_DEP true)
+  ENDIF(LIBVA_VERSION VERSION_LESS "0.36.0" OR LIBVA-X11_VERSION VERSION_LESS "0.36.0")
+ENDIF(LIBVA_FOUND AND LIBVA-X11_FOUND)
 ENDIF(BUILD_EXAMPLES)
 
 ADD_SUBDIRECTORY(include)
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index 904f259..ab31fe7 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -1,3 +1,9 @@
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}
+                    ${CMAKE_CURRENT_SOURCE_DIR}/../utests
+                    ${CMAKE_CURRENT_SOURCE_DIR}/../include
+                    ${X11_INCLUDE_DIR})
+
+IF(LIBVA_BUF_SH_DEP OR V4L2_BUF_SH_DEP)
 EXEC_PROGRAM(ls ARGS "${CMAKE_CURRENT_SOURCE_DIR}/thirdparty/libva" OUTPUT_VARIABLE LS_OUTPUT)
 IF(NOT LS_OUTPUT)
 EXEC_PROGRAM(git "${CMAKE_CURRENT_SOURCE_DIR}/.." ARGS "submodule init")
@@ -5,17 +11,13 @@ EXEC_PROGRAM(git "${CMAKE_CURRENT_SOURCE_DIR}/.." ARGS "submodule update")
 EXEC_PROGRAM(git "${CMAKE_CURRENT_SOURCE_DIR}/thirdparty/libva" ARGS "checkout master")
 ENDIF(NOT LS_OUTPUT)
 
-INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}
-                    ${CMAKE_CURRENT_SOURCE_DIR}/../utests
-                    ${CMAKE_CURRENT_SOURCE_DIR}/../include
-                    ${CMAKE_CURRENT_SOURCE_DIR}/thirdparty/libva/va
-                    ${CMAKE_CURRENT_SOURCE_DIR}/thirdparty/libva/test/common
-                    ${X11_INCLUDE_DIR})
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}/thirdparty/libva/va
+                    ${CMAKE_CURRENT_SOURCE_DIR}/thirdparty/libva/test/common)
 
 link_directories (${LIBVA_LIBDIR}
                   ${LIBVA-X11_LIBDIR})
 
-set (examples_sources
+set (va_ocl_basic_sources
   ../utests/utest_error.c
   ../utests/utest_assert.cpp
   ../utests/utest_file_map.cpp
@@ -23,13 +25,20 @@ set (examples_sources
   ./thirdparty/libva/test/common/va_display.c
   ./thirdparty/libva/test/common/va_display_x11.c)
 
-
 ADD_DEFINITIONS(-DHAVE_VA_X11)
-ADD_DEFINITIONS(-DINPUT_NV12_DEFAULT="${CMAKE_CURRENT_SOURCE_DIR}/libva_buffer_sharing/256_128.nv12")
 
-ADD_LIBRARY(va_ocl_basic SHARED ${examples_sources})
+ADD_LIBRARY(va_ocl_basic SHARED ${va_ocl_basic_sources})
 
 TARGET_LINK_LIBRARIES(va_ocl_basic cl m va va-x11 ${X11_X11_LIB})
 
+IF(LIBVA_BUF_SH_DEP)
+ADD_DEFINITIONS(-DINPUT_NV12_DEFAULT="${CMAKE_CURRENT_SOURCE_DIR}/libva_buffer_sharing/256_128.nv12")
 ADD_EXECUTABLE(example-libva_buffer_sharing ./libva_buffer_sharing/libva_buffer_sharing.cpp)
 TARGET_LINK_LIBRARIES(example-libva_buffer_sharing va_ocl_basic)
+ENDIF(LIBVA_BUF_SH_DEP)
+
+IF(V4L2_BUF_SH_DEP)
+ADD_EXECUTABLE(example-v4l2_buffer_sharing ./v4l2_buffer_sharing/v4l2_buffer_sharing.cpp)
+TARGET_LINK_LIBRARIES(example-v4l2_buffer_sharing va_ocl_basic)
+ENDIF(V4L2_BUF_SH_DEP)
+ENDIF(LIBVA_BUF_SH_DEP OR V4L2_BUF_SH_DEP)
diff --git a/examples/v4l2_buffer_sharing/v4l2_buffer_sharing.cpp b/examples/v4l2_buffer_sharing/v4l2_buffer_sharing.cpp
new file mode 100644
index 0000000..e1ced99
--- /dev/null
+++ b/examples/v4l2_buffer_sharing/v4l2_buffer_sharing.cpp
@@ -0,0 +1,593 @@
+/*
+ ** Copyright (c) 2012, 2015 Intel Corporation. All Rights Reserved.
+ **
+ ** Permission is hereby granted, free of charge, to any person obtaining a
+ ** copy of this software and associated documentation files (the
+ ** "Software"), to deal in the Software without restriction, including
+ ** without limitation the rights to use, copy, modify, merge, publish,
+ ** distribute, sub license, and/or sell copies of the Software, and to
+ ** permit persons to whom the Software is furnished to do so, subject to
+ ** the following conditions:
+ **
+ ** The above copyright notice and this permission notice (including the
+ ** next paragraph) shall be included in all copies or substantial portions
+ ** of the Software.
+ **
+ ** THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ ** OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ ** MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ ** IN NO EVENT SHALL PRECISION INSIGHT AND/OR ITS SUPPLIERS BE LIABLE FOR
+ ** ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ ** TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ ** SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ **/
+
+#include <getopt.h>
+#include <errno.h>
+#include <assert.h>
+#include <fcntl.h>
+#include <linux/videodev2.h>
+#include <poll.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+#include <sys/time.h>
+#include <time.h>
+
+#include <inttypes.h>
+#include <ctype.h>
+
+#include <va/va.h>
+#include <va/va_drmcommon.h>
+
+#include "va_display.h"
+#include "utest_helper.hpp"
+
+using namespace std;
+
+#define BUFFER_NUM_DEFAULT 5
+#define VIDEO_NODE_DEFAULT "/dev/video0"
+#define WIDTH_DEFAULT 640
+#define HEIGHT_DEFAULT 480
+
+#define CHECK_VASTATUS(va_status,func) /* report and exit(1) unless VA_STATUS_SUCCESS */ do { \
+  if ((va_status) != VA_STATUS_SUCCESS) {                                 \
+    fprintf(stderr, "status = %d, %s: %s(line %d) failed, exit\n",(va_status), __func__, func, __LINE__); \
+    exit(1);                                                            \
+  } } while (0) /* do/while(0) makes the macro one statement, safe in un-braced if/else */
+
+#define CHECK_CLSTATUS(status,func) /* report and exit(1) unless CL_SUCCESS */ do { \
+  if ((status) != CL_SUCCESS) {                                   \
+    fprintf(stderr, "status = %d, %s: %s(line %d) failed, exit\n", (status), __func__, func, __LINE__); \
+    exit(1);                                                            \
+  } } while (0)
+
+#define CHECK_V4L2ERROR(ret, STR) /* report (with errno text) and exit(1) when ret is non-zero */ do { \
+  if (ret){                             \
+    fprintf(stderr, "%s", STR);      /* STR is data here, never a format string */ \
+    perror(" ");                            \
+    fprintf(stderr, "ret = %d, %s: %s(line %d) failed, exit\n", (ret), __func__, STR, __LINE__);      \
+    exit(1);                                  \
+  } } while (0)
+
+VADisplay	va_dpy;
+cl_int cl_status;
+VAStatus va_status;
+VASurfaceID nv12_surface_id;
+VAImage nv12_image;
+
+int dev_fd;
+uint64_t image_size;
+unsigned int pitch;
+cl_mem *import_buf = NULL;
+typedef cl_int (OCLGETMEMOBJECTFD)(cl_context, cl_mem, int *);
+OCLGETMEMOBJECTFD *oclGetMemObjectFd = NULL;
+typedef cl_int (OCLCLOSEMEMOBJECTFD)(cl_context, cl_mem);
+OCLCLOSEMEMOBJECTFD *oclCloseMemObjectFd = NULL;
+
+int frame_count = 0;
+struct v4l2_options{ /* command-line settings, filled in by analyse_args() */
+  const char *dev_name; /* video device path (-d); VIDEO_NODE_DEFAULT when not given */
+  unsigned int width, height; /* capture resolution (-r); WIDTH_DEFAULT/HEIGHT_DEFAULT when not given */
+  unsigned int spec_res; /* set to 1 when -r was passed */
+  unsigned int buffer_num; /* number of buffers (-b); starts as BUFFER_NUM_DEFAULT */
+  unsigned int do_list; /* set to 1 when -l was passed */
+} vo;
+int *import_buf_fd = NULL;
+
+static const char short_options[] = "d:r:b:lh";
+
+static const struct option
+long_options[] = {
+  { "device", required_argument, NULL, 'd' },
+  { "help",   no_argument,       NULL, 'h' },
+  { "resolution", required_argument,       NULL, 'r' },
+  { "buffer_num",  required_argument, NULL, 'b' },
+  { "list",  no_argument, NULL, 'l' },
+  { 0, 0, 0, 0 }
+};
+
+static void usage(FILE *fp, int argc, char **argv)
+{ /* Write the option summary to fp, using argv[0] as the program name; argc is not used. */
+  fprintf(fp,
+      "Usage: %s [options]\n\n"
+      "Options:\n"
+      "-d | --device=<dev>  Specify device by <dev> instead of /dev/video0\n"
+      "-h | --help          Print this message\n"
+      "-r | --resolution=<width,height>    Set image resolution\n"
+      "-b | --buffer_num=<num>  Set number of buffers\n"
+      "-l | --list  List available resolution of format 'V4L2_PIX_FMT_YUYV'\n"
+      "",
+      argv[0]);
+}
+
+/* Print the YUYV frame sizes that vo.dev_name reports, then close it; exits on any error. */
+static void list_resolution(){
+  int ret;
+  struct v4l2_capability cap;
+  struct v4l2_frmsizeenum frm_sz;
+
+  dev_fd = open(vo.dev_name, O_RDWR | O_NONBLOCK, 0);
+  if (dev_fd < 0) {
+    fprintf(stderr, "Can not open %s: %s\n", vo.dev_name, strerror(errno));
+    exit(1);
+  }
+
+  memset(&cap, 0, sizeof(cap));
+  ret = ioctl(dev_fd, VIDIOC_QUERYCAP, &cap);
+  CHECK_V4L2ERROR(ret, "VIDIOC_QUERYCAP");
+
+  if(!(cap.capabilities & V4L2_CAP_VIDEO_CAPTURE)){
+    fprintf(stderr, "The device is not video capture device\n");
+    exit(1);
+  }
+  if(!(cap.capabilities & V4L2_CAP_STREAMING)){
+    fprintf(stderr, "The device does not support streaming i/o\n");
+    exit(1);
+  }
+
+  printf("Supported resolution under pixel format 'V4L2_PIX_FMT_YUYV':\n");
+  memset(&frm_sz, 0, sizeof(frm_sz)); /* do not hand uninitialized fields to the driver */
+  frm_sz.pixel_format = V4L2_PIX_FMT_YUYV;
+  frm_sz.index = 0;
+  bool extra_info = true;
+  while (ioctl(dev_fd, VIDIOC_ENUM_FRAMESIZES, &frm_sz) == 0) {
+    if (frm_sz.type == V4L2_FRMSIZE_TYPE_DISCRETE) {
+      if(extra_info){
+        printf("(width, height) = \n");
+        extra_info = false;
+      }
+      printf("(%d, %d)\n", frm_sz.discrete.width, frm_sz.discrete.height);
+    }
+    else if (frm_sz.type == V4L2_FRMSIZE_TYPE_STEPWISE) {
+      printf("(width, height) from (%d, %d) to (%d, %d) with step (%d, %d)\n",
+          frm_sz.stepwise.min_width,
+          frm_sz.stepwise.min_height,
+          frm_sz.stepwise.max_width,
+          frm_sz.stepwise.max_height,
+          frm_sz.stepwise.step_width,
+          frm_sz.stepwise.step_height);
+      /* A stepwise range is one entry; without leaving the loop index 0 is re-queried forever. */
+      break;
+    }
+    frm_sz.index++;
+  }
+
+  ret = close(dev_fd);
+  if (ret) {
+    fprintf(stderr, "Failed to close %s: %s\n", vo.dev_name, strerror(errno));
+    exit(1);
+  }
+}
+
+/* Parse the command line into the global 'vo', apply defaults, and run --list when requested. */
+static void analyse_args(int argc, char *argv[])
+{
+  vo.dev_name = NULL;
+  vo.width = 0;
+  vo.height = 0;
+  vo.spec_res = 0;
+  vo.buffer_num = BUFFER_NUM_DEFAULT;
+  vo.do_list = 0;
+
+  int c, idx;
+  for (;;) {
+    c = getopt_long(argc, argv, short_options, long_options, &idx);
+
+    if (-1 == c)
+      break;
+
+    switch (c) {
+      case 0:
+        break;
+
+      case 'd':
+        vo.dev_name = optarg;
+        break;
+
+      case '?':
+      case 'h':
+        usage(stdout, argc, argv);
+        exit(0);
+
+      case 'r':
+        /* width/height are unsigned, so parse with %u; refuse input that is not "<w>,<h>". */
+        if (sscanf(optarg, "%u,%u", &vo.width, &vo.height) != 2) {
+          fprintf(stderr, "Invalid resolution '%s', expected <width,height>\n", optarg);
+          exit(1);
+        }
+        vo.spec_res = 1;
+        break;
+
+      case 'b':
+        vo.buffer_num = strtoul(optarg, NULL, 0);
+        break;
+
+      case 'l':
+        vo.do_list = 1;
+        break;
+
+      default:
+        usage(stderr, argc, argv);
+        exit(1);
+    }
+  }
+
+  if(!vo.dev_name){
+    printf("Haven't specified device, use default device: %s\n", VIDEO_NODE_DEFAULT);
+    vo.dev_name = VIDEO_NODE_DEFAULT;
+  }
+  if(vo.do_list){
+    list_resolution();
+    exit(0);
+  }
+  if(!vo.spec_res){
+    printf("Haven't specified resolution, use default resolution: (width,height) = (%d, %d)\n",
+        WIDTH_DEFAULT, HEIGHT_DEFAULT);
+    vo.width = WIDTH_DEFAULT;
+    vo.height = HEIGHT_DEFAULT;
+  }
+}
+
+static void initialize_va_ocl(){ /* Set up libva (display, NV12 surface, NV12 image) and OpenCL (cl_test_init plus the two fd-sharing extension entry points); exits on failure. */
+  int major_ver, minor_ver;
+
+  printf("\n***********************libva info: ***********************\n");
+  fflush(stdout);
+  va_dpy = va_open_display();
+  va_status = vaInitialize(va_dpy, &major_ver, &minor_ver);
+  CHECK_VASTATUS(va_status, "vaInitialize");
+
+  VASurfaceAttrib forcc; /* surface attribute requesting the NV12 pixel format */
+  forcc.type =VASurfaceAttribPixelFormat;
+  forcc.flags=VA_SURFACE_ATTRIB_SETTABLE;
+  forcc.value.type=VAGenericValueTypeInteger;
+  forcc.value.value.i = VA_FOURCC_NV12;
+  va_status = vaCreateSurfaces(va_dpy, VA_RT_FORMAT_YUV420,
+                               vo.width, vo.height,
+                               &nv12_surface_id, 1, &forcc, 1);
+  CHECK_VASTATUS(va_status, "vaCreateSurfaces");
+
+  VAImageFormat image_fmt;
+  image_fmt.fourcc = VA_FOURCC_NV12;
+  image_fmt.byte_order = VA_LSB_FIRST;
+  image_fmt.bits_per_pixel = 12;
+  va_status = vaCreateImage(va_dpy, &image_fmt, vo.width, vo.height, &nv12_image);
+  CHECK_VASTATUS(va_status, "vaCreateImage");
+
+  //ocl initialization: basic & create kernel & get extension
+  printf("\n***********************OpenCL info: ***********************\n");
+  if ((cl_status = cl_test_init("runtime_yuy2_processing.cl", "runtime_yuy2_processing", SOURCE)) != 0){
+    fprintf(stderr, "cl_test_init error\n");
+    exit(1);
+  }
+
+#ifdef CL_VERSION_1_2 /* pick the platform-scoped lookup when the headers define CL_VERSION_1_2 */
+  oclGetMemObjectFd = (OCLGETMEMOBJECTFD *)clGetExtensionFunctionAddressForPlatform(platform, "clGetMemObjectFdIntel");
+  oclCloseMemObjectFd = (OCLCLOSEMEMOBJECTFD *)clGetExtensionFunctionAddressForPlatform(platform, "clCloseMemObjectFdIntel");
+#else
+  oclGetMemObjectFd = (OCLGETMEMOBJECTFD *)clGetExtensionFunctionAddress("clGetMemObjectFdIntel");
+  oclCloseMemObjectFd = (OCLCLOSEMEMOBJECTFD *)clGetExtensionFunctionAddress("clCloseMemObjectFdIntel");
+#endif
+  if(!oclGetMemObjectFd){
+    fprintf(stderr, "Failed to get extension clGetMemObjectFdIntel\n");
+    exit(1);
+  }
+  if(!oclCloseMemObjectFd){
+    fprintf(stderr, "Failed to get extension clCloseMemObjectFdIntel\n");
+    exit(1);
+  }
+  printf("\n***********************************************************\n");
+}
+
+static void create_dmasharing_buffers()
+{ /* Create vo.buffer_num cl buffers of image_size bytes and store each buffer's fd in import_buf_fd. */
+  if(import_buf_fd == NULL) /* NOTE(review): neither malloc result is checked before the loop writes to it */
+    import_buf_fd = (int *)malloc(sizeof(int) * vo.buffer_num);
+  if(import_buf == NULL){
+    import_buf = (cl_mem *)malloc(sizeof(cl_mem) * vo.buffer_num);
+  }
+
+  for (unsigned int i = 0; i < vo.buffer_num; ++i){
+    import_buf[i] = clCreateBuffer(ctx, CL_MEM_READ_WRITE, image_size, NULL, &cl_status);
+    CHECK_CLSTATUS(cl_status, "clCreateBuffer");
+
+    //get cl buffer object's fd
+    cl_status = oclGetMemObjectFd(ctx, import_buf[i], &import_buf_fd[i]);
+    CHECK_CLSTATUS(cl_status, "clGetMemObjectFdIntel");
+  }
+}
+
+static void release_va_ocl(){ /* Destroy the NV12 surface and image, shut libva down, then close the fd of and release every cl buffer. */
+  va_status = vaDestroySurfaces(va_dpy,&nv12_surface_id,1);
+  CHECK_VASTATUS(va_status, "vaDestroySurfaces");
+  va_status = vaDestroyImage(va_dpy, nv12_image.image_id);
+  CHECK_VASTATUS(va_status, "vaDestroyImage");
+  va_status = vaTerminate(va_dpy);
+  CHECK_VASTATUS(va_status, "vaTerminate");
+  va_close_display(va_dpy);
+
+  for (unsigned int i = 0; i < vo.buffer_num; ++i) {
+    cl_status = oclCloseMemObjectFd(ctx, import_buf[i]);
+    CHECK_CLSTATUS(cl_status, "clCloseMemObjectFdIntel");
+    cl_status = clReleaseMemObject(import_buf[i]);
+    CHECK_CLSTATUS(cl_status, "clReleaseMemObject");
+  }
+}
+
+/* Run the OpenCL kernel on import_buf[index] in place, then convert the frame to NV12 and show it. */
+static void process_show_frame(int index)
+{
+  //process import_buf[index] by ocl
+  size_t global_size[2];
+  global_size[0] = vo.width * 2 / 4;
+  global_size[1] = vo.height - vo.height / 2; //kernel writes row height/2+gy: only this many rows are in the image
+  cl_status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &import_buf[index]);
+  CHECK_CLSTATUS(cl_status, "clSetKernelArg");
+  cl_status = clSetKernelArg(kernel, 1, sizeof(int), &vo.height);
+  CHECK_CLSTATUS(cl_status, "clSetKernelArg");
+  cl_status = clSetKernelArg(kernel, 2, sizeof(int), &pitch);
+  CHECK_CLSTATUS(cl_status, "clSetKernelArg");
+  cl_status = clEnqueueNDRangeKernel(queue, kernel, 2, NULL,
+                                     global_size, NULL, 0, NULL, NULL);
+  CHECK_CLSTATUS(cl_status, "clEnqueueNDRangeKernel");
+  cl_status = clFinish(queue);
+  CHECK_CLSTATUS(cl_status, "clFinish");
+
+  //create corresponding VASurface
+  VASurfaceID yuy2_surface_id;
+  VASurfaceAttrib sa[2];
+  sa[0].type = VASurfaceAttribMemoryType;
+  sa[0].flags = VA_SURFACE_ATTRIB_SETTABLE;
+  sa[0].value.type = VAGenericValueTypeInteger;
+  sa[0].value.value.i = VA_SURFACE_ATTRIB_MEM_TYPE_DRM_PRIME;
+  sa[1].type = VASurfaceAttribExternalBufferDescriptor;
+  sa[1].flags = VA_SURFACE_ATTRIB_SETTABLE;
+  sa[1].value.type = VAGenericValueTypePointer;
+  VASurfaceAttribExternalBuffers sa_eb;
+  memset(&sa_eb, 0, sizeof(sa_eb)); //fields not assigned below must not reach libva uninitialized
+  sa_eb.pixel_format = VA_FOURCC_YUY2;
+  sa_eb.width = vo.width;
+  sa_eb.height = vo.height;
+  sa_eb.data_size = image_size;
+  sa_eb.num_planes = 1;
+  sa_eb.pitches[0] = pitch;
+  sa_eb.offsets[0] = 0;
+  sa_eb.num_buffers = 1;
+  unsigned long prime_fd = import_buf_fd[index]; //single handle: a local replaces the per-frame malloc/free
+  sa_eb.buffers = &prime_fd;
+  sa_eb.flags = 0;
+  sa[1].value.value.p = &sa_eb;
+  va_status = vaCreateSurfaces(va_dpy, VA_RT_FORMAT_YUV422,
+                               vo.width, vo.height,
+                               &yuy2_surface_id, 1, sa, 2);
+  CHECK_VASTATUS(va_status, "vaCreateSurfaces");
+
+  //convert to NV12 format
+  va_status = vaGetImage (va_dpy, yuy2_surface_id, 0, 0,
+                          vo.width, vo.height, nv12_image.image_id);
+  CHECK_VASTATUS(va_status, "vaGetImage");
+  va_status = vaPutImage(va_dpy, nv12_surface_id, nv12_image.image_id,
+                         0, 0, vo.width, vo.height, 0, 0,
+                         vo.width, vo.height);
+  CHECK_VASTATUS(va_status, "vaPutImage");
+
+  //show by vaPutsurface
+  VARectangle src_rect, dst_rect;
+  src_rect.x      = 0;
+  src_rect.y      = 0;
+  src_rect.width  = vo.width;
+  src_rect.height = vo.height;
+  dst_rect        = src_rect;
+  va_status = va_put_surface(va_dpy, nv12_surface_id, &src_rect, &dst_rect);
+  CHECK_VASTATUS(va_status, "vaPutSurface");
+
+  va_status = vaDestroySurfaces(va_dpy,&yuy2_surface_id,1); //capture the result so the check below tests this call
+  CHECK_VASTATUS(va_status, "vaDestroySurfaces");
+}
+
+static void init_dmabuf(void){ /* Request vo.buffer_num DMABUF capture buffers via VIDIOC_REQBUFS, then call create_dmasharing_buffers(). */
+  int ret;
+  struct v4l2_requestbuffers reqbuf;
+
+  memset(&reqbuf, 0, sizeof(reqbuf));
+  reqbuf.type = V4L2_BUF_TYPE_VIDEO_CAPTURE;
+  reqbuf.memory = V4L2_MEMORY_DMABUF;
+  reqbuf.count = vo.buffer_num;
+
+  ret = ioctl(dev_fd, VIDIOC_REQBUFS, &reqbuf);
+  if(ret == -1 && errno == EINVAL){ /* EINVAL gets its own message; other failures go through CHECK_V4L2ERROR */
+    fprintf(stderr, "Video capturing or DMABUF streaming is not supported\n");
+    exit(1);
+  }
+  else
+    CHECK_V4L2ERROR(ret, "VIDIOC_REQBUFS");
+
+  create_dmasharing_buffers();
+  printf("Succeed to create %d dma buffers \n", vo.buffer_num);
+
+}
+
+static void init_device(void){ /* Open vo.dev_name, require streaming i/o, set YUYV at vo.width x vo.height, and record image_size and pitch; exits on failure. */
+
+  int ret;
+  struct v4l2_capability cap;
+  struct v4l2_format format;
+
+  dev_fd = open(vo.dev_name, O_RDWR | O_NONBLOCK, 0);
+  if (dev_fd < 0) {
+    fprintf(stderr, "Can not open %s: %s\n",
+        vo.dev_name, strerror(errno));
+    exit(1);
+  }
+
+  memset(&cap, 0, sizeof(cap));
+  ret = ioctl(dev_fd, VIDIOC_QUERYCAP, &cap);
+  CHECK_V4L2ERROR(ret, "VIDIOC_QUERYCAP");
+  if(!(cap.capabilities & V4L2_CAP_STREAMING)){
+    fprintf(stderr, "The device does not support streaming i/o\n");
+    exit(1);
+  }
+
+  memset(&format, 0, sizeof(format));
+  format.type = V4L2_BUF_TYPE_VIDEO_CAPTURE;
+  format.fmt.pix.width = vo.width;
+  format.fmt.pix.height = vo.height;
+  format.fmt.pix.pixelformat = V4L2_PIX_FMT_YUYV;
+  format.fmt.pix.field = V4L2_FIELD_ANY;
+
+  ret = ioctl(dev_fd, VIDIOC_S_FMT, &format);
+  CHECK_V4L2ERROR(ret, "VIDIOC_S_FMT");
+
+  ret = ioctl(dev_fd, VIDIOC_G_FMT, &format); /* read the format back and compare it with the request below */
+  CHECK_V4L2ERROR(ret, "VIDIOC_G_FMT");
+  if(format.fmt.pix.pixelformat != V4L2_PIX_FMT_YUYV){
+    fprintf(stderr, "V4L2_PIX_FMT_YUYV format is not supported by %s\n", vo.dev_name);
+    exit(1);
+  }
+  if(format.fmt.pix.width != vo.width  || format.fmt.pix.height != vo.height){
+    fprintf(stderr, "This resolution is not supported, please go through supported resolution by command './main -l'\n");
+    exit(1);
+  }
+  printf("Input image format: (width, height) = (%u, %u), pixel format = %.4s\n",
+      format.fmt.pix.width, format.fmt.pix.height, (char*)&format.fmt.pix.pixelformat);
+  image_size = format.fmt.pix.sizeimage; /* bytes per frame, used as the cl buffer size */
+	pitch = format.fmt.pix.bytesperline;
+}
+
+static void start_capturing(void){ /* Queue every fd in import_buf_fd with VIDIOC_QBUF, then issue VIDIOC_STREAMON. */
+  int ret;
+  for (unsigned int i = 0; i < vo.buffer_num; ++i) {
+    struct v4l2_buffer buf;
+
+    memset(&buf, 0, sizeof(buf));
+    buf.type = V4L2_BUF_TYPE_VIDEO_CAPTURE;
+    buf.memory = V4L2_MEMORY_DMABUF;
+    buf.index = i;
+    buf.m.fd = import_buf_fd[i];
+    ret = ioctl(dev_fd, VIDIOC_QBUF, &buf);
+    CHECK_V4L2ERROR(ret, "VIDIOC_QBUF");
+  }
+
+  int type = V4L2_BUF_TYPE_VIDEO_CAPTURE;
+  ret = ioctl(dev_fd, VIDIOC_STREAMON, &type);
+  CHECK_V4L2ERROR(ret, "VIDIOC_STREAMON");
+}
+
+/* Capture loop: wait for a frame, process and show it, requeue its buffer. The loop has no break. */
+static void mainloop(void){
+  int ret;
+  struct v4l2_buffer buf;
+  int index;
+
+  while (1) {
+    frame_count++;
+    printf("******************Frame %d\n", frame_count);
+    fd_set fds;
+    struct timeval tv;
+    int r;
+
+    FD_ZERO(&fds);
+    FD_SET(dev_fd, &fds);
+
+    /* Timeout. */
+    tv.tv_sec = 2;
+    tv.tv_usec = 0;
+
+    r = select(dev_fd + 1, &fds, NULL, NULL, &tv);
+    if (-1 == r) {
+      if (EINTR == errno)
+        continue;
+      perror("select");
+      exit(1); /* select failed for good: do not carry on to DQBUF with no frame ready */
+    }
+
+    if(r == 0){
+      fprintf(stderr, "Select timeout\n");
+      exit(1);
+    }
+
+    memset(&buf, 0, sizeof(buf));
+    buf.type = V4L2_BUF_TYPE_VIDEO_CAPTURE;
+    buf.memory = V4L2_MEMORY_DMABUF;
+    ret = ioctl(dev_fd, VIDIOC_DQBUF, &buf);
+    CHECK_V4L2ERROR(ret, "VIDIOC_DQBUF");
+    index = buf.index;
+
+    //process by ocl and show on screen by libva
+    process_show_frame(index);
+
+    //Then queue this buffer(buf.index) by QBUF
+    buf.type = V4L2_BUF_TYPE_VIDEO_CAPTURE;
+    buf.memory = V4L2_MEMORY_DMABUF;
+    buf.m.fd = import_buf_fd[index];
+    buf.index = index;
+
+    ret = ioctl(dev_fd, VIDIOC_QBUF, &buf);
+    CHECK_V4L2ERROR(ret, "VIDIOC_QBUF");
+  }
+}
+
+static void stop_capturing(void)
+{ /* Issue VIDIOC_STREAMOFF for the video-capture buffer type on dev_fd; exits on failure. */
+  int ret;
+  int type = V4L2_BUF_TYPE_VIDEO_CAPTURE;
+
+  ret = ioctl(dev_fd, VIDIOC_STREAMOFF, &type);
+  CHECK_V4L2ERROR(ret, "VIDIOC_STREAMOFF");
+}
+
+static void uninit_device(void){ /* Free the import_buf_fd and import_buf tables and close dev_fd; exits if close() fails. */
+  free(import_buf_fd);
+  free(import_buf);
+  int ret = close(dev_fd);
+  if (ret) {
+    fprintf(stderr, "Failed to close %s: %s\n",
+        vo.dev_name, strerror(errno));
+    exit(1);
+  }
+}
+
+int main(int argc, char *argv[])
+{ /* Parse options, set up the capture device, libva/OpenCL and the shared buffers, then stream. */
+  analyse_args(argc, argv);
+
+  init_device();
+  initialize_va_ocl();
+  init_dmabuf();
+
+  start_capturing();
+  mainloop(); /* NOTE(review): mainloop() loops with no break, so the teardown below looks unreachable -- confirm */
+
+  stop_capturing();
+  release_va_ocl();
+  uninit_device();
+
+  return 0;
+}
diff --git a/kernels/runtime_yuy2_processing.cl b/kernels/runtime_yuy2_processing.cl
new file mode 100644
index 0000000..1478e65
--- /dev/null
+++ b/kernels/runtime_yuy2_processing.cl
@@ -0,0 +1,15 @@
+/* For each row y at or below image_height/2, overwrite it with row (image_height - y); 4 bytes per work-item. */
+__kernel void
+runtime_yuy2_processing(__global uchar *src,
+                        int image_height,
+                        int image_pitch)
+{
+  int gx = get_global_id(0);
+  int gy = get_global_id(1);
+  int src_y = image_height / 2 + gy;
+  int mirror_y = image_height - src_y;
+
+  /* Rows past the last image row would write beyond src and read from a negative row. */
+  if (src_y >= image_height) return;
+  uchar4 mirror_val = *(__global uchar4*)(src + mirror_y*image_pitch + gx*4);
+  *(__global uchar4*)(src + src_y*image_pitch + gx*4) = mirror_val;
+}
-- 
1.9.1


From chuanbo.weng at intel.com  Fri Mar 27 09:34:17 2015
From: chuanbo.weng at intel.com (Chuanbo Weng)
Date: Sat, 28 Mar 2015 00:34:17 +0800
Subject: [Beignet] [PATCH v2 3/3] Add document to describe the detials of
	v4l2 buffer sharing.
Message-ID: <1427474057-9259-1-git-send-email-chuanbo.weng@intel.com>

This document includes the steps of using DMABUF buffer sharing between
v4l2 and Beignet. Also steps to run corresponding example.

Signed-off-by: Chuanbo Weng <chuanbo.weng at intel.com>
---
 docs/Beignet.mdwn                         |  1 +
 docs/howto/v4l2-buffer-sharing-howto.mdwn | 67 +++++++++++++++++++++++++++++++
 2 files changed, 68 insertions(+)
 create mode 100644 docs/howto/v4l2-buffer-sharing-howto.mdwn

diff --git a/docs/Beignet.mdwn b/docs/Beignet.mdwn
index aacd7d2..31c0d9a 100644
--- a/docs/Beignet.mdwn
+++ b/docs/Beignet.mdwn
@@ -254,6 +254,7 @@ Documents for OpenCL application developers
 - [[Work with old system without c++11|Beignet/howto/oldgcc-howto]]
 - [[Kernel Optimization Guide|Beignet/optimization-guide]]
 - [[Libva Buffer Sharing|Beignet/howto/libva-buffer-sharing-howto]]
+- [[V4l2 Buffer Sharing|Beignet/howto/v4l2-buffer-sharing-howto]]
 
 The wiki URL is as below:
 [http://www.freedesktop.org/wiki/Software/Beignet/](http://www.freedesktop.org/wiki/Software/Beignet/)
diff --git a/docs/howto/v4l2-buffer-sharing-howto.mdwn b/docs/howto/v4l2-buffer-sharing-howto.mdwn
new file mode 100644
index 0000000..d5a9b56
--- /dev/null
+++ b/docs/howto/v4l2-buffer-sharing-howto.mdwn
@@ -0,0 +1,67 @@
+V4l2 Buffer Sharing HowTo
+=========================
+
+Beignet has extensions (clGetMemObjectFdIntel/clCloseMemObjectFdIntel) to share gpu
+buffer object with v4l2. So users can utilize OpenCL to do processing on input/output
+buffers of v4l2 device without buffer copy.
+
+Prerequisite
+------------
+
+Linux kernel supports DMABUF buffer sharing for v4l2 from version 3.8. DMABUF buffer
+sharing runs well for V4L2_PIX_FMT_MJPEG format on this version, but there is a bug
+for V4L2_PIX_FMT_YUYV format. Linux kernel 3.19.0-rc1 fixes this bug, so please use kernel
+version 3.19.0-rc1 at least if you want to utilize this feature for V4L2_PIX_FMT_YUYV
+format.
+
+Steps
+-----
+
+The below official v4l2 document describes the details of sharing DMA buffers between
+v4l devices and other devices using v4l2 as a DMABUF importer:
+[http://linuxtv.org/downloads/v4l-dvb-apis/dmabuf.html](http://linuxtv.org/downloads/v4l-dvb-apis/dmabuf.html)
+Beignet has added extensions(clGetMemObjectFdIntel/clCloseMemObjectFdIntel) to support
+this mechanism. Please follow the steps as below to utilize DMABUF buffer sharing between
+v4l devices and Beignet:
+
+- Get the address of this extension by the function:
+  clGetExtensionFunctionAddress("clGetMemObjectFdIntel")
+  and clGetExtensionFunctionAddress("clCloseMemObjectFdIntel")
+
+- Create a number of cl buffer objects, invoke clGetMemObjectFdIntel to get these buffer
+  objects' file descriptors.
+
+- Initiate streaming I/O with DMABUF buffer sharing by calling the VIDIOC_REQBUFS v4l2 ioctl.
+
+- Enqueue these buffers by calling the VIDIOC_QBUF, dequeue a buffer by calling VIDIOC_DQBUF,
+  use OpenCL to do processing on this buffer and re-enqueue...
+
+- Close file descriptors of these buffers by clCloseMemObjectFdIntel if your program doesn't
+  need DMABUF buffer sharing anymore.
+
+Sample code
+-----------
+
+We have developed an example showing how to share DMA buffers between webcam and Beignet in
+examples/v4l2_buffer_sharing directory. The webcam directly captures V4L2_PIX_FMT_YUYV frames
+into cl buffer objects by way of DMABUF buffer sharing, then an OpenCL kernel applies a mirror
+effect to the frames, and finally they are shown on screen by libva.
+
+Steps to build and run this example:
+
+- Update your linux kernel to at least 3.19.0-rc1.
+
+- Make sure there is a webcam connected to your pc.
+
+- Add option -DBUILD_EXAMPLES=ON to enable building examples when running cmake, such as:
+  `> mkdir build`
+  `> cd build`
+  `> cmake -DBUILD_EXAMPLES=ON ../`
+
+- Build source code:
+  `> make`
+
+- Run:
+  `> cd examples`
+  `> . ../utests/setenv.sh`
+  `> ./example-v4l2_buffer_sharing`
-- 
1.9.1


From rong.r.yang at intel.com  Sun Mar 29 20:23:38 2015
From: rong.r.yang at intel.com (Yang Rong)
Date: Mon, 30 Mar 2015 11:23:38 +0800
Subject: [Beignet] [PATCH] BDW: Refine unpacked_ud in the gen8_context.cpp.
Message-ID: <1427685818-3820-1-git-send-email-rong.r.yang@intel.com>

Add a function unpacked_ud to handle unpacked_ud from long.

Signed-off-by: Yang Rong <rong.r.yang at intel.com>
---
 backend/src/backend/gen8_context.cpp | 35 +++++++++++++++++++----------------
 1 file changed, 19 insertions(+), 16 deletions(-)

diff --git a/backend/src/backend/gen8_context.cpp b/backend/src/backend/gen8_context.cpp
index 3cdf62e..920eb3e 100644
--- a/backend/src/backend/gen8_context.cpp
+++ b/backend/src/backend/gen8_context.cpp
@@ -320,6 +320,17 @@ namespace gbe
     GBE_ASSERT(0);
   }
 
+  static GenRegister unpacked_ud(GenRegister reg, uint32_t offset = 0)
+  { /* UD-typed view of 'reg' at dword 'offset'; registers with horizontal stride 0 are retyped in place, others go through GenRegister::unpacked_ud. */
+    if(reg.hstride == GEN_HORIZONTAL_STRIDE_0) {
+      if(offset == 0)
+        return GenRegister::retype(reg, GEN_TYPE_UD);
+      else /* step 'offset' UD-sized elements into the register before retyping */
+        return GenRegister::retype(GenRegister::offset(reg, 0, typeSize(GEN_TYPE_UD)*offset), GEN_TYPE_UD);
+    } else
+      return GenRegister::unpacked_ud(reg.nr, reg.subnr + offset);
+  }
+
   static void calculateFullU64MUL(GenEncoder* p, GenRegister src0, GenRegister src1, GenRegister dst_h,
                                   GenRegister dst_l, GenRegister s0l_s1h, GenRegister s0h_s1l)
   {
@@ -327,10 +338,8 @@ namespace gbe
     dst_h.type = dst_l.type = GEN_TYPE_UL;
     s0l_s1h.type = s0h_s1l.type = GEN_TYPE_UL;
 
-    GenRegister s0l = src0.hstride == GEN_HORIZONTAL_STRIDE_0 ?
-      GenRegister::retype(src0, GEN_TYPE_UD) : GenRegister::unpacked_ud(src0.nr, src0.subnr);
-    GenRegister s1l = src1.hstride == GEN_HORIZONTAL_STRIDE_0 ?
-      GenRegister::retype(src1, GEN_TYPE_UD)  : GenRegister::unpacked_ud(src1.nr, src1.subnr);
+    GenRegister s0l = unpacked_ud(src0);
+    GenRegister s1l = unpacked_ud(src1);
     GenRegister s0h = GenRegister::offset(s0l, 0, 4);
     GenRegister s1h = GenRegister::offset(s1l, 0, 4);
 
@@ -350,22 +359,18 @@ namespace gbe
         overflow and have no carry.
         By this manner, we can avoid using acc register, which has a lot of restrictions. */
 
-    GenRegister dst_l_h = dst_l.hstride == GEN_HORIZONTAL_STRIDE_0 ? GenRegister::retype(dst_l, GEN_TYPE_UD) :
-      GenRegister::unpacked_ud(dst_l.nr, dst_l.subnr + 1);
+    GenRegister dst_l_h = unpacked_ud(dst_l, 1);
     p->ADD(s0h_s1l, s0h_s1l, dst_l_h);
-    GenRegister s0l_s1h_l = s0l_s1h.hstride == GEN_HORIZONTAL_STRIDE_0 ? GenRegister::retype(s0l_s1h, GEN_TYPE_UD) :
-      GenRegister::unpacked_ud(s0l_s1h.nr, s0l_s1h.subnr);
+    GenRegister s0l_s1h_l = unpacked_ud(s0l_s1h);
     p->ADD(s0h_s1l, s0h_s1l, s0l_s1h_l);
-    GenRegister s0l_s1h_h = s0l_s1h.hstride == GEN_HORIZONTAL_STRIDE_0 ? GenRegister::retype(s0l_s1h, GEN_TYPE_UD) :
-      GenRegister::unpacked_ud(s0l_s1h.nr, s0l_s1h.subnr + 1);
+    GenRegister s0l_s1h_h = unpacked_ud(s0l_s1h, 1);
     p->ADD(dst_h, dst_h, s0l_s1h_h);
 
     // No longer need s0l_s1h
     GenRegister tmp = s0l_s1h;
 
     p->SHL(tmp, s0h_s1l, GenRegister::immud(32));
-    GenRegister tmp_unpacked = tmp.hstride == GEN_HORIZONTAL_STRIDE_0 ? GenRegister::retype(tmp, GEN_TYPE_UD) :
-      GenRegister::unpacked_ud(tmp.nr, tmp.subnr + 1);
+    GenRegister tmp_unpacked = unpacked_ud(tmp, 1);
     p->MOV(dst_l_h, tmp_unpacked);
 
     p->SHR(tmp, s0h_s1l, GenRegister::immud(32));
@@ -624,10 +629,8 @@ namespace gbe
     res.type = GEN_TYPE_UL;
 
     /* Low 32 bits X low 32 bits. */
-    GenRegister s0l = src0.hstride == GEN_HORIZONTAL_STRIDE_0 ?
-      GenRegister::retype(src0, GEN_TYPE_UD) : GenRegister::unpacked_ud(src0.nr, src0.subnr);
-    GenRegister s1l = src1.hstride == GEN_HORIZONTAL_STRIDE_0 ?
-      GenRegister::retype(src1, GEN_TYPE_UD)  : GenRegister::unpacked_ud(src1.nr, src1.subnr);
+    GenRegister s0l = unpacked_ud(src0);
+    GenRegister s1l = unpacked_ud(src1);
     p->MUL(dst, s0l, s1l);
 
     /* Low 32 bits X high 32 bits. */
-- 
2.1.0


From rong.r.yang at intel.com  Sun Mar 29 20:23:55 2015
From: rong.r.yang at intel.com (Yang Rong)
Date: Mon, 30 Mar 2015 11:23:55 +0800
Subject: [Beignet] [PATCH 1/2] CHV: Add cherryview support in the runtime.
Message-ID: <1427685836-3868-1-git-send-email-rong.r.yang@intel.com>

Cherryview's EU configuration is not determined by the PCI id; it must be obtained from the kernel through libdrm.
Thanks to Jeff for adding this support in the kernel and libdrm.

Signed-off-by: Yang Rong <rong.r.yang at intel.com>
---
 backend/src/backend/gen/gen_mesa_disasm.c |  8 ++++----
 backend/src/backend/gen_program.cpp       |  7 +++++++
 backend/src/gbe_bin_generater.cpp         |  4 ++++
 src/cl_device_data.h                      | 12 +++++++++++-
 src/cl_device_id.c                        | 29 ++++++++++++++++++++++++++++-
 src/intel/intel_gpgpu.c                   |  5 ++++-
 6 files changed, 58 insertions(+), 7 deletions(-)

diff --git a/backend/src/backend/gen/gen_mesa_disasm.c b/backend/src/backend/gen/gen_mesa_disasm.c
index 711b943..f8d89e0 100644
--- a/backend/src/backend/gen/gen_mesa_disasm.c
+++ b/backend/src/backend/gen/gen_mesa_disasm.c
@@ -1136,13 +1136,13 @@ int gen_disasm (FILE *file, const void *inst, uint32_t deviceID, uint32_t compac
 {
   int err = 0;
   int space = 0;
-  if (IS_IVYBRIDGE(deviceID)) {
+  if (IS_GEN7(deviceID)) {
     gen_version = 70;
-  } else if (IS_HASWELL(deviceID)) {
+  } else if (IS_GEN75(deviceID)) {
     gen_version = 75;
-  } else if (IS_BROADWELL(deviceID)) {
+  } else if (IS_GEN8(deviceID)) {
     gen_version = 80;
-  } else if (IS_SKYLAKE(deviceID)) {
+  } else if (IS_GEN9(deviceID)) {
     gen_version = 90;
   }
 
diff --git a/backend/src/backend/gen_program.cpp b/backend/src/backend/gen_program.cpp
index f4c74f8..f53d5fb 100644
--- a/backend/src/backend/gen_program.cpp
+++ b/backend/src/backend/gen_program.cpp
@@ -166,6 +166,8 @@ namespace gbe {
       ctx = GBE_NEW(Gen75Context, unit, name, deviceID, relaxMath);
     } else if (IS_BROADWELL(deviceID)) {
       ctx = GBE_NEW(Gen8Context, unit, name, deviceID, relaxMath);
+    } else if (IS_CHERRYVIEW(deviceID)) {
+      ctx = GBE_NEW(Gen8Context, unit, name, deviceID, relaxMath);
     } else if (IS_SKYLAKE(deviceID)) {
       ctx = GBE_NEW(Gen9Context, unit, name, deviceID, relaxMath);
     }
@@ -210,6 +212,7 @@ namespace gbe {
                                       (IS_BAYTRAIL_T(typeA) && !strcmp(src_hw_info, "BYT")) ||  \
                                       (IS_HASWELL(typeA) && !strcmp(src_hw_info, "HSW")) ||  \
                                       (IS_BROADWELL(typeA) && !strcmp(src_hw_info, "BDW")) ||  \
+                                      (IS_CHERRYVIEW(typeA) && !strcmp(src_hw_info, "CHV")) ||  \
                                       (IS_SKYLAKE(typeA) && !strcmp(src_hw_info, "SKL")) )
 
   static gbe_program genProgramNewFromBinary(uint32_t deviceID, const char *binary, size_t size) {
@@ -316,6 +319,10 @@ namespace gbe {
         src_hw_info[0]='B';
         src_hw_info[1]='D';
         src_hw_info[2]='W';
+      }else if(IS_CHERRYVIEW(prog->deviceID)){
+        src_hw_info[0]='C';
+        src_hw_info[1]='H';
+        src_hw_info[2]='V';
       }else if(IS_SKYLAKE(prog->deviceID)){
         src_hw_info[0]='S';
         src_hw_info[1]='K';
diff --git a/backend/src/gbe_bin_generater.cpp b/backend/src/gbe_bin_generater.cpp
index 8d41113..86197e1 100644
--- a/backend/src/gbe_bin_generater.cpp
+++ b/backend/src/gbe_bin_generater.cpp
@@ -178,6 +178,10 @@ void program_build_instance::serialize_program(void) throw(int)
         src_hw_info[0]='B';
         src_hw_info[1]='D';
         src_hw_info[2]='W';
+    }else if(IS_CHERRYVIEW(gen_pci_id)){
+        src_hw_info[0]='C';
+        src_hw_info[1]='H';
+        src_hw_info[2]='V';
     }else if(IS_SKYLAKE(gen_pci_id)){
         src_hw_info[0]='S';
         src_hw_info[1]='K';
diff --git a/src/cl_device_data.h b/src/cl_device_data.h
index d6f8209..6872106 100644
--- a/src/cl_device_data.h
+++ b/src/cl_device_data.h
@@ -228,7 +228,17 @@
    devid == PCI_CHIP_BROADWLL_U_GT3)
 
 #define IS_BROADWELL(devid) (IS_BRW_GT1(devid) || IS_BRW_GT2(devid) || IS_BRW_GT3(devid))
-#define IS_GEN8(devid)      IS_BROADWELL(devid)
+
+#define PCI_CHIP_CHV_0 0x22B0
+#define PCI_CHIP_CHV_1 0x22B1
+#define PCI_CHIP_CHV_2 0x22B2
+#define PCI_CHIP_CHV_3 0x22B3
+#define IS_CHERRYVIEW(devid) (devid == PCI_CHIP_CHV_0 ||   \
+   devid == PCI_CHIP_CHV_1 || \
+   devid == PCI_CHIP_CHV_2 || \
+   devid == PCI_CHIP_CHV_3)
+
+#define IS_GEN8(devid)      (IS_BROADWELL(devid) || IS_CHERRYVIEW(devid))
 
 /* SKL */
 #define PCI_CHIP_SKYLAKE_ULT_GT1	0x1906   /* Intel(R) Skylake ULT - GT1 */
diff --git a/src/cl_device_id.c b/src/cl_device_id.c
index fefcef3..9d18b20 100644
--- a/src/cl_device_id.c
+++ b/src/cl_device_id.c
@@ -140,6 +140,18 @@ static struct _cl_device_id intel_brw_gt3_device = {
 #include "cl_gen75_device.h"
 };
 
+//Cherryview has the same pciid, must get the max_compute_unit and max_thread_per_unit from drm
+static struct _cl_device_id intel_chv_device = {
+  INIT_ICD(dispatch)
+  .max_compute_unit = 6,
+  .max_thread_per_unit = 7,
+  .sub_slice_count = 2,
+  .max_work_item_sizes = {512, 512, 512},
+  .max_work_group_size = 512,
+  .max_clock_frequency = 1000,
+#include "cl_gen75_device.h"
+};
+
 /* XXX we clone brw now */
 static struct _cl_device_id intel_skl_gt1_device = {
   INIT_ICD(dispatch)
@@ -423,6 +435,18 @@ brw_gt3_break:
       ret = &intel_brw_gt3_device;
       break;
 
+    case PCI_CHIP_CHV_0:
+    case PCI_CHIP_CHV_1:
+    case PCI_CHIP_CHV_2:
+    case PCI_CHIP_CHV_3:
+      DECL_INFO_STRING(chv_break, intel_chv_device, name, "Intel(R) HD Graphics Cherryview");
+chv_break:
+      intel_chv_device.vendor_id = device_id;
+      intel_chv_device.platform = intel_platform;
+      ret = &intel_chv_device;
+      break;
+
+
 	  case PCI_CHIP_SKYLAKE_ULT_GT1:
 		DECL_INFO_STRING(skl_gt1_break, intel_skl_gt1_device, name, "Intel(R) HD Graphics Skylake ULT GT1");
 	  case PCI_CHIP_SKYLAKE_ULX_GT1:
@@ -590,6 +614,7 @@ cl_get_device_info(cl_device_id     device,
                device != &intel_brw_gt1_device &&
                device != &intel_brw_gt2_device &&
                device != &intel_brw_gt3_device &&
+               device != &intel_chv_device &&
                device != &intel_skl_gt1_device &&
                device != &intel_skl_gt2_device &&
                device != &intel_skl_gt3_device &&
@@ -699,6 +724,7 @@ cl_device_get_version(cl_device_id device, cl_int *ver)
                device != &intel_brw_gt1_device &&
                device != &intel_brw_gt2_device &&
                device != &intel_brw_gt3_device &&
+               device != &intel_chv_device &&
                device != &intel_skl_gt1_device &&
                device != &intel_skl_gt2_device &&
                device != &intel_skl_gt3_device &&
@@ -714,7 +740,7 @@ cl_device_get_version(cl_device_id device, cl_int *ver)
         || device == &intel_hsw_gt3_device) {
     *ver = 75;
   } else if (device == &intel_brw_gt1_device || device == &intel_brw_gt2_device
-        || device == &intel_brw_gt3_device) {
+        || device == &intel_brw_gt3_device || device == &intel_chv_device) {
     *ver = 8;
   } else if (device == &intel_skl_gt1_device || device == &intel_skl_gt2_device
         || device == &intel_skl_gt3_device || device == &intel_skl_gt4_device) {
@@ -801,6 +827,7 @@ cl_get_kernel_workgroup_info(cl_kernel kernel,
                device != &intel_brw_gt1_device &&
                device != &intel_brw_gt2_device &&
                device != &intel_brw_gt3_device &&
+               device != &intel_chv_device &&
                device != &intel_skl_gt1_device &&
                device != &intel_skl_gt2_device &&
                device != &intel_skl_gt3_device &&
diff --git a/src/intel/intel_gpgpu.c b/src/intel/intel_gpgpu.c
index 177ac04..ee440dc 100644
--- a/src/intel/intel_gpgpu.c
+++ b/src/intel/intel_gpgpu.c
@@ -1142,6 +1142,7 @@ static uint32_t get_surface_type(intel_gpgpu_t *gpgpu, int index, cl_mem_object_
   if (((IS_IVYBRIDGE(gpgpu->drv->device_id) ||
         IS_HASWELL(gpgpu->drv->device_id) ||
         IS_BROADWELL(gpgpu->drv->device_id) ||
+        IS_CHERRYVIEW(gpgpu->drv->device_id) ||
         IS_SKYLAKE(gpgpu->drv->device_id))) &&
       index >= BTI_WORKAROUND_IMAGE_OFFSET + BTI_RESERVED_NUM &&
       type == CL_MEM_OBJECT_IMAGE1D_ARRAY)
@@ -2156,13 +2157,15 @@ intel_set_gpgpu_callbacks(int device_id)
   cl_gpgpu_set_printf_info = (cl_gpgpu_set_printf_info_cb *)intel_gpgpu_set_printf_info;
   cl_gpgpu_get_printf_info = (cl_gpgpu_get_printf_info_cb *)intel_gpgpu_get_printf_info;
 
-  if (IS_BROADWELL(device_id)) {
+  if (IS_BROADWELL(device_id) || IS_CHERRYVIEW(device_id)) {
     cl_gpgpu_bind_image = (cl_gpgpu_bind_image_cb *) intel_gpgpu_bind_image_gen8;
     intel_gpgpu_set_L3 = intel_gpgpu_set_L3_gen8;
     cl_gpgpu_get_cache_ctrl = (cl_gpgpu_get_cache_ctrl_cb *)intel_gpgpu_get_cache_ctrl_gen8;
     intel_gpgpu_get_scratch_index = intel_gpgpu_get_scratch_index_gen8;
     intel_gpgpu_post_action = intel_gpgpu_post_action_gen7; //BDW need not restore SLM, same as gen7
     intel_gpgpu_read_ts_reg = intel_gpgpu_read_ts_reg_gen7;
+    if(IS_CHERRYVIEW(device_id))
+      intel_gpgpu_read_ts_reg = intel_gpgpu_read_ts_reg_baytrail;
     intel_gpgpu_set_base_address = intel_gpgpu_set_base_address_gen8;
     intel_gpgpu_setup_bti = intel_gpgpu_setup_bti_gen8;
     intel_gpgpu_load_vfe_state = intel_gpgpu_load_vfe_state_gen8;
-- 
2.1.0


From rong.r.yang at intel.com  Sun Mar 29 20:23:56 2015
From: rong.r.yang at intel.com (Yang Rong)
Date: Mon, 30 Mar 2015 11:23:56 +0800
Subject: [Beignet] [PATCH 2/2] Chv: Add chv backend support.
In-Reply-To: <1427685836-3868-1-git-send-email-rong.r.yang@intel.com>
References: <1427685836-3868-1-git-send-email-rong.r.yang@intel.com>
Message-ID: <1427685836-3868-2-git-send-email-rong.r.yang@intel.com>

The chv backend is almost the same as the bdw one, but it has some long register restrictions:
1. ARF registers must never be used with 64b datatype.
2. Source and Destination horizontal stride must be aligned to the same qword.
3. Source and Destination offset must be the same, except the case of scalar source.

Add ChvContext in gen8_context.cpp to handle them. The chv encoder is the same as Gen8Encoder.

Signed-off-by: Yang Rong <rong.r.yang at intel.com>
---
 backend/src/backend/gen8_context.cpp       | 147 +++++++++++++++++++++++++++--
 backend/src/backend/gen8_context.hpp       |  23 +++++
 backend/src/backend/gen_insn_selection.cpp |  47 ++++++++-
 backend/src/backend/gen_insn_selection.hpp |   7 ++
 backend/src/backend/gen_program.cpp        |   2 +-
 5 files changed, 216 insertions(+), 10 deletions(-)

diff --git a/backend/src/backend/gen8_context.cpp b/backend/src/backend/gen8_context.cpp
index 920eb3e..283e362 100644
--- a/backend/src/backend/gen8_context.cpp
+++ b/backend/src/backend/gen8_context.cpp
@@ -331,7 +331,7 @@ namespace gbe
       return GenRegister::unpacked_ud(reg.nr, reg.subnr + offset);
   }
 
-  static void calculateFullU64MUL(GenEncoder* p, GenRegister src0, GenRegister src1, GenRegister dst_h,
+  void Gen8Context::calculateFullU64MUL(GenRegister src0, GenRegister src1, GenRegister dst_h,
                                   GenRegister dst_l, GenRegister s0l_s1h, GenRegister s0h_s1l)
   {
     src0.type = src1.type = GEN_TYPE_UD;
@@ -377,7 +377,7 @@ namespace gbe
     p->ADD(dst_h, dst_h, tmp);
   }
 
-  static void calculateFullS64MUL(GenEncoder* p, GenRegister src0, GenRegister src1, GenRegister dst_h,
+  void Gen8Context::calculateFullS64MUL(GenRegister src0, GenRegister src1, GenRegister dst_h,
                                   GenRegister dst_l, GenRegister s0_abs, GenRegister s1_abs, 
                                   GenRegister tmp0, GenRegister tmp1, GenRegister sign, GenRegister flagReg)
   {
@@ -395,7 +395,7 @@ namespace gbe
     s0_abs.type = s1_abs.type = GEN_TYPE_L;
     p->MOV(s0_abs, GenRegister::abs(src0));
     p->MOV(s1_abs, GenRegister::abs(src1));
-    calculateFullU64MUL(p, s0_abs, s1_abs, dst_h, dst_l, tmp0, tmp1);
+    calculateFullU64MUL(s0_abs, s1_abs, dst_h, dst_l, tmp0, tmp1);
 
     p->push();
     p->curr.predicate = GEN_PREDICATE_NONE;
@@ -432,11 +432,11 @@ namespace gbe
 
     if(src0.type == GEN_TYPE_UL) {
       GBE_ASSERT(src1.type == GEN_TYPE_UL);
-      calculateFullU64MUL(p, src0, src1, dst_h, dst_l, tmp0, tmp1);
+      calculateFullU64MUL(src0, src1, dst_h, dst_l, tmp0, tmp1);
     } else {
       GBE_ASSERT(src0.type == GEN_TYPE_L);
       GBE_ASSERT(src1.type == GEN_TYPE_L);
-      calculateFullS64MUL(p, src0, src1, dst_h, dst_l, s0_abs, s1_abs, tmp0,
+      calculateFullS64MUL(src0, src1, dst_h, dst_l, s0_abs, s1_abs, tmp0,
                           tmp1, sign, flagReg);
     }
   }
@@ -461,7 +461,7 @@ namespace gbe
       GBE_ASSERT(src2.type == GEN_TYPE_UL);
       dst_l.type = dst_h.type = GEN_TYPE_UL;
       tmp0.type = tmp1.type = GEN_TYPE_UL;
-      calculateFullU64MUL(p, src0, src1, dst_h, dst_l, tmp0, tmp1);
+      calculateFullU64MUL(src0, src1, dst_h, dst_l, tmp0, tmp1);
 
       /* Inplement the logic:
       dst_l += src2;
@@ -496,7 +496,7 @@ namespace gbe
       GBE_ASSERT(src1.type == GEN_TYPE_L);
       GBE_ASSERT(src2.type == GEN_TYPE_L);
 
-      calculateFullS64MUL(p, src0, src1, dst_h, dst_l, s0_abs, s1_abs, tmp0,
+      calculateFullS64MUL(src0, src1, dst_h, dst_l, s0_abs, s1_abs, tmp0,
                           tmp1, sign, flagReg);
 
       GenRegister sum = sign;
@@ -904,4 +904,137 @@ namespace gbe
     memcpy(this->a0, new_a0, sizeof(uint16_t)*sz);
   }
 
+  void ChvContext::newSelection(void) {
+    this->sel = GBE_NEW(SelectionChv, *this);
+  }
+
+  void ChvContext::calculateFullU64MUL(GenRegister src0, GenRegister src1, GenRegister dst_h,
+                                             GenRegister dst_l, GenRegister s0l_s1h, GenRegister s0h_s1l)
+  {
+    src0.type = src1.type = GEN_TYPE_UD;
+    dst_h.type = dst_l.type = GEN_TYPE_UL;
+    s0l_s1h.type = s0h_s1l.type = GEN_TYPE_UL;
+
+    //GenRegister tmp;
+
+    GenRegister s0l = unpacked_ud(src0);
+    GenRegister s1l = unpacked_ud(src1);
+    GenRegister s0h = unpacked_ud(s0l_s1h); //s0h only used before s0l_s1h, reuse s0l_s1h
+    GenRegister s1h = unpacked_ud(dst_l); //s1h only used before dst_l, reuse dst_l
+
+    p->MOV(s0h, GenRegister::offset(s0l, 0, 4));
+    p->MOV(s1h, GenRegister::offset(s1l, 0, 4));
+
+    /* High 32 bits X High 32 bits. */
+    p->MUL(dst_h, s0h, s1h);
+    /* High 32 bits X low 32 bits. */
+    p->MUL(s0h_s1l, s0h, s1l);
+    /* Low 32 bits X high 32 bits. */
+    p->MUL(s0l_s1h, s0l, s1h);
+    /* Low 32 bits X low 32 bits. */
+    p->MUL(dst_l, s0l, s1l);
+
+    /*  Because the max product of s0l*s1h is (2^N - 1) * (2^N - 1) = 2^2N + 1 - 2^(N+1), here N = 32
+        The max of addding 2 32bits integer to it is
+        2^2N + 1 - 2^(N+1) + 2*(2^N - 1) = 2^2N - 1
+        which means the product s0h_s1l adds dst_l's high 32 bits and then adds s0l_s1h's low 32 bits will not
+        overflow and have no carry.
+        By this manner, we can avoid using acc register, which has a lot of restrictions. */
+
+    GenRegister s0l_s1h_l = unpacked_ud(s0l_s1h);
+    p->ADD(s0h_s1l, s0h_s1l, s0l_s1h_l);
+
+    p->SHR(s0l_s1h, s0l_s1h, GenRegister::immud(32));
+    GenRegister s0l_s1h_h = unpacked_ud(s0l_s1h);
+    p->ADD(dst_h, dst_h, s0l_s1h_h);
+
+    GenRegister dst_l_h = unpacked_ud(s0l_s1h);
+    p->MOV(dst_l_h, unpacked_ud(dst_l, 1));
+    p->ADD(s0h_s1l, s0h_s1l, dst_l_h);
+
+    // No longer need s0l_s1h
+    GenRegister tmp = s0l_s1h;
+
+    p->SHL(tmp, s0h_s1l, GenRegister::immud(32));
+    GenRegister tmp_unpacked = unpacked_ud(tmp, 1);
+    p->MOV(unpacked_ud(dst_l, 1), tmp_unpacked);
+
+    p->SHR(tmp, s0h_s1l, GenRegister::immud(32));
+    p->ADD(dst_h, dst_h, tmp);
+  }
+
+  void ChvContext::emitI64MULInstruction(const SelectionInstruction &insn)
+  {
+    GenRegister src0 = ra->genReg(insn.src(0));
+    GenRegister src1 = ra->genReg(insn.src(1));
+    GenRegister dst = ra->genReg(insn.dst(0));
+    GenRegister res = ra->genReg(insn.dst(1));
+
+    src0.type = src1.type = GEN_TYPE_UD;
+    dst.type = GEN_TYPE_UL;
+    res.type = GEN_TYPE_UL;
+
+    /* Low 32 bits X low 32 bits. */
+    GenRegister s0l = unpacked_ud(src0);
+    GenRegister s1l = unpacked_ud(src1);
+    p->MUL(dst, s0l, s1l);
+
+    /* Low 32 bits X high 32 bits. */
+    GenRegister s1h = unpacked_ud(res);
+    p->MOV(s1h, unpacked_ud(src1, 1));
+
+    p->MUL(res, s0l, s1h);
+    p->SHL(res, res, GenRegister::immud(32));
+    p->ADD(dst, dst, res);
+
+    /* High 32 bits X low 32 bits. */
+    GenRegister s0h = unpacked_ud(res);
+    p->MOV(s0h, unpacked_ud(src0, 1));
+
+    p->MUL(res, s0h, s1l);
+    p->SHL(res, res, GenRegister::immud(32));
+    p->ADD(dst, dst, res);
+  }
+
+  void ChvContext::setA0Content(uint16_t new_a0[16], uint16_t max_offset, int sz) {
+    int16_t diff = new_a0[0] - this->a0[0];
+    if (sz == 0)
+      sz = 16;
+    GBE_ASSERT(sz%4 == 0);
+    GBE_ASSERT(new_a0[0] >= 0 && new_a0[0] < 4096);
+    bool need_reset = false;
+    for (int i = 1; i < sz; i++) {
+      GBE_ASSERT(new_a0[i] >= 0 && new_a0[0] < 4096);
+      int16_t d = new_a0[i] - this->a0[i];
+      if (diff != d) {
+        need_reset = true;
+        break;
+      }
+    }
+
+    GBE_ASSERT(this->a0[0] + diff < 4096 && this->a0[0] + diff >= 0);
+    if (!need_reset && diff >= -512 && diff + max_offset <= 511) {
+      return;
+    } else if (!need_reset && sz == 16) {
+      p->push();
+      p->curr.execWidth = 16;
+      p->curr.predicate = GEN_PREDICATE_NONE;
+      p->curr.noMask = 1;
+      p->ADD(GenRegister::retype(GenRegister::addr8(0), GEN_TYPE_W),
+          GenRegister::retype(GenRegister::addr8(0), GEN_TYPE_W), GenRegister::immw(diff));
+      p->pop();
+    } else {
+      p->push();
+      p->curr.execWidth = 1;
+      p->curr.predicate = GEN_PREDICATE_NONE;
+      p->curr.noMask = 1;
+      for (int i = 0; i < sz/2; i++) {
+        p->MOV(GenRegister::retype(GenRegister::addr1(i*2), GEN_TYPE_UD),
+            GenRegister::immud(new_a0[i*2 + 1] << 16 | new_a0[i*2]));
+      }
+      p->pop();
+    }
+    memcpy(this->a0, new_a0, sizeof(uint16_t)*sz);
+  }
+
 }
diff --git a/backend/src/backend/gen8_context.hpp b/backend/src/backend/gen8_context.hpp
index b296a3d..8827955 100644
--- a/backend/src/backend/gen8_context.hpp
+++ b/backend/src/backend/gen8_context.hpp
@@ -82,6 +82,29 @@ namespace gbe
     virtual void newSelection(void);
     void packLongVec(GenRegister unpacked, GenRegister packed, uint32_t simd);
     void unpackLongVec(GenRegister packed, GenRegister unpacked, uint32_t simd);
+    void calculateFullS64MUL(GenRegister src0, GenRegister src1, GenRegister dst_h,
+                             GenRegister dst_l, GenRegister s0_abs, GenRegister s1_abs,
+                             GenRegister tmp0, GenRegister tmp1, GenRegister sign, GenRegister flagReg);
+    virtual void calculateFullU64MUL(GenRegister src0, GenRegister src1, GenRegister dst_h,
+                                           GenRegister dst_l, GenRegister s0l_s1h, GenRegister s0h_s1l);
+  };
+
+  class ChvContext : public Gen8Context
+  {
+  public:
+    virtual ~ChvContext(void) { }
+    ChvContext(const ir::Unit &unit, const std::string &name, uint32_t deviceID, bool relaxMath = false)
+            : Gen8Context(unit, name, deviceID, relaxMath) {
+    };
+    virtual void emitI64MULInstruction(const SelectionInstruction &insn);
+
+  protected:
+    virtual void setA0Content(uint16_t new_a0[16], uint16_t max_offset = 0, int sz = 0);
+
+  private:
+    virtual void newSelection(void);
+    virtual void calculateFullU64MUL(GenRegister src0, GenRegister src1, GenRegister dst_h,
+                                           GenRegister dst_l, GenRegister s0l_s1h, GenRegister s0h_s1l);
   };
 }
 #endif /* __GBE_GEN8_CONTEXT_HPP__ */
diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp
index 7f9c95a..67a1d95 100644
--- a/backend/src/backend/gen_insn_selection.cpp
+++ b/backend/src/backend/gen_insn_selection.cpp
@@ -361,6 +361,8 @@ namespace gbe
     void setHas32X32Mul(bool b) { bHas32X32Mul = b; }
     bool hasLongType() const { return bHasLongType; }
     void setHasLongType(bool b) { bHasLongType = b; }
+    bool hasLongRegRestrict() { return bLongRegRestrict; }
+    void setLongRegRestrict(bool b) { bLongRegRestrict = b; }
     void setLdMsgOrder(uint32_t type)  { ldMsgOrder = type; }
     uint32_t getLdMsgOrder()  const { return ldMsgOrder; }
     /*! indicate whether a register is a scalar/uniform register. */
@@ -662,6 +664,7 @@ namespace gbe
     uint16_t currAuxLabel;
     bool bHas32X32Mul;
     bool bHasLongType;
+    bool bLongRegRestrict;
     uint32_t ldMsgOrder;
     INLINE ir::LabelIndex newAuxLabel()
     {
@@ -702,7 +705,7 @@ namespace gbe
     curr(ctx.getSimdWidth()), file(ctx.getFunction().getRegisterFile()),
     maxInsnNum(ctx.getFunction().getLargestBlockSize()), dagPool(maxInsnNum),
     stateNum(0), vectorNum(0), bwdCodeGeneration(false), currAuxLabel(ctx.getFunction().labelNum()),
-    bHas32X32Mul(false), bHasLongType(false), ldMsgOrder(LD_MSG_ORDER_IVB)
+    bHas32X32Mul(false), bHasLongType(false), bLongRegRestrict(false), ldMsgOrder(LD_MSG_ORDER_IVB)
   {
     const ir::Function &fn = ctx.getFunction();
     this->regNum = fn.regNum();
@@ -1860,6 +1863,12 @@ namespace gbe
     this->opaque->setHasLongType(true);
   }
 
+  SelectionChv::SelectionChv(GenContext &ctx) : Selection(ctx) {
+    this->opaque->setHas32X32Mul(true);
+    this->opaque->setHasLongType(true);
+    this->opaque->setLongRegRestrict(true);
+  }
+
   Selection9::Selection9(GenContext &ctx) : Selection(ctx) {
     this->opaque->setHas32X32Mul(true);
     this->opaque->setHasLongType(true);
@@ -4030,7 +4039,41 @@ namespace gbe
             sel.MOV(dst, unpacked);
           }
         }
-      } else if ((dst.isdf() && srcType == ir::TYPE_FLOAT) ||
+      }   else if (sel.hasLongType() && sel.hasLongRegRestrict() && dstFamily == FAMILY_QWORD && srcFamily != FAMILY_QWORD) {
+        // Convert i32/i16/i8/float to i64/double if hasLongRegRestrict(src and dst hstride must be aligned to the same qword).
+        GenRegister unpacked;
+        GenRegister unpacked_src = src;
+
+        sel.push();
+          if (sel.isScalarReg(insn.getSrc(0))) {
+            sel.curr.execWidth = 1;
+            sel.curr.predicate = GEN_PREDICATE_NONE;
+            sel.curr.noMask = 1;
+          }
+
+          if(srcType == ir::TYPE_FLOAT) {
+            unpacked = sel.unpacked_ud(sel.reg(FAMILY_QWORD, sel.isScalarReg(insn.getSrc(0))));
+            unpacked = GenRegister::retype(unpacked, GEN_TYPE_F);
+          } else if(srcFamily == FAMILY_DWORD) {
+            unpacked = sel.unpacked_ud(sel.reg(FAMILY_QWORD, sel.isScalarReg(insn.getSrc(0))));
+            unpacked = GenRegister::retype(unpacked, dstType == TYPE_U64 ? GEN_TYPE_UD : GEN_TYPE_D);
+          } else if(srcFamily == FAMILY_WORD) {
+            unpacked = sel.unpacked_uw(sel.reg(FAMILY_QWORD, sel.isScalarReg(insn.getSrc(0))));
+            unpacked = GenRegister::retype(unpacked, dstType == TYPE_U64 ? GEN_TYPE_UW : GEN_TYPE_W);
+          } else if(srcFamily == FAMILY_BYTE) {
+            GenRegister tmp = sel.selReg(sel.reg(FAMILY_WORD, sel.isScalarReg(insn.getSrc(0))));
+            tmp = GenRegister::retype(tmp, dstType == TYPE_U64 ? GEN_TYPE_UW : GEN_TYPE_W);
+            unpacked = sel.unpacked_uw(sel.reg(FAMILY_QWORD, sel.isScalarReg(insn.getSrc(0))));
+            unpacked = GenRegister::retype(unpacked, dstType == TYPE_U64 ? GEN_TYPE_UW : GEN_TYPE_W);
+            sel.MOV(tmp, src);
+            unpacked_src = tmp;
+          } else
+            GBE_ASSERT(0);
+
+          sel.MOV(unpacked, unpacked_src);
+        sel.pop();
+        sel.MOV(dst, unpacked);
+      }else if ((dst.isdf() && srcType == ir::TYPE_FLOAT) ||
                  (src.isdf() && dstType == ir::TYPE_FLOAT)) { // float and double conversion
         ir::Register r = sel.reg(ir::RegisterFamily::FAMILY_QWORD);
         sel.MOV_DF(dst, src, sel.selReg(r, TYPE_U64));
diff --git a/backend/src/backend/gen_insn_selection.hpp b/backend/src/backend/gen_insn_selection.hpp
index 6a08180..ee5e46f 100644
--- a/backend/src/backend/gen_insn_selection.hpp
+++ b/backend/src/backend/gen_insn_selection.hpp
@@ -294,6 +294,13 @@ namespace gbe
       Selection8(GenContext &ctx);
   };
 
+  class SelectionChv: public Selection
+  {
+    public:
+      /*! Initialize internal structures used for the selection */
+      SelectionChv(GenContext &ctx);
+  };
+
   class Selection9: public Selection
   {
     public:
diff --git a/backend/src/backend/gen_program.cpp b/backend/src/backend/gen_program.cpp
index f53d5fb..c761a2f 100644
--- a/backend/src/backend/gen_program.cpp
+++ b/backend/src/backend/gen_program.cpp
@@ -167,7 +167,7 @@ namespace gbe {
     } else if (IS_BROADWELL(deviceID)) {
       ctx = GBE_NEW(Gen8Context, unit, name, deviceID, relaxMath);
     } else if (IS_CHERRYVIEW(deviceID)) {
-      ctx = GBE_NEW(Gen8Context, unit, name, deviceID, relaxMath);
+      ctx = GBE_NEW(ChvContext, unit, name, deviceID, relaxMath);
     } else if (IS_SKYLAKE(deviceID)) {
       ctx = GBE_NEW(Gen9Context, unit, name, deviceID, relaxMath);
     }
-- 
2.1.0


From feng.yuan at intel.com  Mon Mar 30 19:00:53 2015
From: feng.yuan at intel.com (Yuan, Feng)
Date: Tue, 31 Mar 2015 02:00:53 +0000
Subject: [Beignet] [PATCH v2 1/3] Add extension
	clCloseMemObjectFdIntel().
In-Reply-To: <1427474041-9171-1-git-send-email-chuanbo.weng@intel.com>
References: <1427474041-9171-1-git-send-email-chuanbo.weng@intel.com>
Message-ID: <D04FD836CC6FBE4FAC7A57BECF784F5E01C90FDD@SHSMSX101.ccr.corp.intel.com>

Looks good to me.

>-----Original Message-----
>From: Weng, Chuanbo
>Sent: Saturday, March 28, 2015 12:34 AM
>To: beignet at lists.freedesktop.org
>Cc: Yuan, Feng; Weng, Chuanbo
>Subject: [PATCH v2 1/3] Add extension clCloseMemObjectFdIntel().
>
>We have added the extension clGetMemObjectFdIntel to export the fd of a memory
>object, so we have to add a corresponding extension to close the fd.
>
>Signed-off-by: Chuanbo Weng <chuanbo.weng at intel.com>
>---
> include/CL/cl_intel.h    | 11 ++++++++++-
> src/cl_api.c             | 15 +++++++++++++++
> src/cl_driver.h          |  3 +++
> src/cl_driver_defs.c     |  1 +
> src/cl_mem.c             | 32 ++++++++++++++++++++++++++++++--
> src/cl_mem.h             |  4 ++++
> src/intel/intel_driver.c |  1 +
> 7 files changed, 64 insertions(+), 3 deletions(-)
>
>diff --git a/include/CL/cl_intel.h b/include/CL/cl_intel.h index
>28bcb62..2ab79ad 100644
>--- a/include/CL/cl_intel.h
>+++ b/include/CL/cl_intel.h
>@@ -122,7 +122,7 @@ typedef CL_API_ENTRY cl_mem (CL_API_CALL
>*clCreateImageFromLibvaIntel_fn)(
>                              const cl_libva_image * /* info */,
>                              cl_int *               /* errcode_ret
>*/);
>
>-/* Create buffer from libva's buffer object */
>+/*Export memory object's fd*/
> extern CL_API_ENTRY cl_int CL_API_CALL
> clGetMemObjectFdIntel(cl_context   /* context */,
>                       cl_mem       /* Memory Obejct */,
>@@ -133,6 +133,15 @@ typedef CL_API_ENTRY cl_int (CL_API_CALL
>*clGetMemObjectFdIntel_fn)(
>                              cl_mem       /* Memory Obejct */,
>                              int*         /* returned fd */);
>
>+/*Close memory object's fd*/
>+extern CL_API_ENTRY cl_int CL_API_CALL
>+clCloseMemObjectFdIntel(cl_context   /* context */,
>+                        cl_mem       /* Memory Obejct */);
>+
>+typedef CL_API_ENTRY cl_int (CL_API_CALL *clCloseMemObjectFdIntel_fn)(
>+                             cl_context   /* context */,
>+                             cl_mem       /* Memory Obejct */);
>+
> #ifdef __cplusplus
> }
> #endif
>diff --git a/src/cl_api.c b/src/cl_api.c index 3e72deb..f9efac7 100644
>--- a/src/cl_api.c
>+++ b/src/cl_api.c
>@@ -3180,6 +3180,7 @@ internal_clGetExtensionFunctionAddress(const
>char *func_name)
>   EXTFUNC(clCreateBufferFromLibvaIntel)
>   EXTFUNC(clCreateImageFromLibvaIntel)
>   EXTFUNC(clGetMemObjectFdIntel)
>+  EXTFUNC(clCloseMemObjectFdIntel)
>   return NULL;
> }
>
>@@ -3348,3 +3349,17 @@ clGetMemObjectFdIntel(cl_context context,
> error:
>   return err;
> }
>+
>+extern CL_API_ENTRY cl_int CL_API_CALL
>+clCloseMemObjectFdIntel(cl_context context,
>+                        cl_mem memobj)
>+{
>+  cl_int err = CL_SUCCESS;
>+  CHECK_CONTEXT (context);
>+  CHECK_MEM (memobj);
>+
>+  err = cl_mem_close_fd(memobj);
>+
>+error:
>+  return err;
>+}
>diff --git a/src/cl_driver.h b/src/cl_driver.h index 3f54a27..dd028c0 100644
>--- a/src/cl_driver.h
>+++ b/src/cl_driver.h
>@@ -369,6 +369,9 @@ extern cl_buffer_wait_rendering_cb
>*cl_buffer_wait_rendering;  typedef int (cl_buffer_get_fd_cb)(cl_buffer, int
>*fd);  extern cl_buffer_get_fd_cb *cl_buffer_get_fd;
>
>+typedef int (cl_buffer_close_fd_cb)(int fd); extern
>+cl_buffer_close_fd_cb *cl_buffer_close_fd;
>+
> typedef int (cl_buffer_get_tiling_align_cb)(cl_context ctx, uint32_t
>tiling_mode, uint32_t dim);  extern cl_buffer_get_tiling_align_cb
>*cl_buffer_get_tiling_align;
>
>diff --git a/src/cl_driver_defs.c b/src/cl_driver_defs.c index
>9a47210..8018747 100644
>--- a/src/cl_driver_defs.c
>+++ b/src/cl_driver_defs.c
>@@ -51,6 +51,7 @@ LOCAL cl_buffer_wait_rendering_cb
>*cl_buffer_wait_rendering = NULL;  LOCAL
>cl_buffer_get_buffer_from_libva_cb *cl_buffer_get_buffer_from_libva =
>NULL;  LOCAL cl_buffer_get_image_from_libva_cb
>*cl_buffer_get_image_from_libva = NULL;  LOCAL cl_buffer_get_fd_cb
>*cl_buffer_get_fd = NULL;
>+LOCAL cl_buffer_close_fd_cb *cl_buffer_close_fd = NULL;
> LOCAL cl_buffer_get_tiling_align_cb *cl_buffer_get_tiling_align = NULL;
>
> /* cl_khr_gl_sharing */
>diff --git a/src/cl_mem.c b/src/cl_mem.c index b41ec14..08fb239 100644
>--- a/src/cl_mem.c
>+++ b/src/cl_mem.c
>@@ -267,6 +267,8 @@ cl_mem_allocate(enum cl_mem_type type,
>   mem->flags = flags;
>   mem->is_userptr = 0;
>   mem->offset = 0;
>+  mem->export_ref = 0;
>+  pthread_mutex_init(&mem->export_lock, NULL);
>
>   if (sz != 0) {
>     /* Pinning will require stricter alignment rules */ @@ -2051,7 +2053,33
>@@ cl_mem_get_fd(cl_mem mem,
>               int* fd)
> {
>   cl_int err = CL_SUCCESS;
>-  if(cl_buffer_get_fd(mem->bo, fd))
>-	err = CL_INVALID_OPERATION;
>+
>+  pthread_mutex_lock(&mem->export_lock);
>+  if(mem->export_ref == 0){
>+    if(cl_buffer_get_fd(mem->bo, fd))
>+      err = CL_INVALID_OPERATION;
>+    mem->export_fd = *fd;
>+  }
>+  else{
>+    *fd = mem->export_fd;
>+  }
>+  mem->export_ref++;
>+  pthread_mutex_unlock(&mem->export_lock);
>+
>+  return err;
>+}
>+
>+LOCAL cl_int
>+cl_mem_close_fd(cl_mem mem)
>+{
>+  cl_int err = CL_SUCCESS;
>+
>+  if(mem->export_ref == 0)
>+    return CL_INVALID_MEM_OBJECT;
>+  if (atomic_dec(&mem->export_ref) > 1)
>+    return CL_SUCCESS;
>+  if(cl_buffer_close_fd(mem->export_fd))
>+    err = CL_INVALID_OPERATION;
>+
>   return err;
> }
>diff --git a/src/cl_mem.h b/src/cl_mem.h index e027f15..aea2ade 100644
>--- a/src/cl_mem.h
>+++ b/src/cl_mem.h
>@@ -95,6 +95,9 @@ typedef  struct _cl_mem {
>   cl_mem_dstr_cb *dstr_cb;  /* The destroy callback. */
>   uint8_t is_userptr;       /* CL_MEM_USE_HOST_PTR is enabled*/
>   size_t offset;            /* offset of host_ptr to the page beginning,
>only for CL_MEM_USE_HOST_PTR*/
>+  pthread_mutex_t export_lock; /* To export fd */  volatile int
>+ export_ref;  /* The exported count */
>+  int export_fd;            /* The exported fd of this memory object */
> } _cl_mem;
>
> struct _cl_mem_image {
>@@ -294,6 +297,7 @@ extern cl_mem cl_mem_new_libva_image(cl_context
>ctx,
>                                      size_t row_pitch,
>                                      cl_int *errcode);  extern
>cl_int cl_mem_get_fd(cl_mem mem, int* fd);
>+extern cl_int cl_mem_close_fd(cl_mem mem);
>
>
> #endif /* __CL_MEM_H__ */
>diff --git a/src/intel/intel_driver.c b/src/intel/intel_driver.c index
>755ab6b..37fb262 100644
>--- a/src/intel/intel_driver.c
>+++ b/src/intel/intel_driver.c
>@@ -840,6 +840,7 @@ intel_setup_callbacks(void)
>   cl_buffer_get_subdata = (cl_buffer_get_subdata_cb *)
>drm_intel_bo_get_subdata;
>   cl_buffer_wait_rendering = (cl_buffer_wait_rendering_cb *)
>drm_intel_bo_wait_rendering;
>   cl_buffer_get_fd = (cl_buffer_get_fd_cb *)
>drm_intel_bo_gem_export_to_prime;
>+  cl_buffer_close_fd = (cl_buffer_close_fd_cb *) close;
>   cl_buffer_get_tiling_align = (cl_buffer_get_tiling_align_cb
>*)intel_buffer_get_tiling_align;
>   intel_set_gpgpu_callbacks(intel_get_device_id());
> }
>--
>1.9.1


From rong.r.yang at intel.com  Tue Mar 31 01:39:03 2015
From: rong.r.yang at intel.com (Yang Rong)
Date: Tue, 31 Mar 2015 16:39:03 +0800
Subject: [Beignet] [PATCH] Fix a segmentation fault.
Message-ID: <1427791143-22161-1-git-send-email-rong.r.yang@intel.com>

There is a segmentation fault in function isSrcDstDiffSpan when src's hstride
is not GEN_HORIZONTAL_STRIDE_0 but dst's hstride is GEN_HORIZONTAL_STRIDE_0.

This is an invalid state; it is introduced when the LoadInstruction uses GenRegister::udxgrf
with a simd width of 1 while dst is scalar. Use sel.selReg instead of GenRegister::udxgrf.

Signed-off-by: Yang Rong <rong.r.yang at intel.com>
---
 backend/src/backend/gen_insn_selection.cpp | 18 ++++++++----------
 1 file changed, 8 insertions(+), 10 deletions(-)

diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp
index 7f9c95a..058d22b 100644
--- a/backend/src/backend/gen_insn_selection.cpp
+++ b/backend/src/backend/gen_insn_selection.cpp
@@ -3069,7 +3069,7 @@ namespace gbe
 
       GenRegister dst = GenRegister::retype(sel.selReg(insn.getValue(0)), GEN_TYPE_F);
       // get dword based address
-      GenRegister addrDW = GenRegister::udxgrf(simdWidth, sel.reg(FAMILY_DWORD));
+      GenRegister addrDW = sel.selReg(sel.reg(FAMILY_DWORD), ir::TYPE_U32);
 
       sel.push();
         if (sel.isScalarReg(addr.reg())) {
@@ -3116,9 +3116,9 @@ namespace gbe
                         uint8_t bti) const
     {
       using namespace ir;
-        Register tmpReg = sel.reg(FAMILY_DWORD, simdWidth == 1);
-        GenRegister tmpAddr = GenRegister::udxgrf(simdWidth, sel.reg(FAMILY_DWORD));
-        GenRegister tmpData = GenRegister::udxgrf(simdWidth, tmpReg);
+        Register tmpReg = sel.reg(FAMILY_DWORD);
+        GenRegister tmpAddr = sel.selReg(sel.reg(FAMILY_DWORD), ir::TYPE_U32);
+        GenRegister tmpData = sel.selReg(tmpReg, ir::TYPE_U32);
         // Get dword aligned addr
         sel.push();
           if (simdWidth == 1) {
@@ -3154,8 +3154,6 @@ namespace gbe
     {
       using namespace ir;
       const uint32_t valueNum = insn.getValueNum();
-      const uint32_t simdWidth = sel.isScalarReg(insn.getValue(0)) ?
-                                 1 : sel.ctx.getSimdWidth();
       RegisterFamily family = getFamily(insn.getValueType());
 
       vector<GenRegister> dst(valueNum);
@@ -3170,7 +3168,7 @@ namespace gbe
       vector<Register> tmpReg(tmpRegNum);
       for(uint32_t i = 0; i < tmpRegNum; i++) {
         tmpReg[i] = sel.reg(FAMILY_DWORD);
-        tmp2[i] = tmp[i] = GenRegister::udxgrf(simdWidth, tmpReg[i]);
+        tmp2[i] = tmp[i] = sel.selReg(tmpReg[i], ir::TYPE_U32);
       }
 
       readDWord(sel, tmp, tmp2, address, tmpRegNum, bti);
@@ -3254,9 +3252,9 @@ namespace gbe
         vector<GenRegister> tmp2(effectDataNum + 1);
         vector<GenRegister> effectData(effectDataNum);
         for(uint32_t i = 0; i < effectDataNum + 1; i++)
-          tmp2[i] = tmp[i] = GenRegister::udxgrf(simdWidth, sel.reg(FAMILY_DWORD));
+          tmp2[i] = tmp[i] = sel.selReg(sel.reg(FAMILY_DWORD), ir::TYPE_U32);
 
-        GenRegister alignedAddr = GenRegister::udxgrf(simdWidth, sel.reg(FAMILY_DWORD));
+        GenRegister alignedAddr = sel.selReg(sel.reg(FAMILY_DWORD), ir::TYPE_U32);
         sel.push();
           if (simdWidth == 1)
             sel.curr.noMask = 1;
@@ -3465,7 +3463,7 @@ namespace gbe
       } else {
         const GenRegister value = sel.selReg(insn.getValue(0));
         GBE_ASSERT(insn.getValueNum() == 1);
-        const GenRegister tmp = GenRegister::udxgrf(simdWidth, sel.reg(FAMILY_DWORD));
+        const GenRegister tmp = sel.selReg(sel.reg(FAMILY_DWORD), ir::TYPE_U32);
         if (elemSize == GEN_BYTE_SCATTER_WORD) {
           sel.MOV(tmp, GenRegister::retype(value, GEN_TYPE_UW));
         } else if (elemSize == GEN_BYTE_SCATTER_BYTE) {
-- 
1.8.3.2


From xionghu.luo at intel.com  Tue Mar 31 17:48:25 2015
From: xionghu.luo at intel.com (xionghu.luo at intel.com)
Date: Wed,  1 Apr 2015 08:48:25 +0800
Subject: [Beignet] [PATCH 1/3] add benckmark for copy data from buffer to
	image.
Message-ID: <1427849307-17120-1-git-send-email-xionghu.luo@intel.com>

From: Luo Xionghu <xionghu.luo at intel.com>

Signed-off-by: Luo Xionghu <xionghu.luo at intel.com>
---
 benchmark/CMakeLists.txt                     |  1 +
 benchmark/benchmark_copy_buffer_to_image.cpp | 67 ++++++++++++++++++++++++++++
 2 files changed, 68 insertions(+)
 create mode 100644 benchmark/benchmark_copy_buffer_to_image.cpp

diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt
index 7bd61ee..3e43a21 100644
--- a/benchmark/CMakeLists.txt
+++ b/benchmark/CMakeLists.txt
@@ -15,6 +15,7 @@ set (benchmark_sources
   benchmark_use_host_ptr_buffer.cpp
   benchmark_read_buffer.cpp
   benchmark_read_image.cpp
+  benchmark_copy_buffer_to_image.cpp
   benchmark_copy_image_to_buffer.cpp)
 
 
diff --git a/benchmark/benchmark_copy_buffer_to_image.cpp b/benchmark/benchmark_copy_buffer_to_image.cpp
new file mode 100644
index 0000000..c3eee13
--- /dev/null
+++ b/benchmark/benchmark_copy_buffer_to_image.cpp
@@ -0,0 +1,67 @@
+#include <string.h>
+#include "utests/utest_helper.hpp"
+#include <sys/time.h>
+
+#define IMAGE_BPP 2
+
+double benchmark_copy_buffer_to_image(void)
+{
+  struct timeval start,stop;
+  const size_t w = 960 * 4;
+  const size_t h = 540 * 4;
+  const size_t sz = IMAGE_BPP * w * h;
+  cl_image_format format;
+  cl_image_desc desc;
+
+  memset(&desc, 0x0, sizeof(cl_image_desc));
+  memset(&format, 0x0, sizeof(cl_image_format));
+
+  // Setup image and buffer
+  buf_data[0] = (unsigned short*) malloc(sz);
+  for (uint32_t i = 0; i < w*h; ++i) {
+    ((unsigned short*)buf_data[0])[i] = i;//(rand() & 0xffff);
+  }
+
+  format.image_channel_order = CL_R;
+  format.image_channel_data_type = CL_UNSIGNED_INT16;
+  desc.image_type = CL_MEM_OBJECT_IMAGE2D;
+  desc.image_width = w;
+  desc.image_height = h;
+  desc.image_row_pitch = 0;
+  OCL_CREATE_BUFFER(buf[0], CL_MEM_COPY_HOST_PTR, sz, buf_data[0]);
+  OCL_CREATE_IMAGE(buf[1], 0, &format, &desc, NULL);
+
+  /*copy buffer to image*/
+  size_t origin[3] = {0, 0, 0};
+  size_t region[3] = {w, h, 1};
+
+  OCL_CALL (clEnqueueCopyBufferToImage, queue, buf[0], buf[1], 0, origin, region,
+            0, NULL, NULL);
+  OCL_FINISH();
+  OCL_MAP_BUFFER_GTT(1);
+  /*check result*/
+  for (uint32_t j = 0; j < h; ++j)
+    for (uint32_t i = 0; i < w; i++)
+    {
+      OCL_ASSERT(((unsigned short*)buf_data[0])[j * w + i] == ((unsigned short*)buf_data[1])[j * w + i]);
+    }
+  OCL_UNMAP_BUFFER_GTT(1);
+  gettimeofday(&start,0);
+
+  for (uint32_t i=0; i<100; i++) {
+    OCL_CALL (clEnqueueCopyBufferToImage, queue, buf[0], buf[1], 0, origin, region,
+            0, NULL, NULL);
+  }
+  OCL_FINISH();
+
+  gettimeofday(&stop,0);
+  free(buf_data[0]);
+  buf_data[0] = NULL;
+
+  double elapsed = time_subtract(&stop, &start, 0);
+
+  return BANDWIDTH(sz * 100, elapsed);
+}
+
+MAKE_BENCHMARK_FROM_FUNCTION(benchmark_copy_buffer_to_image);
+
-- 
1.9.1


From xionghu.luo at intel.com  Tue Mar 31 17:48:27 2015
From: xionghu.luo at intel.com (xionghu.luo at intel.com)
Date: Wed,  1 Apr 2015 08:48:27 +0800
Subject: [Beignet] [PATCH 3/3] Add missing code.
In-Reply-To: <1427849307-17120-1-git-send-email-xionghu.luo@intel.com>
References: <1427849307-17120-1-git-send-email-xionghu.luo@intel.com>
Message-ID: <1427849307-17120-3-git-send-email-xionghu.luo@intel.com>

From: Luo Xionghu <xionghu.luo at intel.com>

region0 is read by the 16-byte alignment check before it has been
initialized, so assign it before that check.

Signed-off-by: Chuanbo Weng <chuanbo.weng at intel.com>
---
 src/cl_mem.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/cl_mem.c b/src/cl_mem.c
index d3e92f1..0a2613d 100644
--- a/src/cl_mem.c
+++ b/src/cl_mem.c
@@ -1833,6 +1833,7 @@ cl_mem_copy_buffer_to_image(cl_command_queue queue, cl_mem buffer, struct _cl_me
   intel_fmt = image->intel_fmt;
   bpp = image->bpp;
   w_saved = image->w;
+  region0 = region[0] * bpp;
   kn_src_offset = src_offset;
   if((image->image_type == CL_MEM_OBJECT_IMAGE2D) && ((image->w * image->bpp) % 16 == 0) &&
       ((dst_origin[0] * bpp) % 16 == 0) && (region0 % 16 == 0) && (src_offset % 16 == 0)){
-- 
1.9.1


From xionghu.luo at intel.com  Tue Mar 31 17:48:26 2015
From: xionghu.luo at intel.com (xionghu.luo at intel.com)
Date: Wed,  1 Apr 2015 08:48:26 +0800
Subject: [Beignet] [PATCH 2/3] Optimization of clEnqueueCopyBufferToImage
	for 16 aligned case.
In-Reply-To: <1427849307-17120-1-git-send-email-xionghu.luo@intel.com>
References: <1427849307-17120-1-git-send-email-xionghu.luo@intel.com>
Message-ID: <1427849307-17120-2-git-send-email-xionghu.luo@intel.com>

From: Luo Xionghu <xionghu.luo at intel.com>

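For the special case of a 2D image where the row size in bytes, the
destination origin, the copy width and the source offset are all 16-byte
aligned, we can change image_channel_order to CL_RGBA and
image_channel_data_type to CL_UNSIGNED_INT32, so that each work item
copies 16 bytes at once and the bandwidth is fully used.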

Signed-off-by: Luo Xionghu <xionghu.luo at intel.com>
---
 src/CMakeLists.txt                                 |  2 +-
 src/cl_context.h                                   |  1 +
 src/cl_mem.c                                       | 43 ++++++++++++++++++----
 .../cl_internal_copy_buffer_to_image_2d_align16.cl | 19 ++++++++++
 4 files changed, 56 insertions(+), 9 deletions(-)
 create mode 100644 src/kernels/cl_internal_copy_buffer_to_image_2d_align16.cl

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index da69532..4e67c71 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -51,7 +51,7 @@ cl_internal_copy_image_2d_to_2d_array cl_internal_copy_image_1d_array_to_1d_arra
 cl_internal_copy_image_2d_array_to_2d_array cl_internal_copy_image_2d_array_to_2d
 cl_internal_copy_image_2d_array_to_3d cl_internal_copy_image_3d_to_2d_array
 cl_internal_copy_image_2d_to_buffer cl_internal_copy_image_2d_to_buffer_align16 cl_internal_copy_image_3d_to_buffer
-cl_internal_copy_buffer_to_image_2d cl_internal_copy_buffer_to_image_3d
+cl_internal_copy_buffer_to_image_2d cl_internal_copy_buffer_to_image_2d_align16 cl_internal_copy_buffer_to_image_3d
 cl_internal_fill_buf_align8 cl_internal_fill_buf_align4
 cl_internal_fill_buf_align2 cl_internal_fill_buf_unalign
 cl_internal_fill_buf_align128 cl_internal_fill_image_1d
diff --git a/src/cl_context.h b/src/cl_context.h
index fdbfd2a..249fed8 100644
--- a/src/cl_context.h
+++ b/src/cl_context.h
@@ -63,6 +63,7 @@ enum _cl_internal_ker_type {
   CL_ENQUEUE_COPY_IMAGE_2D_TO_BUFFER_ALIGN16,
   CL_ENQUEUE_COPY_IMAGE_3D_TO_BUFFER,   //copy image 3d tobuffer
   CL_ENQUEUE_COPY_BUFFER_TO_IMAGE_2D,   //copy buffer to image 2d
+  CL_ENQUEUE_COPY_BUFFER_TO_IMAGE_2D_ALIGN16,
   CL_ENQUEUE_COPY_BUFFER_TO_IMAGE_3D,   //copy buffer to image 3d
   CL_ENQUEUE_FILL_BUFFER_UNALIGN,      //fill buffer with 1 aligne pattern, pattern size=1
   CL_ENQUEUE_FILL_BUFFER_ALIGN2,       //fill buffer with 2 aligne pattern, pattern size=2
diff --git a/src/cl_mem.c b/src/cl_mem.c
index b41ec14..d3e92f1 100644
--- a/src/cl_mem.c
+++ b/src/cl_mem.c
@@ -1816,6 +1816,10 @@ cl_mem_copy_buffer_to_image(cl_command_queue queue, cl_mem buffer, struct _cl_me
   uint32_t intel_fmt, bpp;
   cl_image_format fmt;
   size_t origin0, region0;
+  size_t kn_src_offset;
+  int align16 = 0;
+  size_t align_size = 1;
+  size_t w_saved = 0;
 
   if(region[1] == 1) local_sz[1] = 1;
   if(region[2] == 1) local_sz[2] = 1;
@@ -1826,24 +1830,47 @@ cl_mem_copy_buffer_to_image(cl_command_queue queue, cl_mem buffer, struct _cl_me
   /* We use one kernel to copy the data. The kernel is lazily created. */
   assert(image->base.ctx == buffer->ctx);
 
-  fmt.image_channel_order = CL_R;
-  fmt.image_channel_data_type = CL_UNSIGNED_INT8;
   intel_fmt = image->intel_fmt;
   bpp = image->bpp;
+  w_saved = image->w;
+  kn_src_offset = src_offset;
+  if((image->image_type == CL_MEM_OBJECT_IMAGE2D) && ((image->w * image->bpp) % 16 == 0) &&
+      ((dst_origin[0] * bpp) % 16 == 0) && (region0 % 16 == 0) && (src_offset % 16 == 0)){
+    fmt.image_channel_order = CL_RGBA;
+    fmt.image_channel_data_type = CL_UNSIGNED_INT32;
+    align16 = 1;
+    align_size = 16;
+  }
+  else{
+    fmt.image_channel_order = CL_R;
+    fmt.image_channel_data_type = CL_UNSIGNED_INT8;
+    align_size = 1;
+  }
   image->intel_fmt = cl_image_get_intel_format(&fmt);
-  image->w = image->w * image->bpp;
-  image->bpp = 1;
-  region0 = region[0] * bpp;
-  origin0 = dst_origin[0] * bpp;
+  image->w = (image->w * image->bpp) / align_size;
+  image->bpp = align_size;
+  region0 = (region[0] * bpp) / align_size;
+  origin0 = (dst_origin[0] * bpp) / align_size;
+  kn_src_offset /= align_size;
   global_sz[0] = ((region0 + local_sz[0] - 1) / local_sz[0]) * local_sz[0];
 
   /* setup the kernel and run. */
   if(image->image_type == CL_MEM_OBJECT_IMAGE2D) {
+    if(align16){
+      extern char cl_internal_copy_buffer_to_image_2d_align16_str[];
+      extern size_t cl_internal_copy_buffer_to_image_2d_align16_str_size;
+
+      ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_COPY_BUFFER_TO_IMAGE_2D_ALIGN16,
+                cl_internal_copy_buffer_to_image_2d_align16_str,
+                (size_t)cl_internal_copy_buffer_to_image_2d_align16_str_size, NULL);
+    }
+    else{
       extern char cl_internal_copy_buffer_to_image_2d_str[];
       extern size_t cl_internal_copy_buffer_to_image_2d_str_size;
 
       ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_COPY_BUFFER_TO_IMAGE_2D,
           cl_internal_copy_buffer_to_image_2d_str, (size_t)cl_internal_copy_buffer_to_image_2d_str_size, NULL);
+    }
   }else if(image->image_type == CL_MEM_OBJECT_IMAGE3D) {
       extern char cl_internal_copy_buffer_to_image_3d_str[];
       extern size_t cl_internal_copy_buffer_to_image_3d_str_size;
@@ -1862,13 +1889,13 @@ cl_mem_copy_buffer_to_image(cl_command_queue queue, cl_mem buffer, struct _cl_me
   cl_kernel_set_arg(ker, 5, sizeof(cl_int), &origin0);
   cl_kernel_set_arg(ker, 6, sizeof(cl_int), &dst_origin[1]);
   cl_kernel_set_arg(ker, 7, sizeof(cl_int), &dst_origin[2]);
-  cl_kernel_set_arg(ker, 8, sizeof(cl_int), &src_offset);
+  cl_kernel_set_arg(ker, 8, sizeof(cl_int), &kn_src_offset);
 
   ret = cl_command_queue_ND_range(queue, ker, 1, global_off, global_sz, local_sz);
 
   image->intel_fmt = intel_fmt;
   image->bpp = bpp;
-  image->w = image->w / bpp;
+  image->w = w_saved;
 
   return ret;
 }
diff --git a/src/kernels/cl_internal_copy_buffer_to_image_2d_align16.cl b/src/kernels/cl_internal_copy_buffer_to_image_2d_align16.cl
new file mode 100644
index 0000000..4e216ea
--- /dev/null
+++ b/src/kernels/cl_internal_copy_buffer_to_image_2d_align16.cl
@@ -0,0 +1,19 @@
+kernel void __cl_copy_buffer_to_image_2d_align16(__read_only image2d_t image, global uint4* buffer,
+                                        unsigned int region0, unsigned int region1, unsigned int region2,
+                                        unsigned int dst_origin0, unsigned int dst_origin1, unsigned int dst_origin2,
+                                        unsigned int src_offset)
+{
+  int i = get_global_id(0);
+  int j = get_global_id(1);
+  int k = get_global_id(2);
+  uint4 color = (uint4)(0);
+  int2 dst_coord;
+  if((i >= region0) || (j>= region1) || (k>=region2))
+    return;
+  dst_coord.x = dst_origin0 + i;
+  dst_coord.y = dst_origin1 + j;
+  src_offset += (k * region1 + j) * region0 + i;
+  color = buffer[src_offset];
+  write_imageui(image, dst_coord, color);
+}
+
-- 
1.9.1


From xionghu.luo at intel.com  Tue Mar 31 17:51:32 2015
From: xionghu.luo at intel.com (Luo, Xionghu)
Date: Wed, 1 Apr 2015 00:51:32 +0000
Subject: [Beignet] [PATCH 1/3] add benckmark for copy data from buffer
	to image.
In-Reply-To: <1427849307-17120-1-git-send-email-xionghu.luo@intel.com>
References: <1427849307-17120-1-git-send-email-xionghu.luo@intel.com>
Message-ID: <894E4BC922C573429354F1EC4342D61C0F5273A7@SHSMSX101.ccr.corp.intel.com>

Hi Chuanbo,
Please review this patchset for me.
It adds a 16-byte-aligned fast path for copying data from a buffer to a 2D image
(previously only the 2D-image-to-buffer direction had one).

Luo Xionghu
Best Regards

-----Original Message-----
From: Luo, Xionghu 
Sent: Wednesday, April 1, 2015 8:48 AM
To: beignet at lists.freedesktop.org
Cc: Luo, Xionghu
Subject: [PATCH 1/3] add benckmark for copy data from buffer to image.

From: Luo Xionghu <xionghu.luo at intel.com>

Signed-off-by: Luo Xionghu <xionghu.luo at intel.com>
---
 benchmark/CMakeLists.txt                     |  1 +
 benchmark/benchmark_copy_buffer_to_image.cpp | 67 ++++++++++++++++++++++++++++
 2 files changed, 68 insertions(+)
 create mode 100644 benchmark/benchmark_copy_buffer_to_image.cpp

diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt index 7bd61ee..3e43a21 100644
--- a/benchmark/CMakeLists.txt
+++ b/benchmark/CMakeLists.txt
@@ -15,6 +15,7 @@ set (benchmark_sources
   benchmark_use_host_ptr_buffer.cpp
   benchmark_read_buffer.cpp
   benchmark_read_image.cpp
+  benchmark_copy_buffer_to_image.cpp
   benchmark_copy_image_to_buffer.cpp)
 
 
diff --git a/benchmark/benchmark_copy_buffer_to_image.cpp b/benchmark/benchmark_copy_buffer_to_image.cpp
new file mode 100644
index 0000000..c3eee13
--- /dev/null
+++ b/benchmark/benchmark_copy_buffer_to_image.cpp
@@ -0,0 +1,67 @@
+#include <string.h>
+#include "utests/utest_helper.hpp"
+#include <sys/time.h>
+
+#define IMAGE_BPP 2
+
+double benchmark_copy_buffer_to_image(void)
+{
+  struct timeval start,stop;
+  const size_t w = 960 * 4;
+  const size_t h = 540 * 4;
+  const size_t sz = IMAGE_BPP * w * h;
+  cl_image_format format;
+  cl_image_desc desc;
+
+  memset(&desc, 0x0, sizeof(cl_image_desc));  memset(&format, 0x0, 
+ sizeof(cl_image_format));
+
+  // Setup image and buffer
+  buf_data[0] = (unsigned short*) malloc(sz);  for (uint32_t i = 0; i < 
+ w*h; ++i) {
+    ((unsigned short*)buf_data[0])[i] = i;//(rand() & 0xffff);  }
+
+  format.image_channel_order = CL_R;
+  format.image_channel_data_type = CL_UNSIGNED_INT16;  desc.image_type 
+ = CL_MEM_OBJECT_IMAGE2D;  desc.image_width = w;  desc.image_height = 
+ h;  desc.image_row_pitch = 0;  OCL_CREATE_BUFFER(buf[0], 
+ CL_MEM_COPY_HOST_PTR, sz, buf_data[0]);  OCL_CREATE_IMAGE(buf[1], 0, 
+ &format, &desc, NULL);
+
+  /*copy buffer to image*/
+  size_t origin[3] = {0, 0, 0};
+  size_t region[3] = {w, h, 1};
+
+  OCL_CALL (clEnqueueCopyBufferToImage, queue, buf[0], buf[1], 0, origin, region,
+            0, NULL, NULL);
+  OCL_FINISH();
+  OCL_MAP_BUFFER_GTT(1);
+  /*check result*/
+  for (uint32_t j = 0; j < h; ++j)
+    for (uint32_t i = 0; i < w; i++)
+    {
+      OCL_ASSERT(((unsigned short*)buf_data[0])[j * w + i] == ((unsigned short*)buf_data[1])[j * w + i]);
+    }
+  OCL_UNMAP_BUFFER_GTT(1);
+  gettimeofday(&start,0);
+
+  for (uint32_t i=0; i<100; i++) {
+    OCL_CALL (clEnqueueCopyBufferToImage, queue, buf[0], buf[1], 0, origin, region,
+            0, NULL, NULL);
+  }
+  OCL_FINISH();
+
+  gettimeofday(&stop,0);
+  free(buf_data[0]);
+  buf_data[0] = NULL;
+
+  double elapsed = time_subtract(&stop, &start, 0);
+
+  return BANDWIDTH(sz * 100, elapsed);
+}
+
+MAKE_BENCHMARK_FROM_FUNCTION(benchmark_copy_buffer_to_image);
+
--
1.9.1


From zhigang.gong at intel.com  Tue Mar 31 19:05:36 2015
From: zhigang.gong at intel.com (Zhigang Gong)
Date: Wed,  1 Apr 2015 10:05:36 +0800
Subject: [Beignet] [PATCH 1/8] strip unsupported attributes and calling
	conventions.
Message-ID: <1427853943-23224-1-git-send-email-zhigang.gong@intel.com>

Signed-off-by: Zhigang Gong <zhigang.gong at intel.com>
---
 backend/src/CMakeLists.txt            |   1 +
 backend/src/llvm/StripAttributes.cpp  | 119 ++++++++++++++++++++++++++++++++++
 backend/src/llvm/llvm_gen_backend.cpp |   7 +-
 backend/src/llvm/llvm_gen_backend.hpp |   4 +-
 backend/src/llvm/llvm_scalarize.cpp   |   1 +
 backend/src/llvm/llvm_to_gen.cpp      |   6 +-
 6 files changed, 134 insertions(+), 4 deletions(-)
 create mode 100644 backend/src/llvm/StripAttributes.cpp

diff --git a/backend/src/CMakeLists.txt b/backend/src/CMakeLists.txt
index a21918c..a6736ec 100644
--- a/backend/src/CMakeLists.txt
+++ b/backend/src/CMakeLists.txt
@@ -88,6 +88,7 @@ set (GBE_SRC
     llvm/ExpandUtils.cpp
     llvm/PromoteIntegers.cpp
     llvm/ExpandLargeIntegers.cpp
+    llvm/StripAttributes.cpp
     llvm/llvm_to_gen.cpp
     llvm/llvm_loadstore_optimization.cpp
     llvm/llvm_gen_backend.hpp
diff --git a/backend/src/llvm/StripAttributes.cpp b/backend/src/llvm/StripAttributes.cpp
new file mode 100644
index 0000000..05cac17
--- /dev/null
+++ b/backend/src/llvm/StripAttributes.cpp
@@ -0,0 +1,119 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+// Imported from pNaCl project
+// Copyright (c) 2003-2014 University of Illinois at Urbana-Champaign.
+// All rights reserved.
+//
+// Developed by:
+//
+//    LLVM Team
+//
+//    University of Illinois at Urbana-Champaign
+//
+//    http://llvm.org
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy of
+// this software and associated documentation files (the "Software"), to deal with
+// the Software without restriction, including without limitation the rights to
+// use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+// of the Software, and to permit persons to whom the Software is furnished to do
+// so, subject to the following conditions:
+//
+//    * Redistributions of source code must retain the above copyright notice,
+//      this list of conditions and the following disclaimers.
+//
+//   * Redistributions in binary form must reproduce the above copyright notice,
+//      this list of conditions and the following disclaimers in the
+//      documentation and/or other materials provided with the distribution.
+//
+//    * Neither the names of the LLVM Team, University of Illinois at
+//      Urbana-Champaign, nor the names of its contributors may be used to
+//      endorse or promote products derived from this Software without specific
+//      prior written permission.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
+// FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+// CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
+// SOFTWARE.
+
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass strips out attributes that are not supported by Beignet.
+// Currently, this strips out:
+//
+//  * Calling conventions from functions and function calls.
+//
+
+#include "llvm/IR/Function.h"
+#include "llvm/Pass.h"
+
+#if LLVM_VERSION_MINOR >= 5
+#include "llvm/IR/CallSite.h"
+#else
+#include "llvm/Support/CallSite.h"
+#endif
+
+#include "llvm_gen_backend.hpp"
+
+using namespace llvm;
+
+namespace {
+  class StripAttributes : public FunctionPass {
+  public:
+    static char ID; // Pass identification, replacement for typeid
+    StripAttributes() : FunctionPass(ID) {
+    }
+
+    virtual bool runOnFunction(Function &Func);
+  };
+}
+
+char StripAttributes::ID = 0;
+
+bool StripAttributes::runOnFunction(Function &Func) {
+  if (!gbe::isKernelFunction(Func))
+    Func.addFnAttr(Attribute::AlwaysInline);
+  Func.setCallingConv(CallingConv::C);
+  Func.setLinkage(GlobalValue::ExternalLinkage);
+
+  for (Function::iterator BB = Func.begin(), E = Func.end();
+       BB != E; ++BB) {
+    for (BasicBlock::iterator Inst = BB->begin(), E = BB->end();
+         Inst != E; ++Inst) {
+      CallSite Call(Inst);
+      if (Call)
+        Call.setCallingConv(CallingConv::C);
+    }
+  }
+
+  return true;
+}
+
+FunctionPass *llvm::createStripAttributesPass() {
+  return new StripAttributes();
+}
diff --git a/backend/src/llvm/llvm_gen_backend.cpp b/backend/src/llvm/llvm_gen_backend.cpp
index 813d0d3..ec79628 100644
--- a/backend/src/llvm/llvm_gen_backend.cpp
+++ b/backend/src/llvm/llvm_gen_backend.cpp
@@ -2791,8 +2791,13 @@ namespace gbe
         break;
       case GEN_OCL_PRINTF:
         break;
+      case GEN_OCL_NOT_FOUND:
       default:
-        GBE_ASSERTM(false, "Function call are not supported yet");
+        std::cerr << "Caller instruction: " << std::endl;
+        I.dump();
+        std::cerr << "Callee function: " << std::endl;
+        Callee->dump();
+        GBE_ASSERT(0);
     };
   }
 
diff --git a/backend/src/llvm/llvm_gen_backend.hpp b/backend/src/llvm/llvm_gen_backend.hpp
index 91a1166..5724917 100644
--- a/backend/src/llvm/llvm_gen_backend.hpp
+++ b/backend/src/llvm/llvm_gen_backend.hpp
@@ -50,6 +50,7 @@ namespace llvm {
   FunctionPass *createExpandConstantExprPass();
   FunctionPass *createExpandLargeIntegersPass();
   FunctionPass *createPromoteIntegersPass();
+  FunctionPass *createStripAttributesPass();
   // Copy debug information from Original to New, and return New.
   template <typename T> T *CopyDebug(T *New, llvm::Instruction *Original) {
     New->setDebugLoc(Original->getDebugLoc());
@@ -66,6 +67,7 @@ namespace gbe
   enum OCLInstrinsic {
 #define DECL_LLVM_GEN_FUNCTION(ID, NAME) GEN_OCL_##ID,
 #include "llvm_gen_ocl_function.hxx"
+  GEN_OCL_NOT_FOUND,   
 #undef DECL_LLVM_GEN_FUNCTION
   };
 
@@ -97,7 +99,7 @@ namespace gbe
       if (it == map.end()) {
         std::cerr << "Unresolved symbol: " << symbol << std::endl;
         std::cerr << "Aborting..." << std::endl;
-        exit(-1);
+        return GEN_OCL_NOT_FOUND; 
       }
       return it->second;
     }
diff --git a/backend/src/llvm/llvm_scalarize.cpp b/backend/src/llvm/llvm_scalarize.cpp
index 860053f..b657246 100644
--- a/backend/src/llvm/llvm_scalarize.cpp
+++ b/backend/src/llvm/llvm_scalarize.cpp
@@ -643,6 +643,7 @@ namespace gbe {
         CallSite::arg_iterator CI = CS.arg_begin() + 1;
 
         switch (genIntrinsicID) {
+          case GEN_OCL_NOT_FOUND:
           default: break;
           case GEN_OCL_READ_IMAGE_I:
           case GEN_OCL_READ_IMAGE_UI:
diff --git a/backend/src/llvm/llvm_to_gen.cpp b/backend/src/llvm/llvm_to_gen.cpp
index 89a22b6..a4ce4a2 100644
--- a/backend/src/llvm/llvm_to_gen.cpp
+++ b/backend/src/llvm/llvm_to_gen.cpp
@@ -133,8 +133,9 @@ namespace gbe
     MPM.add(createInstructionCombiningPass());// Clean up after IPCP & DAE
     MPM.add(createCFGSimplificationPass());   // Clean up after IPCP & DAE
     MPM.add(createPruneEHPass());             // Remove dead EH info
+    MPM.add(createStripAttributesPass());     // Strip unsupported attributes and calling conventions.
     MPM.add(createBarrierNodupPass(false));   // remove noduplicate fnAttr before inlining.
-    MPM.add(createFunctionInliningPass(200000));
+    MPM.add(createFunctionInliningPass(20000));
     MPM.add(createBarrierNodupPass(true));    // restore noduplicate fnAttr after inlining.
     MPM.add(createFunctionAttrsPass());       // Set readonly/readnone attrs
 
@@ -275,7 +276,8 @@ namespace gbe
 #endif
     // Print the code before further optimizations
     passes.add(createIntrinsicLoweringPass());
-    passes.add(createFunctionInliningPass(200000));
+    passes.add(createStripAttributesPass());     // Strip unsupported attributes and calling conventions.
+    passes.add(createFunctionInliningPass(20000));
     passes.add(createScalarReplAggregatesPass(64, true, -1, -1, 64));
     passes.add(createLoadStoreOptimizationPass());
     passes.add(createConstantPropagationPass());
-- 
1.9.1


From zhigang.gong at intel.com  Tue Mar 31 19:05:37 2015
From: zhigang.gong at intel.com (Zhigang Gong)
Date: Wed,  1 Apr 2015 10:05:37 +0800
Subject: [Beignet] [PATCH 2/8] GBE: fix safe type definition.
In-Reply-To: <1427853943-23224-1-git-send-email-zhigang.gong@intel.com>
References: <1427853943-23224-1-git-send-email-zhigang.gong@intel.com>
Message-ID: <1427853943-23224-2-git-send-email-zhigang.gong@intel.com>

The safe type constructor should take the underlying UNSAFE type instead
of a hard-coded uint16_t. This prepares for extending some types to uint32_t.

Signed-off-by: Zhigang Gong <zhigang.gong at intel.com>
---
 backend/src/sys/platform.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/backend/src/sys/platform.hpp b/backend/src/sys/platform.hpp
index 173b880..803ce21 100644
--- a/backend/src/sys/platform.hpp
+++ b/backend/src/sys/platform.hpp
@@ -248,7 +248,7 @@ class SAFE \
 { \
 public: \
   INLINE SAFE(void) {} \
-  explicit INLINE SAFE(uint16_t unsafe) : unsafe(unsafe) {} \
+  explicit INLINE SAFE(UNSAFE unsafe) : unsafe(unsafe) {} \
   INLINE operator UNSAFE (void) const { return unsafe; } \
   UNSAFE value(void) const { return unsafe; } \
 private: \
-- 
1.9.1


From zhigang.gong at intel.com  Tue Mar 31 19:05:38 2015
From: zhigang.gong at intel.com (Zhigang Gong)
Date: Wed,  1 Apr 2015 10:05:38 +0800
Subject: [Beignet] [PATCH 3/8] GBE: extend registers/tuples/immediates to
	32bit wide.
In-Reply-To: <1427853943-23224-1-git-send-email-zhigang.gong@intel.com>
References: <1427853943-23224-1-git-send-email-zhigang.gong@intel.com>
Message-ID: <1427853943-23224-3-git-send-email-zhigang.gong@intel.com>

For some extremely large kernels, these values may be larger than
0xFFFF, so we have to extend them to 32 bits.

Signed-off-by: Zhigang Gong <zhigang.gong at intel.com>
---
 backend/src/backend/gen_insn_selection.cpp | 18 +++++++++---------
 backend/src/backend/gen_reg_allocation.cpp | 12 ++++++------
 backend/src/backend/gen_register.hpp       |  4 ++--
 backend/src/ir/immediate.hpp               |  2 +-
 backend/src/ir/instruction.cpp             |  7 ++++---
 backend/src/ir/instruction.hpp             |  4 ++--
 backend/src/ir/register.hpp                | 12 ++++++------
 7 files changed, 30 insertions(+), 29 deletions(-)

diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp
index 0f5e496..5586468 100644
--- a/backend/src/backend/gen_insn_selection.cpp
+++ b/backend/src/backend/gen_insn_selection.cpp
@@ -2001,7 +2001,7 @@ namespace gbe
                 if (sel.getRegisterFamily(insn.getDst(0)) == ir::FAMILY_BOOL &&
                     dag->isUsed) {
                 sel.curr.physicalFlag = 0;
-                sel.curr.flagIndex = (uint16_t)(insn.getDst(0));
+                sel.curr.flagIndex = (uint32_t)(insn.getDst(0));
                 sel.curr.modFlag = 1;
               }
               sel.MOV(dst, src);
@@ -2209,7 +2209,7 @@ namespace gbe
                    insn.getOpcode() == OP_OR ||
                    insn.getOpcode() == OP_XOR);
         sel.curr.physicalFlag = 0;
-        sel.curr.flagIndex = (uint16_t)(insn.getDst(0));
+        sel.curr.flagIndex = (uint32_t)(insn.getDst(0));
         sel.curr.modFlag = 1;
       }
 
@@ -2782,7 +2782,7 @@ namespace gbe
           if (!sel.isScalarReg(insn.getDst(0)) && sel.regDAG[insn.getDst(0)]->isUsed) {
             sel.curr.modFlag = 1;
             sel.curr.physicalFlag = 0;
-            sel.curr.flagIndex = (uint16_t) insn.getDst(0);
+            sel.curr.flagIndex = (uint32_t) insn.getDst(0);
           }
           sel.MOV(dst, imm.getIntegerValue() ? GenRegister::immuw(0xffff) : GenRegister::immuw(0));
         break;
@@ -3042,7 +3042,7 @@ namespace gbe
           sel.curr.physicalFlag = 0;
           sel.curr.modFlag = 1;
           sel.curr.predicate = GEN_PREDICATE_NONE;
-          sel.curr.flagIndex = (uint16_t)alignedFlag;
+          sel.curr.flagIndex = (uint32_t)alignedFlag;
           sel.CMP(GEN_CONDITIONAL_NEQ, GenRegister::unpacked_uw(shiftHReg), GenRegister::immuw(32));
         sel.pop();
 
@@ -3055,7 +3055,7 @@ namespace gbe
             // Only need to consider the tmpH when the addr is not aligned.
             sel.curr.modFlag = 0;
             sel.curr.physicalFlag = 0;
-            sel.curr.flagIndex = (uint16_t)alignedFlag;
+            sel.curr.flagIndex = (uint32_t)alignedFlag;
             sel.curr.predicate = GEN_PREDICATE_NORMAL;
             sel.SHL(tmpH, tmp[i + 1], shiftH);
             sel.OR(effectData[i], tmpL, tmpH);
@@ -3377,7 +3377,7 @@ namespace gbe
           sel.curr.noMask = 1;
         sel.curr.physicalFlag = 0;
         sel.curr.modFlag = 1;
-        sel.curr.flagIndex = (uint16_t)dst;
+        sel.curr.flagIndex = (uint32_t)dst;
         sel.curr.grfFlag = needStoreBool; // indicate whether we need to allocate grf to store this boolean.
         if (type == TYPE_S64 || type == TYPE_U64) {
           GenRegister tmp[3];
@@ -3791,7 +3791,7 @@ namespace gbe
         }
         sel.curr.inversePredicate ^= inverse;
         sel.curr.physicalFlag = 0;
-        sel.curr.flagIndex = (uint16_t) pred;
+        sel.curr.flagIndex = (uint32_t) pred;
         sel.curr.predicate = GEN_PREDICATE_NORMAL;
         // FIXME in general, if the flag is a uniform flag.
         // we should treat that flag as extern flag, as we
@@ -4204,7 +4204,7 @@ namespace gbe
           // as if there is no backward jump latter, then obviously everything will work fine.
           // If there is backward jump latter, then all the pcip will be updated correctly there.
           sel.curr.physicalFlag = 0;
-          sel.curr.flagIndex = (uint16_t) pred;
+          sel.curr.flagIndex = (uint32_t) pred;
           sel.curr.predicate = GEN_PREDICATE_NORMAL;
           sel.MOV(ip, GenRegister::immuw(uint16_t(dst)));
           sel.curr.predicate = GEN_PREDICATE_NONE;
@@ -4261,7 +4261,7 @@ namespace gbe
         GBE_ASSERT(jip == dst);
         sel.push();
           sel.curr.physicalFlag = 0;
-          sel.curr.flagIndex = (uint16_t) pred;
+          sel.curr.flagIndex = (uint32_t) pred;
           sel.curr.predicate = GEN_PREDICATE_NORMAL;
           sel.MOV(ip, GenRegister::immuw(uint16_t(dst)));
           sel.block->endifOffset = -1;
diff --git a/backend/src/backend/gen_reg_allocation.cpp b/backend/src/backend/gen_reg_allocation.cpp
index 26078e0..a5d601a 100644
--- a/backend/src/backend/gen_reg_allocation.cpp
+++ b/backend/src/backend/gen_reg_allocation.cpp
@@ -54,14 +54,14 @@ namespace gbe
   };
 
   typedef struct GenRegIntervalKey {
-    GenRegIntervalKey(uint16_t reg, int32_t maxID) {
-      key = ((uint64_t)maxID << 16) | reg;
+    GenRegIntervalKey(uint32_t reg, int32_t maxID) {
+      key = ((uint64_t)maxID << 32) | reg;
     }
     const ir::Register getReg() const {
-      return (ir::Register)(key & 0xFFFF);
+      return (ir::Register)(key & 0xFFFFFFFF);
     }
     int32_t getMaxID() const {
-      return key >> 16;
+      return key >> 32;
     }
     uint64_t key;
   } GenRegIntervalKey;
@@ -126,9 +126,9 @@ namespace gbe
     /*! Allocate the virtual boolean (== flags) registers */
     void allocateFlags(Selection &selection);
     /*! validated flags which contains valid value in the physical flag register */
-    set<uint16_t> validatedFlags;
+    set<uint32_t> validatedFlags;
     /*! validated temp flag register which indicate the flag 0,1 contains which virtual flag register. */
-    uint16_t validTempFlagReg;
+    uint32_t validTempFlagReg;
     /*! validate flag for the current flag user instruction */
     void validateFlag(Selection &selection, SelectionInstruction &insn);
     /*! Allocate the GRF registers */
diff --git a/backend/src/backend/gen_register.hpp b/backend/src/backend/gen_register.hpp
index d539937..e166af4 100644
--- a/backend/src/backend/gen_register.hpp
+++ b/backend/src/backend/gen_register.hpp
@@ -132,7 +132,6 @@ namespace gbe
     uint32_t physicalFlag:1; //!< Physical or virtual flag register
     uint32_t flag:1;         //!< Only if physical flag,
     uint32_t subFlag:1;      //!< Only if physical flag
-    uint32_t flagIndex:16;   //!< Only if virtual flag (index of the register)
     uint32_t grfFlag:1;      //!< Only if virtual flag, 0 means we do not need to allocate GRF.
     uint32_t externFlag:1;   //!< Only if virtual flag, 1 means this flag is from external BB.
     uint32_t modFlag:1;      //!< Only if virtual flag, 1 means will modify flag.
@@ -146,6 +145,7 @@ namespace gbe
     uint32_t predicate:4;
     uint32_t inversePredicate:1;
     uint32_t saturate:1;
+    uint32_t flagIndex;   //!< Only if virtual flag (index of the register)
     void chooseNib(int nib) {
       switch (nib) {
         case 0:
@@ -240,7 +240,7 @@ namespace gbe
       float f;
       int32_t d;
       uint32_t ud;
-      uint16_t reg;
+      uint32_t reg;
       int64_t i64;
     } value;
 
diff --git a/backend/src/ir/immediate.hpp b/backend/src/ir/immediate.hpp
index 10bd035..6b27e8b 100644
--- a/backend/src/ir/immediate.hpp
+++ b/backend/src/ir/immediate.hpp
@@ -345,7 +345,7 @@ namespace ir {
   }
 
   /*! A value is stored in a per-function vector. This is the index to it */
-  TYPE_SAFE(ImmediateIndex, uint16_t)
+  TYPE_SAFE(ImmediateIndex, uint32_t)
 
 } /* namespace ir */
 } /* namespace gbe */
diff --git a/backend/src/ir/instruction.cpp b/backend/src/ir/instruction.cpp
index 039f085..8bd19b6 100644
--- a/backend/src/ir/instruction.cpp
+++ b/backend/src/ir/instruction.cpp
@@ -741,7 +741,7 @@ namespace ir {
                                          const Function &fn,
                                          std::string &whyNot)
     {
-      if (UNLIKELY(uint16_t(ID) >= fn.regNum())) {
+      if (UNLIKELY(uint32_t(ID) >= fn.regNum())) {
         whyNot = "Out-of-bound destination register index";
         return false;
       }
@@ -885,8 +885,9 @@ namespace ir {
         return false;
       const RegisterFamily family = getFamily(this->type);
       for (uint32_t srcID = 0; srcID < 2; ++srcID)
-        if (UNLIKELY(checkRegisterData(family, src[srcID], fn, whyNot) == false))
+        if (UNLIKELY(checkRegisterData(family, src[srcID], fn, whyNot) == false)) {
           return false;
+        }
       return true;
     }
 
@@ -1283,7 +1284,7 @@ namespace ir {
   return HelperIntrospection<CLASS, RefClass>::value == 1;
 
 #define START_INTROSPECTION(CLASS) \
-  static_assert(sizeof(internal::CLASS) == (sizeof(uint64_t)*2), \
+  static_assert(sizeof(internal::CLASS) == (sizeof(uint64_t)*4), \
                 "Bad instruction size"); \
   static_assert(offsetof(internal::CLASS, opcode) == 0, \
                 "Bad opcode offset"); \
diff --git a/backend/src/ir/instruction.hpp b/backend/src/ir/instruction.hpp
index 47312f5..37f64af 100644
--- a/backend/src/ir/instruction.hpp
+++ b/backend/src/ir/instruction.hpp
@@ -103,7 +103,7 @@ namespace ir {
   ///////////////////////////////////////////////////////////////////////////
 
   /*! Stores instruction internal data and opcode */
-  class ALIGNED(sizeof(uint64_t)*2) InstructionBase
+  class ALIGNED(sizeof(uint64_t)*4) InstructionBase
   {
   public:
     /*! Initialize the instruction from a 8 bytes stream */
@@ -117,7 +117,7 @@ namespace ir {
     /*! Get the instruction opcode */
     INLINE Opcode getOpcode(void) const { return opcode; }
   protected:
-    enum { opaqueSize = sizeof(uint64_t)*2-sizeof(uint8_t) };
+    enum { opaqueSize = sizeof(uint64_t)*4-sizeof(uint8_t) };
     Opcode opcode;               //!< Idendifies the instruction
     char opaque[opaqueSize];     //!< Remainder of it
     GBE_CLASS(InstructionBase);  //!< Use internal allocators
diff --git a/backend/src/ir/register.hpp b/backend/src/ir/register.hpp
index ce8bd60..be5f60d 100644
--- a/backend/src/ir/register.hpp
+++ b/backend/src/ir/register.hpp
@@ -111,7 +111,7 @@ namespace ir {
   /*! Register is the position of the index of the register data in the register
    *  file. We enforce type safety with this class
    */
-  TYPE_SAFE(Register, uint16_t)
+  TYPE_SAFE(Register, uint32_t)
   INLINE bool operator< (const Register &r0, const Register &r1) {
     return r0.value() < r1.value();
   }
@@ -119,7 +119,7 @@ namespace ir {
   /*! Tuple is the position of the first register in the tuple vector. We
    *  enforce type safety with this class
    */
-  TYPE_SAFE(Tuple, uint16_t)
+  TYPE_SAFE(Tuple, uint32_t)
 
   /*! A register file allocates and destroys registers. Basically, we will have
    *  one register file per function
@@ -131,7 +131,7 @@ namespace ir {
     INLINE Register append(RegisterFamily family, bool uniform = false) {
       GBE_ASSERTM(regNum() < MAX_INDEX,
                   "Too many defined registers (only 65535 are supported)");
-      const uint16_t index = regNum();
+      const uint32_t index = regNum();
       const RegisterData reg(family, uniform);
       regs.push_back(reg);
       return Register(index);
@@ -157,18 +157,18 @@ namespace ir {
     INLINE void setUniform(Register index, bool uniform) { regs[index].setUniform(uniform); }
     /*! Get the register index from the tuple */
     INLINE Register get(Tuple index, uint32_t which) const {
-      return regTuples[uint16_t(index) + which];
+      return regTuples[uint32_t(index) + which];
     }
     /*! Set the register index from the tuple */
     INLINE void set(Tuple index, uint32_t which, Register reg) {
-      regTuples[uint16_t(index) + which] = reg;
+      regTuples[uint32_t(index) + which] = reg;
     }
     /*! Number of registers in the register file */
     INLINE uint32_t regNum(void) const { return regs.size(); }
     /*! Number of tuples in the register file */
     INLINE uint32_t tupleNum(void) const { return regTuples.size(); }
     /*! register and tuple indices are short */
-    enum { MAX_INDEX = 0xffff }; 
+    enum { MAX_INDEX = 0xffffffff };
   private:
     vector<RegisterData> regs;   //!< All the registers together
     vector<Register> regTuples;  //!< Tuples are used for many src / dst
-- 
1.9.1


From zhigang.gong at intel.com  Tue Mar 31 19:05:39 2015
From: zhigang.gong at intel.com (Zhigang Gong)
Date: Wed,  1 Apr 2015 10:05:39 +0800
Subject: [Beignet] [PATCH 4/8] GBE: extend backend label to 32 bit.
In-Reply-To: <1427853943-23224-1-git-send-email-zhigang.gong@intel.com>
References: <1427853943-23224-1-git-send-email-zhigang.gong@intel.com>
Message-ID: <1427853943-23224-4-git-send-email-zhigang.gong@intel.com>

The front-end label is still 16 bits wide, but the auxiliary
label could be larger than that. This is preparation for
supporting 32-bit labels in both the front end and the backend.

Signed-off-by: Zhigang Gong <zhigang.gong at intel.com>
---
 backend/src/backend/gen_insn_selection.cpp | 24 ++++++++++++------------
 backend/src/backend/gen_insn_selection.hpp |  4 ++--
 backend/src/ir/instruction.hpp             |  2 +-
 3 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp
index 5586468..490525f 100644
--- a/backend/src/backend/gen_insn_selection.cpp
+++ b/backend/src/backend/gen_insn_selection.cpp
@@ -639,7 +639,7 @@ namespace gbe
     friend class SelectionInstruction;
   private:
     /*! Auxiliary label for if/endif. */ 
-    uint16_t currAuxLabel;
+    uint32_t currAuxLabel;
     bool bHas32X32Mul;
     INLINE ir::LabelIndex newAuxLabel()
     {
@@ -1020,7 +1020,7 @@ namespace gbe
 
   void Selection::Opaque::LABEL(ir::LabelIndex index) {
     SelectionInstruction *insn = this->appendInsn(SEL_OP_LABEL, 0, 0);
-    insn->index = uint16_t(index);
+    insn->index = uint32_t(index);
   }
 
   void Selection::Opaque::BARRIER(GenRegister src, GenRegister fence, uint32_t barrierType) {
@@ -1038,7 +1038,7 @@ namespace gbe
   int Selection::Opaque::JMPI(Reg src, ir::LabelIndex index, ir::LabelIndex origin) {
     SelectionInstruction *insn = this->appendInsn(SEL_OP_JMPI, 0, 1);
     insn->src(0) = src;
-    insn->index = uint16_t(index);
+    insn->index = uint32_t(index);
     insn->extra.longjmp = abs(index - origin) > 800;
     return insn->extra.longjmp ? 2 : 1;
   }
@@ -1046,28 +1046,28 @@ namespace gbe
   void Selection::Opaque::BRD(Reg src, ir::LabelIndex jip) {
     SelectionInstruction *insn = this->appendInsn(SEL_OP_BRD, 0, 1);
     insn->src(0) = src;
-    insn->index = uint16_t(jip);
+    insn->index = uint32_t(jip);
   }
 
   void Selection::Opaque::BRC(Reg src, ir::LabelIndex jip, ir::LabelIndex uip) {
     SelectionInstruction *insn = this->appendInsn(SEL_OP_BRC, 0, 1);
     insn->src(0) = src;
-    insn->index = uint16_t(jip);
-    insn->index1 = uint16_t(uip);
+    insn->index = uint32_t(jip);
+    insn->index1 = uint32_t(uip);
   }
 
   void Selection::Opaque::IF(Reg src, ir::LabelIndex jip, ir::LabelIndex uip) {
     SelectionInstruction *insn = this->appendInsn(SEL_OP_IF, 0, 1);
     insn->src(0) = src;
-    insn->index = uint16_t(jip);
-    insn->index1 = uint16_t(uip);
+    insn->index = uint32_t(jip);
+    insn->index1 = uint32_t(uip);
   }
 
   void Selection::Opaque::ELSE(Reg src, ir::LabelIndex jip, ir::LabelIndex elseLabel) {
 
     SelectionInstruction *insn = this->appendInsn(SEL_OP_ELSE, 0, 1);
     insn->src(0) = src;
-    insn->index = uint16_t(jip);
+    insn->index = uint32_t(jip);
     this->LABEL(elseLabel);
   }
 
@@ -1079,13 +1079,13 @@ namespace gbe
     this->LABEL(this->block->endifLabel);
     SelectionInstruction *insn = this->appendInsn(SEL_OP_ENDIF, 0, 1);
     insn->src(0) = src;
-    insn->index = uint16_t(this->block->endifLabel);
+    insn->index = uint32_t(this->block->endifLabel);
   }
 
   void Selection::Opaque::WHILE(Reg src, ir::LabelIndex jip) {
     SelectionInstruction *insn = this->appendInsn(SEL_OP_WHILE, 0, 1);
     insn->src(0) = src;
-    insn->index = uint16_t(jip);
+    insn->index = uint32_t(jip);
   }
 
   void Selection::Opaque::CMP(uint32_t conditional, Reg src0, Reg src1, Reg dst) {
@@ -1687,7 +1687,7 @@ namespace gbe
         if (this->ctx.getIFENDIFFix() &&
             this->block->insnList.size() != 0 &&
             this->block->insnList.size() % 1000 == 0 &&
-            (uint16_t)this->block->endifLabel != 0) {
+            (uint32_t)this->block->endifLabel != 0) {
           ir::LabelIndex jip = this->block->endifLabel;
           this->ENDIF(GenRegister::immd(0), jip);
           this->push();
diff --git a/backend/src/backend/gen_insn_selection.hpp b/backend/src/backend/gen_insn_selection.hpp
index 8bffb16..c2c4dae 100644
--- a/backend/src/backend/gen_insn_selection.hpp
+++ b/backend/src/backend/gen_insn_selection.hpp
@@ -140,9 +140,9 @@ namespace gbe
     /*! Number of sources */
     uint8_t srcNum:4;
     /*! To store various indices */
-    uint16_t index;
+    uint32_t index;
     /*! For BRC/IF to store the UIP */
-    uint16_t index1;
+    uint32_t index1;
     /*! instruction ID used for vector allocation. */
     uint32_t ID;
     /*! Variable sized. Destinations and sources go here */
diff --git a/backend/src/ir/instruction.hpp b/backend/src/ir/instruction.hpp
index 37f64af..09b0148 100644
--- a/backend/src/ir/instruction.hpp
+++ b/backend/src/ir/instruction.hpp
@@ -88,7 +88,7 @@ namespace ir {
   std::ostream &operator<< (std::ostream &out, AddressSpace addrSpace);
 
   /*! A label is identified with an unsigned short */
-  TYPE_SAFE(LabelIndex, uint16_t)
+  TYPE_SAFE(LabelIndex, uint32_t)
 
   /*! Function class contains the register file and the register tuple. Any
    *  information related to the registers may therefore require a function
-- 
1.9.1


From zhigang.gong at intel.com  Tue Mar 31 19:05:40 2015
From: zhigang.gong at intel.com (Zhigang Gong)
Date: Wed,  1 Apr 2015 10:05:40 +0800
Subject: [Beignet] [PATCH 5/8] GBE: don't type cast register/labelindex to
	integer.
In-Reply-To: <1427853943-23224-1-git-send-email-zhigang.gong@intel.com>
References: <1427853943-23224-1-git-send-email-zhigang.gong@intel.com>
Message-ID: <1427853943-23224-5-git-send-email-zhigang.gong@intel.com>

Signed-off-by: Zhigang Gong <zhigang.gong at intel.com>
---
 backend/src/backend/gen_insn_selection.cpp | 52 +++++++++++++++---------------
 backend/src/ir/instruction.cpp             |  5 ++-
 backend/src/ir/register.hpp                |  8 ++---
 3 files changed, 32 insertions(+), 33 deletions(-)

diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp
index 490525f..27ed11b 100644
--- a/backend/src/backend/gen_insn_selection.cpp
+++ b/backend/src/backend/gen_insn_selection.cpp
@@ -1020,7 +1020,7 @@ namespace gbe
 
   void Selection::Opaque::LABEL(ir::LabelIndex index) {
     SelectionInstruction *insn = this->appendInsn(SEL_OP_LABEL, 0, 0);
-    insn->index = uint32_t(index);
+    insn->index = index.value();
   }
 
   void Selection::Opaque::BARRIER(GenRegister src, GenRegister fence, uint32_t barrierType) {
@@ -1038,7 +1038,7 @@ namespace gbe
   int Selection::Opaque::JMPI(Reg src, ir::LabelIndex index, ir::LabelIndex origin) {
     SelectionInstruction *insn = this->appendInsn(SEL_OP_JMPI, 0, 1);
     insn->src(0) = src;
-    insn->index = uint32_t(index);
+    insn->index = index.value();
     insn->extra.longjmp = abs(index - origin) > 800;
     return insn->extra.longjmp ? 2 : 1;
   }
@@ -1046,28 +1046,28 @@ namespace gbe
   void Selection::Opaque::BRD(Reg src, ir::LabelIndex jip) {
     SelectionInstruction *insn = this->appendInsn(SEL_OP_BRD, 0, 1);
     insn->src(0) = src;
-    insn->index = uint32_t(jip);
+    insn->index = jip.value();
   }
 
   void Selection::Opaque::BRC(Reg src, ir::LabelIndex jip, ir::LabelIndex uip) {
     SelectionInstruction *insn = this->appendInsn(SEL_OP_BRC, 0, 1);
     insn->src(0) = src;
-    insn->index = uint32_t(jip);
-    insn->index1 = uint32_t(uip);
+    insn->index = jip.value();
+    insn->index1 = uip.value();
   }
 
   void Selection::Opaque::IF(Reg src, ir::LabelIndex jip, ir::LabelIndex uip) {
     SelectionInstruction *insn = this->appendInsn(SEL_OP_IF, 0, 1);
     insn->src(0) = src;
-    insn->index = uint32_t(jip);
-    insn->index1 = uint32_t(uip);
+    insn->index = jip.value();
+    insn->index1 = uip.value();
   }
 
   void Selection::Opaque::ELSE(Reg src, ir::LabelIndex jip, ir::LabelIndex elseLabel) {
 
     SelectionInstruction *insn = this->appendInsn(SEL_OP_ELSE, 0, 1);
     insn->src(0) = src;
-    insn->index = uint32_t(jip);
+    insn->index = jip.value();
     this->LABEL(elseLabel);
   }
 
@@ -1079,13 +1079,13 @@ namespace gbe
     this->LABEL(this->block->endifLabel);
     SelectionInstruction *insn = this->appendInsn(SEL_OP_ENDIF, 0, 1);
     insn->src(0) = src;
-    insn->index = uint32_t(this->block->endifLabel);
+    insn->index = this->block->endifLabel.value();
   }
 
   void Selection::Opaque::WHILE(Reg src, ir::LabelIndex jip) {
     SelectionInstruction *insn = this->appendInsn(SEL_OP_WHILE, 0, 1);
     insn->src(0) = src;
-    insn->index = uint32_t(jip);
+    insn->index = jip.value();
   }
 
   void Selection::Opaque::CMP(uint32_t conditional, Reg src0, Reg src1, Reg dst) {
@@ -1687,7 +1687,7 @@ namespace gbe
         if (this->ctx.getIFENDIFFix() &&
             this->block->insnList.size() != 0 &&
             this->block->insnList.size() % 1000 == 0 &&
-            (uint32_t)this->block->endifLabel != 0) {
+            this->block->endifLabel.value() != 0) {
           ir::LabelIndex jip = this->block->endifLabel;
           this->ENDIF(GenRegister::immd(0), jip);
           this->push();
@@ -2001,7 +2001,7 @@ namespace gbe
                 if (sel.getRegisterFamily(insn.getDst(0)) == ir::FAMILY_BOOL &&
                     dag->isUsed) {
                 sel.curr.physicalFlag = 0;
-                sel.curr.flagIndex = (uint32_t)(insn.getDst(0));
+                sel.curr.flagIndex = insn.getDst(0).value();
                 sel.curr.modFlag = 1;
               }
               sel.MOV(dst, src);
@@ -2209,7 +2209,7 @@ namespace gbe
                    insn.getOpcode() == OP_OR ||
                    insn.getOpcode() == OP_XOR);
         sel.curr.physicalFlag = 0;
-        sel.curr.flagIndex = (uint32_t)(insn.getDst(0));
+        sel.curr.flagIndex = insn.getDst(0).value();
         sel.curr.modFlag = 1;
       }
 
@@ -2782,7 +2782,7 @@ namespace gbe
           if (!sel.isScalarReg(insn.getDst(0)) && sel.regDAG[insn.getDst(0)]->isUsed) {
             sel.curr.modFlag = 1;
             sel.curr.physicalFlag = 0;
-            sel.curr.flagIndex = (uint32_t) insn.getDst(0);
+            sel.curr.flagIndex = insn.getDst(0).value();
           }
           sel.MOV(dst, imm.getIntegerValue() ? GenRegister::immuw(0xffff) : GenRegister::immuw(0));
         break;
@@ -3042,7 +3042,7 @@ namespace gbe
           sel.curr.physicalFlag = 0;
           sel.curr.modFlag = 1;
           sel.curr.predicate = GEN_PREDICATE_NONE;
-          sel.curr.flagIndex = (uint32_t)alignedFlag;
+          sel.curr.flagIndex = alignedFlag.value();
           sel.CMP(GEN_CONDITIONAL_NEQ, GenRegister::unpacked_uw(shiftHReg), GenRegister::immuw(32));
         sel.pop();
 
@@ -3055,7 +3055,7 @@ namespace gbe
             // Only need to consider the tmpH when the addr is not aligned.
             sel.curr.modFlag = 0;
             sel.curr.physicalFlag = 0;
-            sel.curr.flagIndex = (uint32_t)alignedFlag;
+            sel.curr.flagIndex = alignedFlag.value();
             sel.curr.predicate = GEN_PREDICATE_NORMAL;
             sel.SHL(tmpH, tmp[i + 1], shiftH);
             sel.OR(effectData[i], tmpL, tmpH);
@@ -3377,7 +3377,7 @@ namespace gbe
           sel.curr.noMask = 1;
         sel.curr.physicalFlag = 0;
         sel.curr.modFlag = 1;
-        sel.curr.flagIndex = (uint32_t)dst;
+        sel.curr.flagIndex = dst.value();
         sel.curr.grfFlag = needStoreBool; // indicate whether we need to allocate grf to store this boolean.
         if (type == TYPE_S64 || type == TYPE_U64) {
           GenRegister tmp[3];
@@ -3791,7 +3791,7 @@ namespace gbe
         }
         sel.curr.inversePredicate ^= inverse;
         sel.curr.physicalFlag = 0;
-        sel.curr.flagIndex = (uint32_t) pred;
+        sel.curr.flagIndex = pred.value();
         sel.curr.predicate = GEN_PREDICATE_NORMAL;
         // FIXME in general, if the flag is a uniform flag.
         // we should treat that flag as extern flag, as we
@@ -3914,7 +3914,7 @@ namespace gbe
         // FIXME, if the last BRA is unconditional jump, we don't need to update the label here.
         sel.push();
          sel.curr.predicate = GEN_PREDICATE_NORMAL;
-         sel.MOV(GenRegister::retype(src0, GEN_TYPE_UW), GenRegister::immuw((uint16_t)label));
+         sel.MOV(GenRegister::retype(src0, GEN_TYPE_UW), GenRegister::immuw(label.value()));
         sel.pop();
       }
       else {
@@ -4204,9 +4204,9 @@ namespace gbe
           // as if there is no backward jump latter, then obviously everything will work fine.
           // If there is backward jump latter, then all the pcip will be updated correctly there.
           sel.curr.physicalFlag = 0;
-          sel.curr.flagIndex = (uint32_t) pred;
+          sel.curr.flagIndex = pred.value();
           sel.curr.predicate = GEN_PREDICATE_NORMAL;
-          sel.MOV(ip, GenRegister::immuw(uint16_t(dst)));
+          sel.MOV(ip, GenRegister::immuw(dst.value()));
           sel.curr.predicate = GEN_PREDICATE_NONE;
           if (!sel.block->hasBarrier && !sel.block->removeSimpleIfEndif)
             sel.ENDIF(GenRegister::immd(0), nextLabel);
@@ -4216,7 +4216,7 @@ namespace gbe
         // Update the PcIPs
         const LabelIndex jip = sel.ctx.getLabelIndex(&insn);
         if(insn.getParent()->needEndif)
-          sel.MOV(ip, GenRegister::immuw(uint16_t(dst)));
+          sel.MOV(ip, GenRegister::immuw(dst.value()));
 
         if (!sel.block->hasBarrier && !sel.block->removeSimpleIfEndif) {
           if(insn.getParent()->needEndif && !insn.getParent()->needIf)
@@ -4257,13 +4257,13 @@ namespace gbe
         // block. Next instruction will properly update the IPs of the lanes
         // that actually take the branch
         const LabelIndex next = bb.getNextBlock()->getLabelIndex();
-        sel.MOV(ip, GenRegister::immuw(uint16_t(next)));
+        sel.MOV(ip, GenRegister::immuw(next.value()));
         GBE_ASSERT(jip == dst);
         sel.push();
           sel.curr.physicalFlag = 0;
-          sel.curr.flagIndex = (uint32_t) pred;
+          sel.curr.flagIndex = pred.value();
           sel.curr.predicate = GEN_PREDICATE_NORMAL;
-          sel.MOV(ip, GenRegister::immuw(uint16_t(dst)));
+          sel.MOV(ip, GenRegister::immuw(dst.value()));
           sel.block->endifOffset = -1;
           sel.curr.predicate = GEN_PREDICATE_NONE;
           if (!sel.block->hasBarrier && !sel.block->removeSimpleIfEndif)
@@ -4280,7 +4280,7 @@ namespace gbe
         const LabelIndex next = bb.getNextBlock()->getLabelIndex();
         // Update the PcIPs
         if(insn.getParent()->needEndif)
-          sel.MOV(ip, GenRegister::immuw(uint16_t(dst)));
+          sel.MOV(ip, GenRegister::immuw(dst.value()));
         sel.block->endifOffset = -1;
         if (!sel.block->hasBarrier && !sel.block->removeSimpleIfEndif) {
           if(insn.getParent()->needEndif && !insn.getParent()->needIf)
diff --git a/backend/src/ir/instruction.cpp b/backend/src/ir/instruction.cpp
index 8bd19b6..583bab5 100644
--- a/backend/src/ir/instruction.cpp
+++ b/backend/src/ir/instruction.cpp
@@ -741,7 +741,7 @@ namespace ir {
                                          const Function &fn,
                                          std::string &whyNot)
     {
-      if (UNLIKELY(uint32_t(ID) >= fn.regNum())) {
+      if (UNLIKELY(ID.value() >= fn.regNum())) {
         whyNot = "Out-of-bound destination register index";
         return false;
       }
@@ -885,9 +885,8 @@ namespace ir {
         return false;
       const RegisterFamily family = getFamily(this->type);
       for (uint32_t srcID = 0; srcID < 2; ++srcID)
-        if (UNLIKELY(checkRegisterData(family, src[srcID], fn, whyNot) == false)) {
+        if (UNLIKELY(checkRegisterData(family, src[srcID], fn, whyNot) == false))
           return false;
-        }
       return true;
     }
 
diff --git a/backend/src/ir/register.hpp b/backend/src/ir/register.hpp
index be5f60d..d8df7b0 100644
--- a/backend/src/ir/register.hpp
+++ b/backend/src/ir/register.hpp
@@ -129,8 +129,8 @@ namespace ir {
   public:
     /*! Return the index of a newly allocated register */
     INLINE Register append(RegisterFamily family, bool uniform = false) {
-      GBE_ASSERTM(regNum() < MAX_INDEX,
-                  "Too many defined registers (only 65535 are supported)");
+      GBE_ASSERTM((uint64_t)regNum() < MAX_INDEX,
+                  "Too many defined registers (only 4G are supported)");
       const uint32_t index = regNum();
       const RegisterData reg(family, uniform);
       regs.push_back(reg);
@@ -157,11 +157,11 @@ namespace ir {
     INLINE void setUniform(Register index, bool uniform) { regs[index].setUniform(uniform); }
     /*! Get the register index from the tuple */
     INLINE Register get(Tuple index, uint32_t which) const {
-      return regTuples[uint32_t(index) + which];
+      return regTuples[index.value() + which];
     }
     /*! Set the register index from the tuple */
     INLINE void set(Tuple index, uint32_t which, Register reg) {
-      regTuples[uint32_t(index) + which] = reg;
+      regTuples[index.value() + which] = reg;
     }
     /*! Number of registers in the register file */
     INLINE uint32_t regNum(void) const { return regs.size(); }
-- 
1.9.1


From zhigang.gong at intel.com  Tue Mar 31 19:05:41 2015
From: zhigang.gong at intel.com (Zhigang Gong)
Date: Wed,  1 Apr 2015 10:05:41 +0800
Subject: [Beignet] [PATCH 6/8] GBE: Extend front label ip to 32 bit on
	demand.
In-Reply-To: <1427853943-23224-1-git-send-email-zhigang.gong@intel.com>
References: <1427853943-23224-1-git-send-email-zhigang.gong@intel.com>
Message-ID: <1427853943-23224-6-git-send-email-zhigang.gong@intel.com>

If the front-end label IP exceeds 0xffff, the backend will
use a real DW (32-bit dword) to represent each block's IP
address. This is decided dynamically, according to the actual
number of labels in the front end.

Signed-off-by: Zhigang Gong <zhigang.gong at intel.com>
---
 backend/src/backend/context.cpp            |  2 +
 backend/src/backend/context.hpp            |  8 +++
 backend/src/backend/gen_context.cpp        | 29 ++++++++--
 backend/src/backend/gen_insn_selection.cpp | 89 ++++++++++++++++++++++++------
 backend/src/backend/gen_insn_selection.hpp |  1 -
 backend/src/backend/program.h              |  1 +
 backend/src/ir/function.cpp                |  4 +-
 backend/src/ir/profile.cpp                 |  2 +
 backend/src/ir/profile.hpp                 |  5 +-
 src/cl_command_queue_gen7.c                | 16 ++++--
 10 files changed, 128 insertions(+), 29 deletions(-)

diff --git a/backend/src/backend/context.cpp b/backend/src/backend/context.cpp
index 5e33ddd..59ccc79 100644
--- a/backend/src/backend/context.cpp
+++ b/backend/src/backend/context.cpp
@@ -354,6 +354,8 @@ namespace gbe
     this->kernel = this->allocateKernel();
     this->kernel->simdWidth = this->simdWidth;
     this->buildArgList();
+    if (fn.labelNum() > 0xffff)
+      this->useDWLabel = true;
     if (usedLabels.size() == 0)
       this->buildUsedLabels();
     if (JIPs.size() == 0)
diff --git a/backend/src/backend/context.hpp b/backend/src/backend/context.hpp
index 1b3744b..faa7c8a 100644
--- a/backend/src/backend/context.hpp
+++ b/backend/src/backend/context.hpp
@@ -100,6 +100,13 @@ namespace gbe
     /*! Preallocated curbe register set including special registers. */
     map<ir::Register, uint32_t> curbeRegs;
     ir::Register getSurfaceBaseReg(unsigned char bti);
+    /* Indicate whether we should use DW label or W label in backend.*/
+    bool isDWLabel(void) const {
+      return useDWLabel;
+    }
+    uint32_t getMaxLabel(void) const {
+      return this->isDWLabel() ? 0xffffffff : 0xffff;
+    }
   protected:
     /*! Build the instruction stream. Return false if failed */
     virtual bool emitCode(void) = 0;
@@ -140,6 +147,7 @@ namespace gbe
     set<ir::LabelIndex> usedLabels;       //!< Set of all used labels
     JIPMap JIPs;                          //!< Where to jump all labels/branches
     uint32_t simdWidth;                   //!< Number of lanes per HW threads
+    bool useDWLabel;                      //!< false means using u16 label, true means using u32 label.
     map<unsigned char, ir::Register> btiRegMap;
     GBE_CLASS(Context);                   //!< Use custom allocators
   };
diff --git a/backend/src/backend/gen_context.cpp b/backend/src/backend/gen_context.cpp
index 3fab9c8..13c7664 100644
--- a/backend/src/backend/gen_context.cpp
+++ b/backend/src/backend/gen_context.cpp
@@ -133,18 +133,36 @@ namespace gbe
     return true;
   }
 
+  /* Get proper block ip register according to current label width. */
+  static GenRegister getBlockIP(GenContext &ctx) {
+    GenRegister blockip;
+    if (!ctx.isDWLabel())
+      blockip = ctx.ra->genReg(GenRegister::uw8grf(ir::ocl::blockip));
+    else
+      blockip = ctx.ra->genReg(GenRegister::ud8grf(ir::ocl::dwblockip));
+    return blockip;
+  }
+
+  /* Set current block ip register to a specified constant label value. */
+  static void setBlockIP(GenContext &ctx, GenRegister blockip, uint32_t label) {
+    if (!ctx.isDWLabel())
+      ctx.p->MOV(blockip, GenRegister::immuw(label));
+    else
+      ctx.p->MOV(blockip, GenRegister::immud(label));
+  }
+
   void GenContext::clearFlagRegister(void) {
     // when group size not aligned to simdWidth, flag register need clear to
     // make prediction(any8/16h) work correctly
-    const GenRegister blockip = ra->genReg(GenRegister::uw8grf(ir::ocl::blockip));
+    const GenRegister blockip = getBlockIP(*this);
     const GenRegister zero = ra->genReg(GenRegister::uw1grf(ir::ocl::zero));
     const GenRegister one = ra->genReg(GenRegister::uw1grf(ir::ocl::one));
     p->push();
       p->curr.noMask = 1;
       p->curr.predicate = GEN_PREDICATE_NONE;
-      p->MOV(blockip, GenRegister::immuw(GEN_MAX_LABEL));
+      setBlockIP(*this, blockip, getMaxLabel());
       p->curr.noMask = 0;
-      p->MOV(blockip, GenRegister::immuw(0));
+      setBlockIP(*this, blockip, 0);
       p->curr.execWidth = 1;
       // FIXME, need to get the final use set of zero/one, if there is no user,
       // no need to generate the following two instructions.
@@ -1808,7 +1826,10 @@ namespace gbe
 
     // We insert the block IP mask first
     using namespace ir::ocl;
-    allocCurbeReg(blockip, GBE_CURBE_BLOCK_IP);
+    if (!isDWLabel())
+      allocCurbeReg(blockip, GBE_CURBE_BLOCK_IP);
+    else
+      allocCurbeReg(dwblockip, GBE_CURBE_DW_BLOCK_IP);
     allocCurbeReg(lid0, GBE_CURBE_LOCAL_ID_X);
     allocCurbeReg(lid1, GBE_CURBE_LOCAL_ID_Y);
     allocCurbeReg(lid2, GBE_CURBE_LOCAL_ID_Z);
diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp
index 27ed11b..e025698 100644
--- a/backend/src/backend/gen_insn_selection.cpp
+++ b/backend/src/backend/gen_insn_selection.cpp
@@ -633,6 +633,64 @@ namespace gbe
                       SelectionDAG *dag0, SelectionDAG *dag1,
                       GenRegister &src0, GenRegister &src1,
                       ir::Type type, bool &inverse);
+
+    /* Get current block IP register according to label width. */
+    GenRegister getBlockIP() {
+      return ctx.isDWLabel() ? selReg(ir::ocl::dwblockip) : selReg(ir::ocl::blockip);
+    }
+
+    /* Get proper label immediate gen register from label value. */
+    GenRegister getLabelImmReg(uint32_t labelValue) {
+      return ctx.isDWLabel() ? GenRegister::immud(labelValue) : GenRegister::immuw(labelValue);
+    }
+
+    /* Get proper label immediate gen register from label. */
+    GenRegister getLabelImmReg(ir::LabelIndex label) {
+      return getLabelImmReg(label.value());
+    }
+
+    /* Set current label register to a label value. */
+    void setBlockIP(GenRegister blockip, uint32_t labelValue) {
+      if (!ctx.isDWLabel())
+        MOV(GenRegister::retype(blockip, GEN_TYPE_UW), GenRegister::immuw(labelValue));
+      else
+        MOV(GenRegister::retype(blockip, GEN_TYPE_UD), GenRegister::immud(labelValue));
+    }
+
+    /* Generate comparison instruction to compare block ip address and specified label register.*/
+    void cmpBlockIP(uint32_t cond,
+                    GenRegister blockip,
+                    GenRegister labelReg) {
+      if (!ctx.isDWLabel())
+        CMP(cond,
+            GenRegister::retype(blockip, GEN_TYPE_UW),
+            labelReg,
+            GenRegister::retype(GenRegister::null(),
+            GEN_TYPE_UW));
+      else
+        CMP(cond,
+            GenRegister::retype(blockip, GEN_TYPE_UD),
+            labelReg,
+            GenRegister::retype(GenRegister::null(),
+            GEN_TYPE_UD));
+    }
+
+    void cmpBlockIP(uint32_t cond,
+                    GenRegister blockip,
+                    uint32_t labelValue) {
+      if (!ctx.isDWLabel())
+        CMP(cond,
+            GenRegister::retype(blockip, GEN_TYPE_UW),
+            GenRegister::immuw(labelValue),
+            GenRegister::retype(GenRegister::null(),
+            GEN_TYPE_UW));
+      else
+        CMP(cond,
+            GenRegister::retype(blockip, GEN_TYPE_UD),
+            GenRegister::immud(labelValue),
+            GenRegister::retype(GenRegister::null(), GEN_TYPE_UD));
+    }
+
     /*! Use custom allocators */
     GBE_CLASS(Opaque);
     friend class SelectionBlock;
@@ -3860,10 +3918,10 @@ namespace gbe
     {
       using namespace ir;
       const LabelIndex label = insn.getLabelIndex();
-      const GenRegister src0 = sel.selReg(ocl::blockip);
-      const GenRegister src1 = GenRegister::immuw(label);
+      const GenRegister src0 = sel.getBlockIP();
+      const GenRegister src1 = sel.getLabelImmReg(label);
       const uint32_t simdWidth = sel.ctx.getSimdWidth();
-      GBE_ASSERTM(label < GEN_MAX_LABEL, "We reached the maximum label number which is reserved for barrier handling");
+      GBE_ASSERTM(label < sel.ctx.getMaxLabel(), "We reached the maximum label number which is reserved for barrier handling");
       sel.LABEL(label);
 
       if(!insn.getParent()->needIf)
@@ -3884,8 +3942,7 @@ namespace gbe
       sel.push();
         sel.curr.noMask = 1;
         sel.curr.predicate = GEN_PREDICATE_NONE;
-        sel.CMP(GEN_CONDITIONAL_LE, GenRegister::retype(src0, GEN_TYPE_UW), src1,
-                GenRegister::retype(GenRegister::null(), GEN_TYPE_UW));
+        sel.cmpBlockIP(GEN_CONDITIONAL_LE, src0, src1);
       sel.pop();
 
       if (sel.block->hasBarrier) {
@@ -3895,11 +3952,10 @@ namespace gbe
         // this block, as it will always excute with all lanes activated.
         sel.push();
           sel.curr.predicate = GEN_PREDICATE_NORMAL;
-          sel.MOV(GenRegister::retype(src0, GEN_TYPE_UW), GenRegister::immuw(GEN_MAX_LABEL));
+          sel.setBlockIP(src0, sel.ctx.getMaxLabel());
           sel.curr.predicate = GEN_PREDICATE_NONE;
           sel.curr.noMask = 1;
-          sel.CMP(GEN_CONDITIONAL_EQ, GenRegister::retype(src0, GEN_TYPE_UW), GenRegister::immuw(GEN_MAX_LABEL),
-                  GenRegister::retype(GenRegister::null(), GEN_TYPE_UW));
+          sel.cmpBlockIP(GEN_CONDITIONAL_EQ, src0, sel.ctx.getMaxLabel());
           if (simdWidth == 8)
             sel.curr.predicate = GEN_PREDICATE_ALIGN1_ALL8H;
           else if (simdWidth == 16)
@@ -3914,7 +3970,7 @@ namespace gbe
         // FIXME, if the last BRA is unconditional jump, we don't need to update the label here.
         sel.push();
          sel.curr.predicate = GEN_PREDICATE_NORMAL;
-         sel.MOV(GenRegister::retype(src0, GEN_TYPE_UW), GenRegister::immuw(label.value()));
+         sel.setBlockIP(src0, label.value());
         sel.pop();
       }
       else {
@@ -4191,7 +4247,7 @@ namespace gbe
                            ir::LabelIndex src) const
     {
       using namespace ir;
-      const GenRegister ip = sel.selReg(ocl::blockip, TYPE_U16);
+      const GenRegister ip = sel.getBlockIP();
 
       // We will not emit any jump if we must go the next block anyway
       const BasicBlock *curr = insn.getParent();
@@ -4206,7 +4262,7 @@ namespace gbe
           sel.curr.physicalFlag = 0;
           sel.curr.flagIndex = pred.value();
           sel.curr.predicate = GEN_PREDICATE_NORMAL;
-          sel.MOV(ip, GenRegister::immuw(dst.value()));
+          sel.setBlockIP(ip, dst.value());
           sel.curr.predicate = GEN_PREDICATE_NONE;
           if (!sel.block->hasBarrier && !sel.block->removeSimpleIfEndif)
             sel.ENDIF(GenRegister::immd(0), nextLabel);
@@ -4216,7 +4272,7 @@ namespace gbe
         // Update the PcIPs
         const LabelIndex jip = sel.ctx.getLabelIndex(&insn);
         if(insn.getParent()->needEndif)
-          sel.MOV(ip, GenRegister::immuw(dst.value()));
+          sel.setBlockIP(ip, dst.value());
 
         if (!sel.block->hasBarrier && !sel.block->removeSimpleIfEndif) {
           if(insn.getParent()->needEndif && !insn.getParent()->needIf)
@@ -4242,7 +4298,8 @@ namespace gbe
                             ir::LabelIndex src) const
     {
       using namespace ir;
-      const GenRegister ip = sel.selReg(ocl::blockip, TYPE_U16);
+      //const GenRegister ip = sel.selReg(ocl::blockip, TYPE_U16);
+      const GenRegister ip = sel.getBlockIP();
       const Function &fn = sel.ctx.getFunction();
       const BasicBlock &bb = fn.getBlock(src);
       const LabelIndex jip = sel.ctx.getLabelIndex(&insn);
@@ -4257,13 +4314,13 @@ namespace gbe
         // block. Next instruction will properly update the IPs of the lanes
         // that actually take the branch
         const LabelIndex next = bb.getNextBlock()->getLabelIndex();
-        sel.MOV(ip, GenRegister::immuw(next.value()));
+        sel.setBlockIP(ip, next.value());
         GBE_ASSERT(jip == dst);
         sel.push();
           sel.curr.physicalFlag = 0;
           sel.curr.flagIndex = pred.value();
           sel.curr.predicate = GEN_PREDICATE_NORMAL;
-          sel.MOV(ip, GenRegister::immuw(dst.value()));
+          sel.setBlockIP(ip, dst.value());
           sel.block->endifOffset = -1;
           sel.curr.predicate = GEN_PREDICATE_NONE;
           if (!sel.block->hasBarrier && !sel.block->removeSimpleIfEndif)
@@ -4280,7 +4337,7 @@ namespace gbe
         const LabelIndex next = bb.getNextBlock()->getLabelIndex();
         // Update the PcIPs
         if(insn.getParent()->needEndif)
-          sel.MOV(ip, GenRegister::immuw(dst.value()));
+        sel.setBlockIP(ip, dst.value());
         sel.block->endifOffset = -1;
         if (!sel.block->hasBarrier && !sel.block->removeSimpleIfEndif) {
           if(insn.getParent()->needEndif && !insn.getParent()->needIf)
diff --git a/backend/src/backend/gen_insn_selection.hpp b/backend/src/backend/gen_insn_selection.hpp
index c2c4dae..d3f7363 100644
--- a/backend/src/backend/gen_insn_selection.hpp
+++ b/backend/src/backend/gen_insn_selection.hpp
@@ -44,7 +44,6 @@ namespace gbe
   /*! Translate IR compare to Gen compare */
   uint32_t getGenCompare(ir::Opcode opcode);
 
-  #define GEN_MAX_LABEL 0xFFFF
 
   /*! Selection opcodes properly encoded from 0 to n for fast jump tables
    *  generations
diff --git a/backend/src/backend/program.h b/backend/src/backend/program.h
index dc5662f..4065a17 100644
--- a/backend/src/backend/program.h
+++ b/backend/src/backend/program.h
@@ -96,6 +96,7 @@ enum gbe_curbe_type {
   GBE_CURBE_KERNEL_ARGUMENT,
   GBE_CURBE_EXTRA_ARGUMENT,
   GBE_CURBE_BLOCK_IP,
+  GBE_CURBE_DW_BLOCK_IP,
   GBE_CURBE_THREAD_NUM,
   GBE_CURBE_ZERO,
   GBE_CURBE_ONE,
diff --git a/backend/src/ir/function.cpp b/backend/src/ir/function.cpp
index 7983778..79dc997 100644
--- a/backend/src/ir/function.cpp
+++ b/backend/src/ir/function.cpp
@@ -136,8 +136,8 @@ namespace ir {
   }
 
   LabelIndex Function::newLabel(void) {
-    GBE_ASSERTM(labels.size() < 0xffff,
-                "Too many labels are defined (65536 only are supported)");
+    GBE_ASSERTM(labels.size() < 0xffffffffull,
+                "Too many labels are defined (4G only are supported)");
     const LabelIndex index(labels.size());
     labels.push_back(NULL);
     return index;
diff --git a/backend/src/ir/profile.cpp b/backend/src/ir/profile.cpp
index 4c272bd..ec7ab94 100644
--- a/backend/src/ir/profile.cpp
+++ b/backend/src/ir/profile.cpp
@@ -43,6 +43,7 @@ namespace ir {
         "zero", "one",
         "retVal", "slm_offset",
         "printf_buffer_pointer", "printf_index_buffer_pointer",
+        "dwblockip",
         "invalid"
     };
 
@@ -86,6 +87,7 @@ namespace ir {
       DECL_NEW_REG(FAMILY_DWORD, slmoffset, 1);
       DECL_NEW_REG(FAMILY_DWORD, printfbptr, 1);
       DECL_NEW_REG(FAMILY_DWORD, printfiptr, 1);
+      DECL_NEW_REG(FAMILY_DWORD, dwblockip, 0);
       DECL_NEW_REG(FAMILY_DWORD, invalid, 1);
     }
 #undef DECL_NEW_REG
diff --git a/backend/src/ir/profile.hpp b/backend/src/ir/profile.hpp
index 7259d9f..8f69320 100644
--- a/backend/src/ir/profile.hpp
+++ b/backend/src/ir/profile.hpp
@@ -71,8 +71,9 @@ namespace ir {
     static const Register slmoffset = Register(27);  // Group's SLM offset in total 64K SLM
     static const Register printfbptr = Register(28); // printf buffer address .
     static const Register printfiptr = Register(29); // printf index buffer address.
-    static const Register invalid = Register(30);  // used for valid comparation.
-    static const uint32_t regNum = 31;             // number of special registers
+    static const Register dwblockip = Register(30);  // blockip
+    static const Register invalid = Register(31);  // used for valid comparation.
+    static const uint32_t regNum = 32;             // number of special registers
     extern const char *specialRegMean[];           // special register name.
   } /* namespace ocl */
 
diff --git a/src/cl_command_queue_gen7.c b/src/cl_command_queue_gen7.c
index 253c4f2..4adbd2b 100644
--- a/src/cl_command_queue_gen7.c
+++ b/src/cl_command_queue_gen7.c
@@ -49,23 +49,27 @@ cl_set_varying_payload(const cl_kernel ker,
   size_t i, j, k, curr = 0;
   int32_t id_offset[3], ip_offset;
   cl_int err = CL_SUCCESS;
+  int32_t dw_ip_offset = -1;
 
   id_offset[0] = interp_kernel_get_curbe_offset(ker->opaque, GBE_CURBE_LOCAL_ID_X, 0);
   id_offset[1] = interp_kernel_get_curbe_offset(ker->opaque, GBE_CURBE_LOCAL_ID_Y, 0);
   id_offset[2] = interp_kernel_get_curbe_offset(ker->opaque, GBE_CURBE_LOCAL_ID_Z, 0);
   ip_offset = interp_kernel_get_curbe_offset(ker->opaque, GBE_CURBE_BLOCK_IP, 0);
+  if (ip_offset < 0)
+    dw_ip_offset = interp_kernel_get_curbe_offset(ker->opaque, GBE_CURBE_DW_BLOCK_IP, 0);
+  assert(ip_offset < 0 || dw_ip_offset < 0);
   assert(id_offset[0] >= 0 &&
          id_offset[1] >= 0 &&
          id_offset[2] >= 0 &&
-         ip_offset >= 0);
+         (ip_offset >= 0 || dw_ip_offset >= 0));
 
   TRY_ALLOC(ids[0], (uint32_t*) alloca(sizeof(uint32_t)*thread_n*simd_sz));
   TRY_ALLOC(ids[1], (uint32_t*) alloca(sizeof(uint32_t)*thread_n*simd_sz));
   TRY_ALLOC(ids[2], (uint32_t*) alloca(sizeof(uint32_t)*thread_n*simd_sz));
   TRY_ALLOC(block_ips, (uint16_t*) alloca(sizeof(uint16_t)*thread_n*simd_sz));
-
   /* 0xffff means that the lane is inactivated */
-  memset(block_ips, 0xff, sizeof(uint16_t)*thread_n*simd_sz);
+  memset(block_ips, 0xff, sizeof(int16_t)*thread_n*simd_sz);
+
 
   /* Compute the IDs and the block IPs */
   for (k = 0; k < local_wk_sz[2]; ++k)
@@ -84,11 +88,15 @@ cl_set_varying_payload(const cl_kernel ker,
     uint32_t *ids1 = (uint32_t *) (data + id_offset[1]);
     uint32_t *ids2 = (uint32_t *) (data + id_offset[2]);
     uint16_t *ips  = (uint16_t *) (data + ip_offset);
+    uint32_t *dw_ips  = (uint32_t *) (data + dw_ip_offset);
     for (j = 0; j < simd_sz; ++j, ++curr) {
       ids0[j] = ids[0][curr];
       ids1[j] = ids[1][curr];
       ids2[j] = ids[2][curr];
-      ips[j] = block_ips[curr];
+      if (ip_offset >= 0)
+        ips[j] = block_ips[curr];
+      if (dw_ip_offset >= 0)
+        dw_ips[j] = block_ips[curr];
     }
   }
 
-- 
1.9.1


From zhigang.gong at intel.com  Tue Mar 31 19:05:42 2015
From: zhigang.gong at intel.com (Zhigang Gong)
Date: Wed,  1 Apr 2015 10:05:42 +0800
Subject: [Beignet] [PATCH 7/8] GBE: Use actual bti information to determine
	a pointer's addressspace.
In-Reply-To: <1427853943-23224-1-git-send-email-zhigang.gong@intel.com>
References: <1427853943-23224-1-git-send-email-zhigang.gong@intel.com>
Message-ID: <1427853943-23224-7-git-send-email-zhigang.gong@intel.com>

With private constant buffer support, the private address space is sometimes
mixed with the constant address space. More generally, data that is accessed
as constant may be located in the private address space at the LLVM IR
layer, as in the following code:
  __kernel ...
  {
    const int2 foo[] = {{0, 1}, {2, 3}};
    int2 data = foo[get_global_id(0) % 2];
  }

The array foo is in the private address space, but we will ultimately use the
__constant bti to access it in the Gen backend. The above code will cause an
assertion failure in the Gen instruction selection stage, because it generates
a vector load instruction on a __constant buffer.

So we should use the actual BTI data to determine a pointer's address space
rather than getting it from the LLVM IR layer.

Signed-off-by: Zhigang Gong <zhigang.gong at intel.com>
---
 backend/src/backend/gen_insn_selection.cpp |  3 ++-
 backend/src/backend/program.h              |  1 +
 backend/src/ir/instruction.cpp             |  1 +
 backend/src/ir/instruction.hpp             |  1 +
 backend/src/llvm/llvm_gen_backend.cpp      | 25 ++++++++++++++++++-------
 5 files changed, 23 insertions(+), 8 deletions(-)

diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp
index e025698..1d999fa 100644
--- a/backend/src/backend/gen_insn_selection.cpp
+++ b/backend/src/backend/gen_insn_selection.cpp
@@ -3246,7 +3246,8 @@ namespace gbe
       GBE_ASSERT(insn.getAddressSpace() == MEM_GLOBAL ||
                  insn.getAddressSpace() == MEM_CONSTANT ||
                  insn.getAddressSpace() == MEM_PRIVATE ||
-                 insn.getAddressSpace() == MEM_LOCAL);
+                 insn.getAddressSpace() == MEM_LOCAL ||
+                 insn.getAddressSpace() == MEM_MIXED);
       //GBE_ASSERT(sel.isScalarReg(insn.getValue(0)) == false);
       const Type type = insn.getValueType();
       const uint32_t elemSize = getByteScatterGatherSize(type);
diff --git a/backend/src/backend/program.h b/backend/src/backend/program.h
index 4065a17..554fb16 100644
--- a/backend/src/backend/program.h
+++ b/backend/src/backend/program.h
@@ -70,6 +70,7 @@ enum gbe_get_arg_info_value {
 #define BTI_MAX_WRITE_IMAGE_ARGS  8
 #define BTI_WORKAROUND_IMAGE_OFFSET 128
 #define BTI_MAX_ID 253
+#define BTI_LOCAL 0xfe
 
 /*! Constant buffer values (ie values to setup in the constant buffer) */
 enum gbe_curbe_type {
diff --git a/backend/src/ir/instruction.cpp b/backend/src/ir/instruction.cpp
index 583bab5..12bc1bf 100644
--- a/backend/src/ir/instruction.cpp
+++ b/backend/src/ir/instruction.cpp
@@ -1254,6 +1254,7 @@ namespace ir {
       case MEM_LOCAL: return out << "local";
       case MEM_CONSTANT: return out << "constant";
       case MEM_PRIVATE: return out << "private";
+      case MEM_MIXED: return out << "mixed";
       case MEM_INVALID: return out << "invalid";
     };
     return out;
diff --git a/backend/src/ir/instruction.hpp b/backend/src/ir/instruction.hpp
index 09b0148..f7024d4 100644
--- a/backend/src/ir/instruction.hpp
+++ b/backend/src/ir/instruction.hpp
@@ -58,6 +58,7 @@ namespace ir {
     MEM_LOCAL,      //!< Local memory (thread group memory)
     MEM_CONSTANT,   //!< Immutable global memory
     MEM_PRIVATE,    //!< Per thread private memory
+    MEM_MIXED,      //!< mixed address space pointer.
     MEM_INVALID
   };
 
diff --git a/backend/src/llvm/llvm_gen_backend.cpp b/backend/src/llvm/llvm_gen_backend.cpp
index ec79628..0487bcb 100644
--- a/backend/src/llvm/llvm_gen_backend.cpp
+++ b/backend/src/llvm/llvm_gen_backend.cpp
@@ -290,6 +290,19 @@ namespace gbe
     return ir::MEM_GLOBAL;
   }
 
+  static INLINE ir::AddressSpace btiToGen(const ir::BTI &bti) {
+    if (bti.count > 1)
+      return ir::MEM_MIXED;
+    uint8_t singleBti = bti.bti[0];
+    switch (singleBti) {
+      case BTI_CONSTANT: return ir::MEM_CONSTANT;
+      case BTI_PRIVATE: return  ir::MEM_PRIVATE;
+      case BTI_LOCAL: return ir::MEM_LOCAL;
+      default: return ir::MEM_GLOBAL;
+    }
+    return ir::MEM_GLOBAL;
+  }
+
   static Constant *extractConstantElem(Constant *CPV, uint32_t index) {
     ConstantVector *CV = dyn_cast<ConstantVector>(CPV);
     GBE_ASSERT(CV != NULL);
@@ -1443,7 +1456,7 @@ namespace gbe
                 incBtiBase();
               break;
               case ir::MEM_LOCAL:
-                ctx.input(argName, ir::FunctionArgument::LOCAL_POINTER, reg,  llvmInfo, ptrSize, align, 0xfe);
+                ctx.input(argName, ir::FunctionArgument::LOCAL_POINTER, reg,  llvmInfo, ptrSize, align, BTI_LOCAL);
                 ctx.getFunction().setUseSLM(true);
               break;
               case ir::MEM_CONSTANT:
@@ -2817,12 +2830,11 @@ namespace gbe
     CallSite::arg_iterator AE = CS.arg_end();
     GBE_ASSERT(AI != AE);
 
-    unsigned int llvmSpace = (*AI)->getType()->getPointerAddressSpace();
-    const ir::AddressSpace addrSpace = addressSpaceLLVMToGen(llvmSpace);
     const ir::Register dst = this->getRegister(&I);
 
     ir::BTI bti;
     gatherBTI(&I, bti);
+    const ir::AddressSpace addrSpace = btiToGen(bti);
     vector<ir::Register> src;
     uint32_t srcNum = 0;
     while(AI != AE) {
@@ -3646,7 +3658,7 @@ namespace gbe
               new_bti = BTI_CONSTANT;
               break;
             case 3:
-              new_bti = 0xfe;
+              new_bti = BTI_LOCAL;
               break;
             default:
               GBE_ASSERT(0 && "address space not unhandled in gatherBTI()\n");
@@ -3740,15 +3752,14 @@ namespace gbe
   template <bool isLoad, typename T>
   INLINE void GenWriter::emitLoadOrStore(T &I)
   {
-    unsigned int llvmSpace = I.getPointerAddressSpace();
     Value *llvmPtr = I.getPointerOperand();
     Value *llvmValues = getLoadOrStoreValue(I);
     Type *llvmType = llvmValues->getType();
     const bool dwAligned = (I.getAlignment() % 4) == 0;
-    const ir::AddressSpace addrSpace = addressSpaceLLVMToGen(llvmSpace);
     const ir::Register ptr = this->getRegister(llvmPtr);
     ir::BTI binding;
     gatherBTI(&I, binding);
+    const ir::AddressSpace addrSpace = btiToGen(binding);
 
     Type *scalarType = llvmType;
     if (!isScalarType(llvmType)) {
@@ -3795,7 +3806,7 @@ namespace gbe
       const ir::RegisterFamily pointerFamily = ctx.getPointerFamily();
       const ir::RegisterFamily dataFamily = getFamily(type);
 
-      if(dataFamily == ir::FAMILY_DWORD && addrSpace != ir::MEM_CONSTANT) {
+      if(dataFamily == ir::FAMILY_DWORD && addrSpace != ir::MEM_CONSTANT && addrSpace != ir::MEM_MIXED) {
         // One message is enough here. Nothing special to do
         if (elemNum <= 4) {
           // Build the tuple data in the vector
-- 
1.9.1


From zhigang.gong at intel.com  Tue Mar 31 19:05:43 2015
From: zhigang.gong at intel.com (Zhigang Gong)
Date: Wed,  1 Apr 2015 10:05:43 +0800
Subject: [Beignet] [PATCH 8/8] GBE: refine error handling for private libva
	buffer sharing extension.
In-Reply-To: <1427853943-23224-1-git-send-email-zhigang.gong@intel.com>
References: <1427853943-23224-1-git-send-email-zhigang.gong@intel.com>
Message-ID: <1427853943-23224-8-git-send-email-zhigang.gong@intel.com>

Signed-off-by: Zhigang Gong <zhigang.gong at intel.com>
---
 src/cl_mem.c             | 4 ++++
 src/intel/intel_driver.c | 7 +++++++
 2 files changed, 11 insertions(+)

diff --git a/src/cl_mem.c b/src/cl_mem.c
index 57d27dd..31eb4c1 100644
--- a/src/cl_mem.c
+++ b/src/cl_mem.c
@@ -1974,6 +1974,10 @@ LOCAL cl_mem cl_mem_new_libva_buffer(cl_context ctx,
 
   size_t sz = 0;
   mem->bo = cl_buffer_get_buffer_from_libva(ctx, bo_name, &sz);
+  if (mem->bo == NULL) {
+    err = CL_MEM_OBJECT_ALLOCATION_FAILURE;
+    goto error;
+  }
   mem->size = sz;
 
 exit:
diff --git a/src/intel/intel_driver.c b/src/intel/intel_driver.c
index ea71cfe..5ed9e13 100644
--- a/src/intel/intel_driver.c
+++ b/src/intel/intel_driver.c
@@ -369,6 +369,10 @@ intel_driver_share_buffer(intel_driver_t *driver, const char *sname, uint32_t na
   dri_bo *bo = intel_bo_gem_create_from_name(driver->bufmgr,
                                              sname,
                                              name);
+  if (bo == NULL) {
+    fprintf(stderr, "intel_bo_gem_create_from_name create \"%s\" bo from name %d failed: %s\n", sname, name, strerror(errno));
+    return NULL;
+  }
   return bo;
 }
 
@@ -668,6 +672,9 @@ cl_buffer intel_share_buffer_from_libva(cl_context ctx,
 
   intel_bo = intel_driver_share_buffer((intel_driver_t *)ctx->drv, "shared from libva", bo_name);
 
+  if (intel_bo == NULL)
+    return NULL;
+
   if (sz)
     *sz = intel_bo->size;
 
-- 
1.9.1


From chuanbo.weng at intel.com  Tue Mar 31 20:33:34 2015
From: chuanbo.weng at intel.com (Weng, Chuanbo)
Date: Wed, 1 Apr 2015 03:33:34 +0000
Subject: [Beignet] [PATCH 1/3] add benckmark for copy data from buffer
	to image.
In-Reply-To: <894E4BC922C573429354F1EC4342D61C0F5273A7@SHSMSX101.ccr.corp.intel.com>
References: <1427849307-17120-1-git-send-email-xionghu.luo@intel.com>
 <894E4BC922C573429354F1EC4342D61C0F5273A7@SHSMSX101.ccr.corp.intel.com>
Message-ID: <5A0E318D73C83C40A09BDBBE131796D701A290A9@shsmsx102.ccr.corp.intel.com>

Hi Xionghu,
	First of all, I think you should merge [PATCH 2/3] and [PATCH 3/3] into one patch, because
the latter only fixes a programming error made during your development.
	For my other comments, please see below.
	
-----Original Message-----
From: Luo, Xionghu 
Sent: Wednesday, April 01, 2015 8:52
To: Weng, Chuanbo
Cc: beignet at lists.freedesktop.org
Subject: RE: [PATCH 1/3] add benckmark for copy data from buffer to image.

Hi Chuanbo,
Please review this patchset for me.
It adds an aligned copy of data from a buffer to a 2D image (previously only 2D image to buffer was supported).

Luo Xionghu
Best Regards

-----Original Message-----
From: Luo, Xionghu
Sent: Wednesday, April 1, 2015 8:48 AM
To: beignet at lists.freedesktop.org
Cc: Luo, Xionghu
Subject: [PATCH 1/3] add benckmark for copy data from buffer to image.

From: Luo Xionghu <xionghu.luo at intel.com>

Signed-off-by: Luo Xionghu <xionghu.luo at intel.com>
---
 benchmark/CMakeLists.txt                     |  1 +
 benchmark/benchmark_copy_buffer_to_image.cpp | 67 ++++++++++++++++++++++++++++
 2 files changed, 68 insertions(+)
 create mode 100644 benchmark/benchmark_copy_buffer_to_image.cpp

diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt index 7bd61ee..3e43a21 100644
--- a/benchmark/CMakeLists.txt
+++ b/benchmark/CMakeLists.txt
@@ -15,6 +15,7 @@ set (benchmark_sources
   benchmark_use_host_ptr_buffer.cpp
   benchmark_read_buffer.cpp
   benchmark_read_image.cpp
+  benchmark_copy_buffer_to_image.cpp
   benchmark_copy_image_to_buffer.cpp)
 
 
diff --git a/benchmark/benchmark_copy_buffer_to_image.cpp b/benchmark/benchmark_copy_buffer_to_image.cpp
new file mode 100644
index 0000000..c3eee13
--- /dev/null
+++ b/benchmark/benchmark_copy_buffer_to_image.cpp
@@ -0,0 +1,67 @@
+#include <string.h>
+#include "utests/utest_helper.hpp"
+#include <sys/time.h>
+
+#define IMAGE_BPP 2
+
+double benchmark_copy_buffer_to_image(void)
+{
+  struct timeval start,stop;
+  const size_t w = 960 * 4;
+  const size_t h = 540 * 4;
+  const size_t sz = IMAGE_BPP * w * h;
+  cl_image_format format;
+  cl_image_desc desc;
+
+  memset(&desc, 0x0, sizeof(cl_image_desc));  memset(&format, 0x0, 
+ sizeof(cl_image_format));
+
+  // Setup image and buffer
+  buf_data[0] = (unsigned short*) malloc(sz);  for (uint32_t i = 0; i < 
+ w*h; ++i) {
+    ((unsigned short*)buf_data[0])[i] = i;//(rand() & 0xffff);  }
You should use random values instead.

+
+  format.image_channel_order = CL_R;
+  format.image_channel_data_type = CL_UNSIGNED_INT16;  desc.image_type 
+ = CL_MEM_OBJECT_IMAGE2D;  desc.image_width = w;  desc.image_height = 
+ h;  desc.image_row_pitch = 0;  OCL_CREATE_BUFFER(buf[0], 
+ CL_MEM_COPY_HOST_PTR, sz, buf_data[0]);  OCL_CREATE_IMAGE(buf[1], 0, 
+ &format, &desc, NULL);
+
+  /*copy image to buffer*/
Modify your comment here.

+  size_t origin[3] = {0, 0, 0};
+  size_t region[3] = {w, h, 1};
+
+  OCL_CALL (clEnqueueCopyBufferToImage, queue, buf[0], buf[1], 0, origin, region,
+            0, NULL, NULL);
+  OCL_FINISH();
+  OCL_MAP_BUFFER_GTT(1);
+  /*check result*/
+  for (uint32_t j = 0; j < h; ++j)
+    for (uint32_t i = 0; i < w; i++)
+    {
+      OCL_ASSERT(((unsigned short*)buf_data[0])[j * w + i] == ((unsigned short*)buf_data[1])[j * w + i]);
+    }
+  OCL_UNMAP_BUFFER_GTT(1);
+  gettimeofday(&start,0);
+
+  for (uint32_t i=0; i<100; i++) {
+    OCL_CALL (clEnqueueCopyBufferToImage, queue, buf[0], buf[1], 0, origin, region,
+            0, NULL, NULL);
+  }
+  OCL_FINISH();
+
+  gettimeofday(&stop,0);
+  free(buf_data[0]);
+  buf_data[0] = NULL;
+
+  double elapsed = time_subtract(&stop, &start, 0);
+
+  return BANDWIDTH(sz * 100, elapsed);
+}
+
+MAKE_BENCHMARK_FROM_FUNCTION(benchmark_copy_buffer_to_image);
+
--
1.9.1


From chuanbo.weng at intel.com  Tue Mar 31 20:38:46 2015
From: chuanbo.weng at intel.com (Weng, Chuanbo)
Date: Wed, 1 Apr 2015 03:38:46 +0000
Subject: [Beignet] [PATCH 2/3] Optimization of
 clEnqueueCopyBufferToImage	for 16 aligned case.
In-Reply-To: <1427849307-17120-2-git-send-email-xionghu.luo@intel.com>
References: <1427849307-17120-1-git-send-email-xionghu.luo@intel.com>
 <1427849307-17120-2-git-send-email-xionghu.luo@intel.com>
Message-ID: <5A0E318D73C83C40A09BDBBE131796D701A290C8@shsmsx102.ccr.corp.intel.com>

See my comments below for cl_internal_copy_buffer_to_image_2d_align16.cl.

-----Original Message-----
From: Beignet [mailto:beignet-bounces at lists.freedesktop.org] On Behalf Of xionghu.luo at intel.com
Sent: Wednesday, April 01, 2015 8:48
To: beignet at lists.freedesktop.org
Cc: Luo, Xionghu
Subject: [Beignet] [PATCH 2/3] Optimization of clEnqueueCopyBufferToImage for 16 aligned case.

From: Luo Xionghu <xionghu.luo at intel.com>

We can change the image_channel_order to CL_RGBA and image_channel_data_type to CL_UNSIGNED_INT32 for some special case, thus 16 bytes can be read by one work item. Bandwidth is fully used.

Signed-off-by: Luo Xionghu <xionghu.luo at intel.com>
---
 src/CMakeLists.txt                                 |  2 +-
 src/cl_context.h                                   |  1 +
 src/cl_mem.c                                       | 43 ++++++++++++++++++----
 .../cl_internal_copy_buffer_to_image_2d_align16.cl | 19 ++++++++++
 4 files changed, 56 insertions(+), 9 deletions(-)  create mode 100644 src/kernels/cl_internal_copy_buffer_to_image_2d_align16.cl

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index da69532..4e67c71 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -51,7 +51,7 @@ cl_internal_copy_image_2d_to_2d_array cl_internal_copy_image_1d_array_to_1d_arra
 cl_internal_copy_image_2d_array_to_2d_array cl_internal_copy_image_2d_array_to_2d
 cl_internal_copy_image_2d_array_to_3d cl_internal_copy_image_3d_to_2d_array
 cl_internal_copy_image_2d_to_buffer cl_internal_copy_image_2d_to_buffer_align16 cl_internal_copy_image_3d_to_buffer
-cl_internal_copy_buffer_to_image_2d cl_internal_copy_buffer_to_image_3d
+cl_internal_copy_buffer_to_image_2d 
+cl_internal_copy_buffer_to_image_2d_align16 
+cl_internal_copy_buffer_to_image_3d
 cl_internal_fill_buf_align8 cl_internal_fill_buf_align4
 cl_internal_fill_buf_align2 cl_internal_fill_buf_unalign
 cl_internal_fill_buf_align128 cl_internal_fill_image_1d diff --git a/src/cl_context.h b/src/cl_context.h index fdbfd2a..249fed8 100644
--- a/src/cl_context.h
+++ b/src/cl_context.h
@@ -63,6 +63,7 @@ enum _cl_internal_ker_type {
   CL_ENQUEUE_COPY_IMAGE_2D_TO_BUFFER_ALIGN16,
   CL_ENQUEUE_COPY_IMAGE_3D_TO_BUFFER,   //copy image 3d tobuffer
   CL_ENQUEUE_COPY_BUFFER_TO_IMAGE_2D,   //copy buffer to image 2d
+  CL_ENQUEUE_COPY_BUFFER_TO_IMAGE_2D_ALIGN16,
   CL_ENQUEUE_COPY_BUFFER_TO_IMAGE_3D,   //copy buffer to image 3d
   CL_ENQUEUE_FILL_BUFFER_UNALIGN,      //fill buffer with 1 aligne pattern, pattern size=1
   CL_ENQUEUE_FILL_BUFFER_ALIGN2,       //fill buffer with 2 aligne pattern, pattern size=2
diff --git a/src/cl_mem.c b/src/cl_mem.c index b41ec14..d3e92f1 100644
--- a/src/cl_mem.c
+++ b/src/cl_mem.c
@@ -1816,6 +1816,10 @@ cl_mem_copy_buffer_to_image(cl_command_queue queue, cl_mem buffer, struct _cl_me
   uint32_t intel_fmt, bpp;
   cl_image_format fmt;
   size_t origin0, region0;
+  size_t kn_src_offset;
+  int align16 = 0;
+  size_t align_size = 1;
+  size_t w_saved = 0;
 
   if(region[1] == 1) local_sz[1] = 1;
   if(region[2] == 1) local_sz[2] = 1;
@@ -1826,24 +1830,47 @@ cl_mem_copy_buffer_to_image(cl_command_queue queue, cl_mem buffer, struct _cl_me
   /* We use one kernel to copy the data. The kernel is lazily created. */
   assert(image->base.ctx == buffer->ctx);
 
-  fmt.image_channel_order = CL_R;
-  fmt.image_channel_data_type = CL_UNSIGNED_INT8;
   intel_fmt = image->intel_fmt;
   bpp = image->bpp;
+  w_saved = image->w;
+  kn_src_offset = src_offset;
+  if((image->image_type == CL_MEM_OBJECT_IMAGE2D) && ((image->w * image->bpp) % 16 == 0) &&
+      ((dst_origin[0] * bpp) % 16 == 0) && (region0 % 16 == 0) && (src_offset % 16 == 0)){
+    fmt.image_channel_order = CL_RGBA;
+    fmt.image_channel_data_type = CL_UNSIGNED_INT32;
+    align16 = 1;
+    align_size = 16;
+  }
+  else{
+    fmt.image_channel_order = CL_R;
+    fmt.image_channel_data_type = CL_UNSIGNED_INT8;
+    align_size = 1;
+  }
   image->intel_fmt = cl_image_get_intel_format(&fmt);
-  image->w = image->w * image->bpp;
-  image->bpp = 1;
-  region0 = region[0] * bpp;
-  origin0 = dst_origin[0] * bpp;
+  image->w = (image->w * image->bpp) / align_size;  image->bpp = 
+ align_size;
+  region0 = (region[0] * bpp) / align_size;
+  origin0 = (dst_origin[0] * bpp) / align_size;  kn_src_offset /= 
+ align_size;
   global_sz[0] = ((region0 + local_sz[0] - 1) / local_sz[0]) * local_sz[0];
 
   /* setup the kernel and run. */
   if(image->image_type == CL_MEM_OBJECT_IMAGE2D) {
+    if(align16){
+      extern char cl_internal_copy_buffer_to_image_2d_align16_str[];
+      extern size_t 
+ cl_internal_copy_buffer_to_image_2d_align16_str_size;
+
+      ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_COPY_BUFFER_TO_IMAGE_2D_ALIGN16,
+                cl_internal_copy_buffer_to_image_2d_align16_str,
+                (size_t)cl_internal_copy_buffer_to_image_2d_align16_str_size, NULL);
+    }
+    else{
       extern char cl_internal_copy_buffer_to_image_2d_str[];
       extern size_t cl_internal_copy_buffer_to_image_2d_str_size;
 
       ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_COPY_BUFFER_TO_IMAGE_2D,
           cl_internal_copy_buffer_to_image_2d_str, (size_t)cl_internal_copy_buffer_to_image_2d_str_size, NULL);
+    }
   }else if(image->image_type == CL_MEM_OBJECT_IMAGE3D) {
       extern char cl_internal_copy_buffer_to_image_3d_str[];
       extern size_t cl_internal_copy_buffer_to_image_3d_str_size;
@@ -1862,13 +1889,13 @@ cl_mem_copy_buffer_to_image(cl_command_queue queue, cl_mem buffer, struct _cl_me
   cl_kernel_set_arg(ker, 5, sizeof(cl_int), &origin0);
   cl_kernel_set_arg(ker, 6, sizeof(cl_int), &dst_origin[1]);
   cl_kernel_set_arg(ker, 7, sizeof(cl_int), &dst_origin[2]);
-  cl_kernel_set_arg(ker, 8, sizeof(cl_int), &src_offset);
+  cl_kernel_set_arg(ker, 8, sizeof(cl_int), &kn_src_offset);
 
   ret = cl_command_queue_ND_range(queue, ker, 1, global_off, global_sz, local_sz);
 
   image->intel_fmt = intel_fmt;
   image->bpp = bpp;
-  image->w = image->w / bpp;
+  image->w = w_saved;
 
   return ret;
 }
diff --git a/src/kernels/cl_internal_copy_buffer_to_image_2d_align16.cl b/src/kernels/cl_internal_copy_buffer_to_image_2d_align16.cl
new file mode 100644
index 0000000..4e216ea
--- /dev/null
+++ b/src/kernels/cl_internal_copy_buffer_to_image_2d_align16.cl
@@ -0,0 +1,19 @@
+kernel void __cl_copy_buffer_to_image_2d_align16(__read_only image2d_t image, global uint4* buffer,
The access qualifier of this image should be __write_only.

+                                        unsigned int region0, unsigned int region1, unsigned int region2,
+                                        unsigned int dst_origin0, unsigned int dst_origin1, unsigned int dst_origin2,
+                                        unsigned int src_offset) {
+  int i = get_global_id(0);
+  int j = get_global_id(1);
+  int k = get_global_id(2);
It's 2d image, so remove unnecessary 3rd dimension ID.

+  uint4 color = (uint4)(0);
+  int2 dst_coord;
+  if((i >= region0) || (j>= region1) || (k>=region2))
Also unnecessary 'k' here.

+    return;
+  dst_coord.x = dst_origin0 + i;
+  dst_coord.y = dst_origin1 + j;
+  src_offset += (k * region1 + j) * region0 + i;
And here.

+  color = buffer[src_offset];
+  write_imageui(image, dst_coord, color); }
+
--
1.9.1

_______________________________________________
Beignet mailing list
Beignet at lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/beignet

From xionghu.luo at intel.com  Tue Mar 31 22:10:45 2015
From: xionghu.luo at intel.com (xionghu.luo at intel.com)
Date: Wed,  1 Apr 2015 13:10:45 +0800
Subject: [Beignet] [patch v2 1/2] Optimization of clEnqueueCopyBufferToImage
	for 16 aligned case.
Message-ID: <1427865046-19813-1-git-send-email-xionghu.luo@intel.com>

From: Luo Xionghu <xionghu.luo at intel.com>

We can change the image_channel_order to CL_RGBA and
image_channel_data_type to CL_UNSIGNED_INT32 for some special
case, thus 16 bytes can be read by one work item. Bandwidth is
fully used.

v2: merge patch 3 of initializing region0;
remove k dimension in kernel for 2d image.
Signed-off-by: Luo Xionghu <xionghu.luo at intel.com>
---
 src/CMakeLists.txt                                 |  2 +-
 src/cl_context.h                                   |  1 +
 src/cl_mem.c                                       | 44 ++++++++++++++++++----
 .../cl_internal_copy_buffer_to_image_2d_align16.cl | 18 +++++++++
 4 files changed, 56 insertions(+), 9 deletions(-)
 create mode 100644 src/kernels/cl_internal_copy_buffer_to_image_2d_align16.cl

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index da69532..4e67c71 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -51,7 +51,7 @@ cl_internal_copy_image_2d_to_2d_array cl_internal_copy_image_1d_array_to_1d_arra
 cl_internal_copy_image_2d_array_to_2d_array cl_internal_copy_image_2d_array_to_2d
 cl_internal_copy_image_2d_array_to_3d cl_internal_copy_image_3d_to_2d_array
 cl_internal_copy_image_2d_to_buffer cl_internal_copy_image_2d_to_buffer_align16 cl_internal_copy_image_3d_to_buffer
-cl_internal_copy_buffer_to_image_2d cl_internal_copy_buffer_to_image_3d
+cl_internal_copy_buffer_to_image_2d cl_internal_copy_buffer_to_image_2d_align16 cl_internal_copy_buffer_to_image_3d
 cl_internal_fill_buf_align8 cl_internal_fill_buf_align4
 cl_internal_fill_buf_align2 cl_internal_fill_buf_unalign
 cl_internal_fill_buf_align128 cl_internal_fill_image_1d
diff --git a/src/cl_context.h b/src/cl_context.h
index fdbfd2a..249fed8 100644
--- a/src/cl_context.h
+++ b/src/cl_context.h
@@ -63,6 +63,7 @@ enum _cl_internal_ker_type {
   CL_ENQUEUE_COPY_IMAGE_2D_TO_BUFFER_ALIGN16,
   CL_ENQUEUE_COPY_IMAGE_3D_TO_BUFFER,   //copy image 3d tobuffer
   CL_ENQUEUE_COPY_BUFFER_TO_IMAGE_2D,   //copy buffer to image 2d
+  CL_ENQUEUE_COPY_BUFFER_TO_IMAGE_2D_ALIGN16,
   CL_ENQUEUE_COPY_BUFFER_TO_IMAGE_3D,   //copy buffer to image 3d
   CL_ENQUEUE_FILL_BUFFER_UNALIGN,      //fill buffer with 1 aligne pattern, pattern size=1
   CL_ENQUEUE_FILL_BUFFER_ALIGN2,       //fill buffer with 2 aligne pattern, pattern size=2
diff --git a/src/cl_mem.c b/src/cl_mem.c
index b41ec14..0a2613d 100644
--- a/src/cl_mem.c
+++ b/src/cl_mem.c
@@ -1816,6 +1816,10 @@ cl_mem_copy_buffer_to_image(cl_command_queue queue, cl_mem buffer, struct _cl_me
   uint32_t intel_fmt, bpp;
   cl_image_format fmt;
   size_t origin0, region0;
+  size_t kn_src_offset;
+  int align16 = 0;
+  size_t align_size = 1;
+  size_t w_saved = 0;
 
   if(region[1] == 1) local_sz[1] = 1;
   if(region[2] == 1) local_sz[2] = 1;
@@ -1826,24 +1830,48 @@ cl_mem_copy_buffer_to_image(cl_command_queue queue, cl_mem buffer, struct _cl_me
   /* We use one kernel to copy the data. The kernel is lazily created. */
   assert(image->base.ctx == buffer->ctx);
 
-  fmt.image_channel_order = CL_R;
-  fmt.image_channel_data_type = CL_UNSIGNED_INT8;
   intel_fmt = image->intel_fmt;
   bpp = image->bpp;
-  image->intel_fmt = cl_image_get_intel_format(&fmt);
-  image->w = image->w * image->bpp;
-  image->bpp = 1;
+  w_saved = image->w;
   region0 = region[0] * bpp;
-  origin0 = dst_origin[0] * bpp;
+  kn_src_offset = src_offset;
+  if((image->image_type == CL_MEM_OBJECT_IMAGE2D) && ((image->w * image->bpp) % 16 == 0) &&
+      ((dst_origin[0] * bpp) % 16 == 0) && (region0 % 16 == 0) && (src_offset % 16 == 0)){
+    fmt.image_channel_order = CL_RGBA;
+    fmt.image_channel_data_type = CL_UNSIGNED_INT32;
+    align16 = 1;
+    align_size = 16;
+  }
+  else{
+    fmt.image_channel_order = CL_R;
+    fmt.image_channel_data_type = CL_UNSIGNED_INT8;
+    align_size = 1;
+  }
+  image->intel_fmt = cl_image_get_intel_format(&fmt);
+  image->w = (image->w * image->bpp) / align_size;
+  image->bpp = align_size;
+  region0 = (region[0] * bpp) / align_size;
+  origin0 = (dst_origin[0] * bpp) / align_size;
+  kn_src_offset /= align_size;
   global_sz[0] = ((region0 + local_sz[0] - 1) / local_sz[0]) * local_sz[0];
 
   /* setup the kernel and run. */
   if(image->image_type == CL_MEM_OBJECT_IMAGE2D) {
+    if(align16){
+      extern char cl_internal_copy_buffer_to_image_2d_align16_str[];
+      extern size_t cl_internal_copy_buffer_to_image_2d_align16_str_size;
+
+      ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_COPY_BUFFER_TO_IMAGE_2D_ALIGN16,
+                cl_internal_copy_buffer_to_image_2d_align16_str,
+                (size_t)cl_internal_copy_buffer_to_image_2d_align16_str_size, NULL);
+    }
+    else{
       extern char cl_internal_copy_buffer_to_image_2d_str[];
       extern size_t cl_internal_copy_buffer_to_image_2d_str_size;
 
       ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_COPY_BUFFER_TO_IMAGE_2D,
           cl_internal_copy_buffer_to_image_2d_str, (size_t)cl_internal_copy_buffer_to_image_2d_str_size, NULL);
+    }
   }else if(image->image_type == CL_MEM_OBJECT_IMAGE3D) {
       extern char cl_internal_copy_buffer_to_image_3d_str[];
       extern size_t cl_internal_copy_buffer_to_image_3d_str_size;
@@ -1862,13 +1890,13 @@ cl_mem_copy_buffer_to_image(cl_command_queue queue, cl_mem buffer, struct _cl_me
   cl_kernel_set_arg(ker, 5, sizeof(cl_int), &origin0);
   cl_kernel_set_arg(ker, 6, sizeof(cl_int), &dst_origin[1]);
   cl_kernel_set_arg(ker, 7, sizeof(cl_int), &dst_origin[2]);
-  cl_kernel_set_arg(ker, 8, sizeof(cl_int), &src_offset);
+  cl_kernel_set_arg(ker, 8, sizeof(cl_int), &kn_src_offset);
 
   ret = cl_command_queue_ND_range(queue, ker, 1, global_off, global_sz, local_sz);
 
   image->intel_fmt = intel_fmt;
   image->bpp = bpp;
-  image->w = image->w / bpp;
+  image->w = w_saved;
 
   return ret;
 }
diff --git a/src/kernels/cl_internal_copy_buffer_to_image_2d_align16.cl b/src/kernels/cl_internal_copy_buffer_to_image_2d_align16.cl
new file mode 100644
index 0000000..e4cef73
--- /dev/null
+++ b/src/kernels/cl_internal_copy_buffer_to_image_2d_align16.cl
@@ -0,0 +1,18 @@
+kernel void __cl_copy_buffer_to_image_2d_align16(__read_only image2d_t image, global uint4* buffer,
+                                        unsigned int region0, unsigned int region1, unsigned int region2,
+                                        unsigned int dst_origin0, unsigned int dst_origin1, unsigned int dst_origin2,
+                                        unsigned int src_offset)
+{
+  int i = get_global_id(0);
+  int j = get_global_id(1);
+  uint4 color = (uint4)(0);
+  int2 dst_coord;
+  if((i >= region0) || (j>= region1))
+    return;
+  dst_coord.x = dst_origin0 + i;
+  dst_coord.y = dst_origin1 + j;
+  src_offset += j * region0 + i;
+  color = buffer[src_offset];
+  write_imageui(image, dst_coord, color);
+}
+
-- 
1.9.1


From xionghu.luo at intel.com  Tue Mar 31 22:10:46 2015
From: xionghu.luo at intel.com (xionghu.luo at intel.com)
Date: Wed,  1 Apr 2015 13:10:46 +0800
Subject: [Beignet] [patch v2 2/2] add benchmark for copy data from buffer to
	image.
In-Reply-To: <1427865046-19813-1-git-send-email-xionghu.luo@intel.com>
References: <1427865046-19813-1-git-send-email-xionghu.luo@intel.com>
Message-ID: <1427865046-19813-2-git-send-email-xionghu.luo@intel.com>

From: Luo Xionghu <xionghu.luo at intel.com>

v2: use random input data; update comments.
Signed-off-by: Luo Xionghu <xionghu.luo at intel.com>
---
 benchmark/CMakeLists.txt                     |  1 +
 benchmark/benchmark_copy_buffer_to_image.cpp | 66 ++++++++++++++++++++++++++++
 2 files changed, 67 insertions(+)
 create mode 100644 benchmark/benchmark_copy_buffer_to_image.cpp

diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt
index 7bd61ee..3e43a21 100644
--- a/benchmark/CMakeLists.txt
+++ b/benchmark/CMakeLists.txt
@@ -15,6 +15,7 @@ set (benchmark_sources
   benchmark_use_host_ptr_buffer.cpp
   benchmark_read_buffer.cpp
   benchmark_read_image.cpp
+  benchmark_copy_buffer_to_image.cpp
   benchmark_copy_image_to_buffer.cpp)
 
 
diff --git a/benchmark/benchmark_copy_buffer_to_image.cpp b/benchmark/benchmark_copy_buffer_to_image.cpp
new file mode 100644
index 0000000..2177cfe
--- /dev/null
+++ b/benchmark/benchmark_copy_buffer_to_image.cpp
@@ -0,0 +1,66 @@
+#include <string.h>
+#include "utests/utest_helper.hpp"
+#include <sys/time.h>
+
+#define IMAGE_BPP 2
+
+double benchmark_copy_buffer_to_image(void)
+{
+  struct timeval start,stop;
+  const size_t w = 960 * 4;
+  const size_t h = 540 * 4;
+  const size_t sz = IMAGE_BPP * w * h;
+  cl_image_format format;
+  cl_image_desc desc;
+
+  memset(&desc, 0x0, sizeof(cl_image_desc));
+  memset(&format, 0x0, sizeof(cl_image_format));
+
+  // Setup image and buffer
+  buf_data[0] = (unsigned short*) malloc(sz);
+  for (uint32_t i = 0; i < w*h; ++i) {
+    ((unsigned short*)buf_data[0])[i] = (rand() & 0xffff);
+  }
+
+  format.image_channel_order = CL_R;
+  format.image_channel_data_type = CL_UNSIGNED_INT16;
+  desc.image_type = CL_MEM_OBJECT_IMAGE2D;
+  desc.image_width = w;
+  desc.image_height = h;
+  desc.image_row_pitch = 0;
+  OCL_CREATE_BUFFER(buf[0], CL_MEM_COPY_HOST_PTR, sz, buf_data[0]);
+  OCL_CREATE_IMAGE(buf[1], 0, &format, &desc, NULL);
+
+  /*copy buffer to image*/
+  size_t origin[3] = {0, 0, 0};
+  size_t region[3] = {w, h, 1};
+
+  OCL_CALL (clEnqueueCopyBufferToImage, queue, buf[0], buf[1], 0, origin, region,
+            0, NULL, NULL);
+  OCL_FINISH();
+  OCL_MAP_BUFFER_GTT(1);
+  /*check result*/
+  for (uint32_t j = 0; j < h; ++j)
+    for (uint32_t i = 0; i < w; i++)
+    {
+      OCL_ASSERT(((unsigned short*)buf_data[0])[j * w + i] == ((unsigned short*)buf_data[1])[j * w + i]);
+    }
+  OCL_UNMAP_BUFFER_GTT(1);
+  gettimeofday(&start,0);
+
+  for (uint32_t i=0; i<100; i++) {
+    OCL_CALL (clEnqueueCopyBufferToImage, queue, buf[0], buf[1], 0, origin, region,
+            0, NULL, NULL);
+  }
+  OCL_FINISH();
+
+  gettimeofday(&stop,0);
+  free(buf_data[0]);
+  buf_data[0] = NULL;
+
+  double elapsed = time_subtract(&stop, &start, 0);
+
+  return BANDWIDTH(sz * 100, elapsed);
+}
+
+MAKE_BENCHMARK_FROM_FUNCTION(benchmark_copy_buffer_to_image);
-- 
1.9.1