[Libreoffice-commits] core.git: Branch 'feature/formula-core-rework' - 4 commits - sc/source

Haidong Lian haidong at multicorewareinc.com
Wed Jul 24 13:15:01 PDT 2013


 sc/source/core/opencl/formulagroupcl.cxx |  299 ++-
 sc/source/core/opencl/oclkernels.hxx     |  172 -
 sc/source/core/opencl/openclwrapper.cxx  | 2751 ++++++++++++-------------------
 sc/source/core/opencl/openclwrapper.hxx  |  185 --
 4 files changed, 1510 insertions(+), 1897 deletions(-)

New commits:
commit 53b7e512a739b89115343d19aafa20f08ca22770
Author: Haidong Lian <haidong at multicorewareinc.com>
Date:   Wed Jul 24 15:16:55 2013 -0400

    Add support for double in OpenCL kernel.
    
    * modified coding style.
    * merged arithmetic operators together.
    * added support for double in OpenCL kernel.
    * added an environment variable named SC_FLOAT, which, when set it to 1, will
      force to use float in OpenCL kernel. If not set, we will detect GPU, and if
      GPU supports double, use double for kernel, otherwise use float for kernel.
    
    Conflicts:
    	sc/source/core/opencl/openclwrapper.cxx
    	sc/source/core/opencl/openclwrapper.hxx
    
    Change-Id: I7cdec458d72837d3b22ba50c6d28f78797ee0d3b

diff --git a/sc/source/core/opencl/formulagroupcl.cxx b/sc/source/core/opencl/formulagroupcl.cxx
index 8de7713..cd0c694 100644
--- a/sc/source/core/opencl/formulagroupcl.cxx
+++ b/sc/source/core/opencl/formulagroupcl.cxx
@@ -104,7 +104,7 @@ bool FormulaGroupInterpreterOpenCL::interpret(ScDocument& rDoc, const ScAddress&
     size_t rowSize = xGroup->mnLength;
     fprintf(stderr,"rowSize at begin is ...%ld.\n",(long)rowSize);
     // The row quantity can be gotten from p2->GetArrayLength()
-    int nCount1 = 0, nCount2 = 0, nCount3 = 0;
+    uint nCount1 = 0, nCount2 = 0, nCount3 = 0;
     int nOclOp = 0;
     double *rResult = NULL; // Point to the output data from GPU
     rResult = (double *)malloc(sizeof(double) * rowSize*2);// For 2 columns(B,C)
@@ -115,18 +115,41 @@ bool FormulaGroupInterpreterOpenCL::interpret(ScDocument& rDoc, const ScAddress&
     }
     memset(rResult,0,rowSize);
     float * fpOclSrcData = NULL; // Point to the input data from CPU
+    double * dpOclSrcData = NULL;
     uint * npOclStartPos = NULL; // The first position for calculation,for example,the A1 in (=MAX(A1:A100))
     uint * npOclEndPos   = NULL; // The last position for calculation,for example, the A100 in (=MAX(A1:A100))
     float * fpLeftData   = NULL; // Left input for binary operator(+,-,*,/),for example,(=leftData+rightData)
     float * fpRightData  = NULL; // Right input for binary operator(+,-,*,/),for example,(=leftData/rightData)
                                  // The rightData can't be zero for "/"
+    double * dpLeftData = NULL;
+    double * dpRightData = NULL;
+
+    float * fpSaveData=NULL;            //It is a temp pointer point the preparing memory;
+    float * fpSumProMergeLfData = NULL; //It merge the more col to one col is the left operator
+    float * fpSumProMergeRtData = NULL; //It merge the more col to one col is the right operator
+    double * dpSaveData=NULL;
+    double * dpSumProMergeLfData = NULL;
+    double * dpSumProMergeRtData = NULL;
+    uint * npSumSize=NULL;      //It is a array to save the matix sizt(col *row)
+    int nSumproductSize=0;      //It is the merge array size
+    bool aIsAlloc=false;        //It is a flag to judge the fpSumProMergeLfData existed
+    unsigned int nCountMatix=0; //It is a count to save the calculate times
     static OclCalc ocl_calc;
+    bool isSumProduct=false;
     if(ocl_calc.GetOpenclState())
     {
         // Don't know how large the size will be applied previously, so create them as the rowSize or 65536
         // Don't know which formulae will be used previously, so create buffers for different formulae used probably
-        ocl_calc.CreateBuffer(fpOclSrcData,npOclStartPos,npOclEndPos,rowSize);
-        ocl_calc.CreateBuffer(fpLeftData,fpRightData,rowSize);
+        if(ocl_calc.gpuEnv.mnKhrFp64Flag==1 || ocl_calc.gpuEnv.mnAmdFp64Flag==1)
+        {
+            ocl_calc.CreateBuffer64Bits(dpOclSrcData,npOclStartPos,npOclEndPos,rowSize);
+            ocl_calc.CreateBuffer64Bits(dpLeftData,dpRightData,rowSize);
+        }
+        else
+        {
+            ocl_calc.CreateBuffer32Bits(fpOclSrcData,npOclStartPos,npOclEndPos,rowSize);
+            ocl_calc.CreateBuffer32Bits(fpLeftData,fpRightData,rowSize);
+        }
         //printf("pptrr is %d,%d,%d\n",fpOclSrcData,npOclStartPos,npOclEndPos);
     }
 ///////////////////////////////////////////////////////////////////////////////////////////
@@ -159,12 +182,55 @@ bool FormulaGroupInterpreterOpenCL::interpret(ScDocument& rDoc, const ScAddress&
                     if (!p2->IsEndFixed())
                         nRowEnd += i;
                     size_t nRowSize = nRowEnd - nRowStart + 1;
+                    //store the a matix`s rowsize and colsize,use it to calculate the matix`s size
+                    ocl_calc.nFormulaRowSize = nRowSize;
+                    ocl_calc.nFormulaColSize = nColSize;
                     ScMatrixRef pMat(new ScMatrix(nColSize, nRowSize, 0.0));
                     if(ocl_calc.GetOpenclState())
                     {
                         npOclStartPos[i] = nRowStart; // record the start position
                         npOclEndPos[i]   = nRowEnd;   // record the end position
                     }
+                    int nTempOpcode;
+                    const formula::FormulaToken* pTemp = p;
+                    pTemp=aCode2.Next();
+                    nTempOpcode=pTemp->GetOpCode();
+                    while(1)
+                    {
+                        nTempOpcode=pTemp->GetOpCode();
+                        if(nTempOpcode!=ocOpen && nTempOpcode!=ocPush)
+                            break;
+                         pTemp=aCode2.Next();
+                    }
+                    if((!aIsAlloc) && (ocl_calc.GetOpenclState())&& (nTempOpcode == ocSumProduct))
+                    {
+                        //nColSize * rowSize is the data size , but except the the head of data will use less the nRowSize
+                        //the other all use nRowSize times . and it must aligen so add nRowSize-1.
+                        nSumproductSize = nRowSize+nColSize * rowSize*nRowSize-1;
+                        if(ocl_calc.gpuEnv.mnKhrFp64Flag==1 || ocl_calc.gpuEnv.mnAmdFp64Flag==1)
+                            ocl_calc.CreateBuffer64Bits(dpSumProMergeLfData,dpSumProMergeRtData,npSumSize,nSumproductSize,rowSize);
+                        else
+                            ocl_calc.CreateBuffer32Bits(fpSumProMergeLfData,fpSumProMergeRtData,npSumSize,nSumproductSize,rowSize);
+                        aIsAlloc = true;
+                        isSumProduct=true;
+                    }
+                    if(isSumProduct)
+                    {
+                        if(ocl_calc.gpuEnv.mnKhrFp64Flag==1 || ocl_calc.gpuEnv.mnAmdFp64Flag==1)
+                        {
+                            if(nCountMatix%2==0)
+                                dpSaveData = dpSumProMergeLfData;
+                            else
+                                dpSaveData = dpSumProMergeRtData;
+                        }
+                        else
+                        {
+                            if(nCountMatix%2==0)
+                                fpSaveData = fpSumProMergeLfData;
+                            else
+                                fpSaveData = fpSumProMergeRtData;
+                        }
+                    }
                     for (size_t nCol = 0; nCol < nColSize; ++nCol)
                     {
                         const double* pArray = rArrays[nCol];
@@ -177,9 +243,21 @@ bool FormulaGroupInterpreterOpenCL::interpret(ScDocument& rDoc, const ScAddress&
                         {
                             for( size_t u=nRowStart; u<=nRowEnd; u++ )
                             {
-                                // Many video cards can't support double type in kernel, so need transfer the double to float
-                                fpOclSrcData[u] = (float)pArray[u];
-                                //fprintf(stderr,"fpOclSrcData[%d] is %f.\n",u,fpOclSrcData[u]);
+                                if(ocl_calc.gpuEnv.mnKhrFp64Flag==1 || ocl_calc.gpuEnv.mnAmdFp64Flag==1)
+                                {
+                                    dpOclSrcData[u] = pArray[u];
+                                    //fprintf(stderr,"dpOclSrcData[%d] is %f.\n",u,dpOclSrcData[u]);
+                                    if(isSumProduct)
+                                        dpSaveData[u+nRowSize*nCol + nRowStart* nColSize * nRowSize-nRowStart] = pArray[u];
+                                }
+                                else
+                                {
+                                    // Many video cards can't support double type in kernel, so need transfer the double to float
+                                    fpOclSrcData[u] = (float)pArray[u];
+                                    //fprintf(stderr,"fpOclSrcData[%d] is %f.\n",u,fpOclSrcData[u]);
+                                    if(isSumProduct)
+                                        fpSaveData[u+nRowSize*nCol + nRowStart* nColSize * nRowSize-nRowStart] = (float)pArray[u];
+                                }
                             }
                         }
 
@@ -195,6 +273,11 @@ bool FormulaGroupInterpreterOpenCL::interpret(ScDocument& rDoc, const ScAddress&
 
                     ScMatrixToken aTok(pMat);
                     aCode2.AddToken(aTok);
+                    if(isSumProduct)
+                    {
+                        npSumSize[nCountMatix/2] =nRowSize*nColSize;
+                        nCountMatix++;
+                    }
                 }
                 break;
                 default:
@@ -214,21 +297,32 @@ bool FormulaGroupInterpreterOpenCL::interpret(ScDocument& rDoc, const ScAddress&
                 OpCode eOp = pCur->GetOpCode();
                 if(eOp==0)
                 {
-                     if(nCount3%2==0)
-                         fpLeftData[nCount1++] = (float)pCur->GetDouble();
-                     else
-                         fpRightData[nCount2++] = (float)pCur->GetDouble();
-                     nCount3++;
+                    if(ocl_calc.gpuEnv.mnKhrFp64Flag==1 || ocl_calc.gpuEnv.mnAmdFp64Flag==1)
+                    {
+                        if(nCount3%2==0)
+                            dpLeftData[nCount1++] = pCur->GetDouble();
+                        else
+                            dpRightData[nCount2++] = pCur->GetDouble();
+                        nCount3++;
+                    }
+                    else
+                    {
+                        if(nCount3%2==0)
+                            fpLeftData[nCount1++] = (float)pCur->GetDouble();
+                        else
+                            fpRightData[nCount2++] = (float)pCur->GetDouble();
+                        nCount3++;
+                    }
                 }
-                else if( eOp!=ocOpen && eOp!=ocClose )
+                else if( eOp!=ocOpen && eOp!=ocClose &&eOp != ocSep)
                     nOclOp = eOp;
 
 //              if(count1>0){//dbg
-//                  fprintf(stderr,"leftData is %f.\n",leftData[count1-1]);
+//                  fprintf(stderr,"leftData is %f.\n",fpLeftData[count1-1]);
 //                  count1--;
 //              }
 //              if(count2>0){//dbg
-//                  fprintf(stderr,"rightData is %f.\n",rightData[count2-1]);
+//                  fprintf(stderr,"rightData is %f.\n",fpRightData[count2-1]);
 //                  count2--;
 //              }
             }
@@ -249,52 +343,99 @@ bool FormulaGroupInterpreterOpenCL::interpret(ScDocument& rDoc, const ScAddress&
     // For GPU calculation
     if(getenv("SC_GPU")&&ocl_calc.GetOpenclState())
     {
-            fprintf(stderr,"ggGPU flow...\n\n");
-            printf(" oclOp is... %d\n",nOclOp);
-            osl_getSystemTime(&aTimeBefore); //timer
+        fprintf(stderr,"ggGPU flow...\n\n");
+        printf(" oclOp is... %d\n",nOclOp);
+        osl_getSystemTime(&aTimeBefore); //timer
+        if(ocl_calc.gpuEnv.mnKhrFp64Flag==1 || ocl_calc.gpuEnv.mnAmdFp64Flag==1)
+        {
+            fprintf(stderr,"ggGPU double precision flow...\n\n");
+            //double precision
             switch(nOclOp)
             {
                 case ocAdd:
-                    ocl_calc.OclHostSignedAdd32Bits(fpLeftData,fpRightData,rResult,nCount1);
+                    ocl_calc.OclHostArithmeticOperator64Bits("oclSignedAdd",dpLeftData,dpRightData,rResult,nCount1);
                     break;
                 case ocSub:
-                    ocl_calc.OclHostSignedSub32Bits(fpLeftData,fpRightData,rResult,nCount1);
+                    ocl_calc.OclHostArithmeticOperator64Bits("oclSignedSub",dpLeftData,dpRightData,rResult,nCount1);
                     break;
                 case ocMul:
-                    ocl_calc.OclHostSignedMul32Bits(fpLeftData,fpRightData,rResult,nCount1);
+                    ocl_calc.OclHostArithmeticOperator64Bits("oclSignedMul",dpLeftData,dpRightData,rResult,nCount1);
                     break;
                 case ocDiv:
-                    ocl_calc.OclHostSignedDiv32Bits(fpLeftData,fpRightData,rResult,nCount1);
+                    ocl_calc.OclHostArithmeticOperator64Bits("oclSignedDiv",dpLeftData,dpRightData,rResult,nCount1);
                     break;
                 case ocMax:
-                    ocl_calc.OclHostFormulaMax32Bits(fpOclSrcData,npOclStartPos,npOclEndPos,rResult,rowSize);
+                    ocl_calc.OclHostFormulaStatistics64Bits("oclFormulaMax",dpOclSrcData,npOclStartPos,npOclEndPos,rResult,rowSize);
                     break;
                 case ocMin:
-                    ocl_calc.OclHostFormulaMin32Bits(fpOclSrcData,npOclStartPos,npOclEndPos,rResult,rowSize);
+                    ocl_calc.OclHostFormulaStatistics64Bits("oclFormulaMin",dpOclSrcData,npOclStartPos,npOclEndPos,rResult,rowSize);
                     break;
                 case ocAverage:
-                    ocl_calc.OclHostFormulaAverage32Bits(fpOclSrcData,npOclStartPos,npOclEndPos,rResult,rowSize);
+                    ocl_calc.OclHostFormulaStatistics64Bits("oclFormulaAverage",dpOclSrcData,npOclStartPos,npOclEndPos,rResult,rowSize);
                     break;
                 case ocSum:
-                    //ocl_calc.OclHostFormulaSum(srcData,rangeStart,rangeEnd,rResult,rowSize);
+                    ocl_calc.OclHostFormulaStatistics64Bits("oclFormulaSum",dpOclSrcData,npOclStartPos,npOclEndPos,rResult,rowSize);
                     break;
                 case ocCount:
-                    //ocl_calc.OclHostFormulaCount(rangeStart,rangeEnd,rResult,rowSize);
+                    ocl_calc.OclHostFormulaCount64Bits(npOclStartPos,npOclEndPos,rResult,rowSize);
                     break;
                 case ocSumProduct:
-                    //ocl_calc.OclHostFormulaSumProduct(srcData,rangeStart,rangeEnd,rResult,rowSize);
+                    ocl_calc.OclHostFormulaSumProduct64Bits(dpSumProMergeLfData,dpSumProMergeRtData,npSumSize,rResult,rowSize);
                     break;
                 default:
                     fprintf(stderr,"No OpenCL function for this calculation.\n");
                     break;
-            }
-            /////////////////////////////////////////////////////
-            osl_getSystemTime(&aTimeAfter);
-            double diff = getTimeDiff(aTimeAfter, aTimeBefore);
-            //if (diff >= 1.0)
+              }
+        }
+        else
+        {
+            fprintf(stderr,"ggGPU float precision flow...\n\n");
+            //float precision
+            switch(nOclOp)
             {
-                fprintf(stderr,"OpenCL,diff...%f.\n",diff);
-            }
+                case ocAdd:
+                    ocl_calc.OclHostArithmeticOperator32Bits("oclSignedAdd",fpLeftData,fpRightData,rResult,nCount1);
+                    break;
+                case ocSub:
+                    ocl_calc.OclHostArithmeticOperator32Bits("oclSignedSub",fpLeftData,fpRightData,rResult,nCount1);
+                    break;
+                case ocMul:
+                    ocl_calc.OclHostArithmeticOperator32Bits("oclSignedMul",fpLeftData,fpRightData,rResult,nCount1);
+                    break;
+                case ocDiv:
+                    ocl_calc.OclHostArithmeticOperator32Bits("oclSignedDiv",fpLeftData,fpRightData,rResult,nCount1);
+                    break;
+                case ocMax:
+                    ocl_calc.OclHostFormulaStatistics32Bits("oclFormulaMax",fpOclSrcData,npOclStartPos,npOclEndPos,rResult,rowSize);
+                    break;
+                case ocMin:
+                    ocl_calc.OclHostFormulaStatistics32Bits("oclFormulaMin",fpOclSrcData,npOclStartPos,npOclEndPos,rResult,rowSize);
+                    break;
+                case ocAverage:
+                    ocl_calc.OclHostFormulaStatistics32Bits("oclFormulaAverage",fpOclSrcData,npOclStartPos,npOclEndPos,rResult,rowSize);
+                    break;
+                case ocSum:
+                    ocl_calc.OclHostFormulaStatistics32Bits("oclFormulaSum",fpOclSrcData,npOclStartPos,npOclEndPos,rResult,rowSize);
+                    break;
+                case ocCount:
+                    ocl_calc.OclHostFormulaCount32Bits(npOclStartPos,npOclEndPos,rResult,rowSize);
+                    break;
+                case ocSumProduct:
+                    ocl_calc.OclHostFormulaSumProduct32Bits(fpSumProMergeLfData,fpSumProMergeRtData,npSumSize,rResult,rowSize);
+                    break;
+                default:
+                    fprintf(stderr,"No OpenCL function for this calculation.\n");
+                    break;
+              }
+        }
+
+        /////////////////////////////////////////////////////
+        osl_getSystemTime(&aTimeAfter);
+        double diff = getTimeDiff(aTimeAfter, aTimeBefore);
+        //if (diff >= 1.0)
+        {
+            fprintf(stderr,"OpenCL,diff...%f.\n",diff);
+        }
 /////////////////////////////////////////////////////
 
 //rResult[i];
@@ -302,17 +443,12 @@ bool FormulaGroupInterpreterOpenCL::interpret(ScDocument& rDoc, const ScAddress&
 //               fprintf(stderr,"After GPU,rRsults[%d] is ...%f\n",i,rResult[i]);
 //           }
 
-            // Insert the double data, in rResult[i] back into the document
-            rDoc.SetFormulaResults(rTopPos, rResult, xGroup->mnLength);
-        }
-
-        if(rResult)
-            free(rResult);
+        // Insert the double data, in rResult[i] back into the document
+        rDoc.SetFormulaResults(rTopPos, rResult, xGroup->mnLength);
+    }
 
-        if(getenv("SC_GPUSAMPLE")){
-            //fprintf(stderr,"FormulaGroupInterpreter::interpret(),iniflag...%d\n",ocl_calc.GetOpenclState());
-            //ocl_calc.OclTest();//opencl test sample for debug
-        }
+    if(rResult)
+        free(rResult);
 
     return true;
 }
diff --git a/sc/source/core/opencl/oclkernels.hxx b/sc/source/core/opencl/oclkernels.hxx
index c231dbd..7c9bcaf 100644
--- a/sc/source/core/opencl/oclkernels.hxx
+++ b/sc/source/core/opencl/oclkernels.hxx
@@ -12,175 +12,119 @@
 
 #ifndef USE_EXTERNAL_KERNEL
 #define KERNEL( ... )# __VA_ARGS__
-
+// Double precision is a default of spreadsheets
+// cl_khr_fp64: Khronos extension
+// cl_amd_fp64: AMD extension
+// use build option outside to define fp_t
 /////////////////////////////////////////////
 const char *kernel_src = KERNEL(
-__kernel void hello(__global uint *buffer)
-
-{
-    size_t idx = get_global_id(0);
-    buffer[idx]=idx;
-}
-
-__kernel void oclformula(__global float *data,
-                       const uint type)
-{
-    const unsigned int i = get_global_id(0);
-
-    switch (type)
-    {
-        case 0:          //MAX
-        {
-            //printf("%i ?%f>%f\n",i,data[2*i],data[2*i+1]);
-            if(data[2*i]>data[2*i+1])
-                data[i] = data[2*i];
-            else
-                data[i] = data[2*i+1];
-            break;
-        }
-        case 1:          //MIN
-        {
-            //printf("%d ?%d<%d\n",i,data[2*i],data[2*i+1]);
-            if(data[2*i]<data[2*i+1])
-                data[i] = data[2*i];
-            else
-                data[i] = data[2*i+1];
-            break;
-        }
-        case 2:          //SUM
-        case 3:          //AVG
-        {
-            //printf("%d %d+%d\n",i,data[2*i],data[2*i+1]);
-            data[i] = data[2*i] + data[2*i+1];
-            break;
-        }
-        default:
-            break;
+\n#ifdef KHR_DP_EXTENSION\n
+\n#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n
+\n#elif AMD_DP_EXTENSION\n
+\n#pragma OPENCL EXTENSION cl_amd_fp64 : enable\n
+\n#else\n
+\n#endif\n
 
-    }
-}
-
-
-__kernel void oclSignedAdd(__global float *ltData,__global float *rtData,__global float *otData)
+__kernel void oclSignedAdd(__global fp_t *ltData,__global fp_t *rtData,__global fp_t *otData)
 {
     const unsigned int id = get_global_id(0);
     otData[id] = ltData[id] + rtData[id];
 }
 
 
-__kernel void oclSignedSub(__global float *ltData,__global float *rtData,__global float *otData)
+__kernel void oclSignedSub(__global fp_t *ltData,__global fp_t *rtData,__global fp_t *otData)
 {
     const unsigned int id = get_global_id(0);
     otData[id] = ltData[id] - rtData[id];
-
 }
 
-__kernel void oclSignedMul(__global float *ltData,__global float *rtData,__global float *otData)
+__kernel void oclSignedMul(__global fp_t *ltData,__global fp_t *rtData,__global fp_t *otData)
 {
     int id = get_global_id(0);
     otData[id] =ltData[id] * rtData[id];
 }
 
-__kernel void oclSignedDiv(__global float *ltData,__global float *rtData,__global float *otData)
+__kernel void oclSignedDiv(__global fp_t *ltData,__global fp_t *rtData,__global fp_t *otData)
 {
     const unsigned int id = get_global_id(0);
     otData[id] = ltData[id] / rtData[id];
 }
 
-__kernel void oclFormulaMin(__global float *input,__global int *start,__global int *end,__global float *output)
+__kernel void oclFormulaMin(__global fp_t *input,__global int *start,__global int *end,__global fp_t *output)
 {
     const unsigned int id = get_global_id(0);
-    int i=0;
     unsigned int startFlag = start[id];
     unsigned int endFlag = end[id];
-    float min = input[startFlag];
-    for(i=startFlag;i<=endFlag;i++)
+    fp_t min = input[startFlag];
+    for(int i=startFlag;i<=endFlag;i++)
     {
         if(input[i]<min)
             min = input[i];
     }
     output[id] = min;
-
 }
 
-__kernel void oclFormulaMax(__global float *input,__global int *start,__global int *end,__global float *output)
+__kernel void oclFormulaMax(__global fp_t *input,__global int *start,__global int *end,__global fp_t *output)
 {
     const unsigned int id = get_global_id(0);
-    int i=0;
     unsigned int startFlag = start[id];
     unsigned int endFlag = end[id];
-    float max = input[startFlag];
-    for(i=startFlag;i<=endFlag;i++)
+    fp_t max = input[startFlag];
+    for(int i=startFlag;i<=endFlag;i++)
     {
         if(input[i]>max)
             max = input[i];
     }
     output[id] = max;
-
 }
 //Sum
-__kernel void oclFormulaSum(__global float *input,__global int *start,__global int *end,__global float *output)
+__kernel void oclFormulaSum(__global fp_t *input,__global int *start,__global int *end,__global fp_t *output)
 {
     const unsigned int nId = get_global_id(0);
-    float fSum = 0.0f;
+    fp_t fSum = 0.0;
     for(int i = start[nId]; i<=end[nId]; i++)
         fSum += input[i];
     output[nId] = fSum ;
 }
 //Count
-__kernel void oclFormulaCount(__global int *start,__global int *end,__global float *output)
+__kernel void oclFormulaCount(__global int *start,__global int *end,__global fp_t *output)
 {
     const unsigned int nId = get_global_id(0);
     output[nId] = end[nId] - start[nId] + 1;
 }
 
-__kernel void oclFormulaAverage(__global float *input,__global int *start,__global int *end,__global float *output)
+__kernel void oclFormulaAverage(__global fp_t *input,__global int *start,__global int *end,__global fp_t *output)
 {
     const unsigned int id = get_global_id(0);
-    int i=0;
-    float sum=0;
-    for(i = start[id];i<=end[id];i++)
+    fp_t sum=0.0;
+    for(int i = start[id];i<=end[id];i++)
         sum += input[i];
     output[id] = sum / (end[id]-start[id]+1);
 }
 
 //Sumproduct
-__kernel void oclFormulaSumproduct(__global float *firstCol,__global float *secondCol,__global int *start,__global int *end,__global float *output)
+__kernel void oclFormulaSumproduct(__global fp_t *firstCol,__global int* npSumSize,__global fp_t *output,uint nMatixSize)
 {
-    const int nId = get_global_id(0);
-    int nCount     = start[nId] - end[nId] + 1;
-    int nStartA  = start[nId*2];
-    int nStartB  = start[nId*2+1];
-    for(int i = 0; i<nCount; i++)
-        output[nId] += firstCol[nStartA+i]*secondCol[nStartB+i];
+    const unsigned int id = get_global_id(0);
+    unsigned int nSumSize = npSumSize[id];
+    fp_t fSum = 0.0;
+    for(int i=0;i<nSumSize;i++)
+        fSum += firstCol[i + nMatixSize * id];
+    output[id] = fSum;
 }
 
-__kernel void oclFormulaMinverse(__global float *data,
-                       const uint type)
+__kernel void oclFormulaMinverse(__global fp_t *data, const uint type)
 {
 
 }
 
-// Double precision is a requirement of spreadsheets
-// cl_khr_fp64: Khronos extension
-// cl_amd_fp64: AMD extension
-\n#if 0 \n
-\n#if defined(cl_khr_fp64) \n
-\n#pragma OPENCL EXTENSION cl_khr_fp64 : enable \n
-\n#elif defined(cl_amd_fp64) \n
-\n#pragma OPENCL EXTENSION cl_amd_fp64 : enable \n
-\n#endif \n
-\ntypedef double fp_t; \n
-\n#else \n
-\ntypedef float fp_t; \n
-\n#endif \n
 
 __kernel void oclAverageDelta(__global fp_t *values, __global fp_t *subtract, uint start, uint end, __global fp_t *output)
 {
     const unsigned int id = get_global_id(0);
 
     // Average
-    fp_t fSum = 0.0f;
+    fp_t fSum = 0.0;
     for(int i = start; i < end; i++)
         fSum += values[i];
     fp_t fVal = fSum/(end-start);
@@ -194,7 +138,7 @@ __kernel void oclMaxDelta(__global fp_t *values, __global fp_t *subtract, uint s
     const unsigned int id = get_global_id(0);
 
     // Max
-    float fMaxVal = values[start];
+    fp_t fMaxVal = values[start];
     for(int i=start+1;i < end;i++)
     {
         if(values[i]>fMaxVal)
@@ -210,7 +154,7 @@ __kernel void oclMinDelta(__global fp_t *values, __global fp_t *subtract, uint s
     const unsigned int id = get_global_id(0);
 
     // Min
-    float fMinVal = values[start];
+    fp_t fMinVal = values[start];
     for(int i=start+1;i < end;i++)
     {
         if(values[i]<fMinVal)
diff --git a/sc/source/core/opencl/openclwrapper.cxx b/sc/source/core/opencl/openclwrapper.cxx
index 03f4228..3d165a8 100644
--- a/sc/source/core/opencl/openclwrapper.cxx
+++ b/sc/source/core/opencl/openclwrapper.cxx
@@ -28,18 +28,30 @@ int OpenclDevice::isInited =0;
 #define OPENCL_DLL_NAME "opencllo.dll"
 #define OCLERR -1
 #define OCLSUCCESS 1
+
+#define TRUE 1
+#define FALSE 0
+
+#define OCL_INFO(str) \
+    printf("[OCL_INFO] %s\n",str);
+#define OCL_ERROR(str) \
+    fprintf(stderr,"[OCL_ERROR] %s\n",str);
+#define OCL_CHECK(value1,value2,str) \
+    if(value1!=value2) \
+        fprintf(stderr,"[OCL_ERROR] %s\n",str);
+
 HINSTANCE HOpenclDll = NULL;
-    void *OpenclDll = NULL;
+void * OpenclDll = NULL;
 
 int OpenclDevice::LoadOpencl()
 {
     //fprintf(stderr, " LoadOpenclDllxx... \n");
-    OpenclDll = static_cast<HINSTANCE>(HOpenclDll);
-    OpenclDll = LoadLibrary(OPENCL_DLL_NAME);
-    if (!static_cast<HINSTANCE>(OpenclDll))
+    OpenclDll = static_cast<HINSTANCE>( HOpenclDll );
+    OpenclDll = LoadLibrary( OPENCL_DLL_NAME );
+    if ( !static_cast<HINSTANCE>( OpenclDll ) )
     {
         fprintf(stderr, " Load opencllo.dll failed! \n");
-        FreeLibrary(static_cast<HINSTANCE>(OpenclDll));
+        FreeLibrary( static_cast<HINSTANCE>( OpenclDll ) );
         return OCLERR;
     }
     fprintf(stderr, " Load opencllo.dll successfully!\n");
@@ -49,26 +61,27 @@ int OpenclDevice::LoadOpencl()
 void OpenclDevice::FreeOpenclDll()
 {
     fprintf(stderr, " Free opencllo.dll ... \n");
-    if(!static_cast<HINSTANCE>(OpenclDll))
-        FreeLibrary(static_cast<HINSTANCE>(OpenclDll));
+    if ( !static_cast<HINSTANCE>( OpenclDll ) )
+        FreeLibrary( static_cast<HINSTANCE>( OpenclDll ) );
 }
 #endif
 
 int OpenclDevice::InitEnv()
 {
 #ifdef SAL_WIN32
-    while(1)
+    while( 1 )
     {
-        if(1==LoadOpencl())
-        break;
+        if( 1 == LoadOpencl() )
+            break;
     }
 #endif
-    InitOpenclRunEnv(0,NULL);
+    InitOpenclRunEnv( 0 );
     return 1;
 }
 
-int OpenclDevice::ReleaseOpenclRunEnv() {
-    ReleaseOpenclEnv(&gpuEnv);
+int OpenclDevice::ReleaseOpenclRunEnv()
+{
+    ReleaseOpenclEnv( &gpuEnv );
 #ifdef SAL_WIN32
     FreeOpenclDll();
 #endif
@@ -76,38 +89,36 @@ int OpenclDevice::ReleaseOpenclRunEnv() {
 }
 ///////////////////////////////////////////////////////
 ///////////////////////////////////////////////////////
-inline int OpenclDevice::AddKernelConfig(int kCount, const char *kName)
+inline int OpenclDevice::AddKernelConfig( int kCount, const char *kName )
 {
-    strcpy(gpuEnv.mArrykernelNames[kCount], kName);
+    strcpy( gpuEnv.mArrykernelNames[kCount], kName );
     gpuEnv.mnKernelCount++;
     return 0;
 }
 
 int OpenclDevice::RegistOpenclKernel()
 {
-    if (!gpuEnv.mnIsUserCreated)
-        memset(&gpuEnv, 0, sizeof(gpuEnv));
+    if ( !gpuEnv.mnIsUserCreated )
+        memset( &gpuEnv, 0, sizeof(gpuEnv) );
 
     gpuEnv.mnFileCount = 0; //argc;
     gpuEnv.mnKernelCount = 0UL;
 
-    AddKernelConfig(0, (const char*) "hello");
-    AddKernelConfig(1, (const char*) "oclformula");
-    AddKernelConfig(2, (const char*) "oclFormulaMin");
-    AddKernelConfig(3, (const char*) "oclFormulaMax");
-    AddKernelConfig(4, (const char*) "oclFormulaSum");
-    AddKernelConfig(5, (const char*) "oclFormulaCount");
-    AddKernelConfig(6, (const char*) "oclFormulaAverage");
-    AddKernelConfig(7, (const char*) "oclFormulaSumproduct");
-    AddKernelConfig(8, (const char*) "oclFormulaMinverse");
-
-    AddKernelConfig(9,  (const char*) "oclSignedAdd");
-    AddKernelConfig(10, (const char*) "oclSignedSub");
-    AddKernelConfig(11, (const char*) "oclSignedMul");
-    AddKernelConfig(12, (const char*) "oclSignedDiv");
-    AddKernelConfig(13, (const char*) "oclAverageDelta");
-    AddKernelConfig(14, (const char*) "OclMaxDelta");
-    AddKernelConfig(15, (const char*) "OclMinDelta");
+    AddKernelConfig( 1, (const char*) "oclFormulaMin" );
+    AddKernelConfig( 2, (const char*) "oclFormulaMax" );
+    AddKernelConfig( 3, (const char*) "oclFormulaSum" );
+    AddKernelConfig( 4, (const char*) "oclFormulaCount" );
+    AddKernelConfig( 5, (const char*) "oclFormulaAverage" );
+    AddKernelConfig( 6, (const char*) "oclFormulaSumproduct" );
+    AddKernelConfig( 7, (const char*) "oclFormulaMinverse" );
+
+    AddKernelConfig( 8, (const char*) "oclSignedAdd" );
+    AddKernelConfig( 9, (const char*) "oclSignedSub" );
+    AddKernelConfig( 10, (const char*) "oclSignedMul" );
+    AddKernelConfig( 11, (const char*) "oclSignedDiv" );
+    AddKernelConfig( 12, (const char*) "oclAverageDelta" );
+    AddKernelConfig( 13, (const char*) "oclMaxDelta" );
+    AddKernelConfig( 14, (const char*) "oclMinDelta" );
 
     return 0;
 }
@@ -122,28 +133,36 @@ OpenclDevice::~OpenclDevice()
     //ReleaseOpenclRunEnv();
 }
 
-int OpenclDevice::SetKernelEnv(KernelEnv *envInfo)
+int OpenclDevice::SetKernelEnv( KernelEnv *envInfo )
 {
-    envInfo->mpkContext  = gpuEnv.mpContext;
+    envInfo->mpkContext = gpuEnv.mpContext;
     envInfo->mpkCmdQueue = gpuEnv.mpCmdQueue;
-    envInfo->mpkProgram  = gpuEnv.mpArryPrograms[0];
+    envInfo->mpkProgram = gpuEnv.mpArryPrograms[0];
 
     return 1;
 }
 
-int OpenclDevice::CheckKernelName(KernelEnv *envInfo,const char *kernelName)
+int OpenclDevice::CheckKernelName( KernelEnv *envInfo, const char *kernelName )
 {
     //printf("CheckKernelName,total count of kernels...%d\n", gpuEnv.kernelCount);
     int kCount;
-    for(kCount=0; kCount < gpuEnv.mnKernelCount; kCount++) {
-        if(strcasecmp(kernelName, gpuEnv.mArrykernelNames[kCount]) == 0) {
+    int nFlag = 0;
+    for ( kCount=0; kCount < gpuEnv.mnKernelCount; kCount++ )
+    {
+        if ( strcasecmp( kernelName, gpuEnv.mArrykernelNames[kCount]) == 0 )
+        {
+            nFlag = 1;
             printf("match %s kernel right\n",kernelName);
             break;
         }
     }
+    if ( !nFlag )
+    {
+        printf("can't find kernel: %s\n",kernelName);
+    }
     envInfo->mpkKernel = gpuEnv.mpArryKernels[kCount];
-    strcpy(envInfo->mckKernelName, kernelName);
-    if (envInfo == (KernelEnv *) NULL)
+    strcpy( envInfo->mckKernelName, kernelName );
+    if ( envInfo == (KernelEnv *) NULL )
     {
         printf("get err func and env\n");
         return 0;
@@ -151,33 +170,36 @@ int OpenclDevice::CheckKernelName(KernelEnv *envInfo,const char *kernelName)
     return 1;
 }
 
-int OpenclDevice::ConvertToString(const char *filename, char **source)
+int OpenclDevice::ConvertToString( const char *filename, char **source )
 {
     int file_size;
     size_t result;
     FILE *file = NULL;
     file_size = 0;
     result = 0;
-    file = fopen(filename, "rb+");
+    file = fopen( filename, "rb+" );
     printf("open kernel file %s.\n",filename);
 
-    if (file != NULL) {
+    if ( file != NULL )
+    {
         printf("Open ok!\n");
-        fseek(file, 0, SEEK_END);
+        fseek( file, 0, SEEK_END );
 
-        file_size = ftell(file);
-        rewind(file);
-        *source = (char*) malloc(file_size + 1);
-        if (*source == (char*) NULL) {
+        file_size = ftell( file );
+        rewind( file );
+        *source = (char*) malloc( sizeof(char) * file_size + 1 );
+        if ( *source == (char*) NULL )
+        {
             return 0;
         }
         result = fread(*source, 1, file_size, file);
-        if (result != (size_t) file_size) {
-            free(*source);
+        if ( result != (size_t) file_size )
+        {
+            free( *source );
             return 0;
         }
         (*source)[file_size] = '\0';
-        fclose(file);
+        fclose( file );
 
         return 1;
     }
@@ -185,123 +207,134 @@ int OpenclDevice::ConvertToString(const char *filename, char **source)
     return 0;
 }
 
-int OpenclDevice::BinaryGenerated(const char * clFileName, FILE ** fhandle)
+int OpenclDevice::BinaryGenerated( const char * clFileName, FILE ** fhandle )
 {
     unsigned int i = 0;
-    cl_int status;
+    cl_int clStatus;
+    int status = 0;
     char *str = NULL;
     FILE *fd = NULL;
     cl_uint numDevices=0;
-    status = clGetDeviceIDs(gpuEnv.mpPlatformID, // platform
-                            CL_DEVICE_TYPE_ALL, // device_type
-                            0, // num_entries
-                            NULL, // devices ID
-                            &numDevices);
-    for (i = 0; i <numDevices; i++) {
+    clStatus = clGetDeviceIDs(gpuEnv.mpPlatformID, // platform
+                              CL_DEVICE_TYPE_GPU, // device_type
+                              0, // num_entries
+                              NULL, // devices ID
+                              &numDevices);
+    for ( i = 0; i < numDevices; i++ )
+    {
         char fileName[256] = { 0 }, cl_name[128] = { 0 };
-        if (gpuEnv.mpArryDevsID[i] != 0) {
+        if ( gpuEnv.mpArryDevsID[i] != 0 )
+        {
             char deviceName[1024];
-            status = clGetDeviceInfo(gpuEnv.mpArryDevsID[i], CL_DEVICE_NAME,sizeof(deviceName), deviceName, NULL);
-            CHECK_OPENCL(status);
-            str = (char*) strstr(clFileName, (char*) ".cl");
-            memcpy(cl_name, clFileName, str - clFileName);
+            clStatus = clGetDeviceInfo( gpuEnv.mpArryDevsID[i], CL_DEVICE_NAME, sizeof(deviceName), deviceName, NULL );
+            CHECK_OPENCL( clStatus, "clGetDeviceInfo" );
+            str = (char*) strstr( clFileName, (char*) ".cl" );
+            memcpy( cl_name, clFileName, str - clFileName );
             cl_name[str - clFileName] = '\0';
-            sprintf(fileName, "./%s-%s.bin", cl_name, deviceName);
-            fd = fopen(fileName, "rb");
-            status = (fd != NULL) ? 1 : 0;
-            }
+            sprintf( fileName, "./%s-%s.bin", cl_name, deviceName );
+            fd = fopen( fileName, "rb" );
+            status = ( fd != NULL ) ? 1 : 0;
         }
-        if (fd != NULL) {
-            *fhandle = fd;
-            }
-
-        return status;
+    }
+    if ( fd != NULL )
+    {
+        *fhandle = fd;
+    }
+    return status;
 
 }
 
-int OpenclDevice::WriteBinaryToFile(const char* fileName, const char* birary,
-        size_t numBytes)
+int OpenclDevice::WriteBinaryToFile( const char* fileName, const char* birary, size_t numBytes )
 {
     FILE *output = NULL;
-    output = fopen(fileName, "wb");
-    if (output == NULL) {
+    output = fopen( fileName, "wb" );
+    if ( output == NULL )
+    {
         return 0;
     }
 
-    fwrite(birary, 1, numBytes, output);
-    fclose(output);
+    fwrite( birary, sizeof(char), numBytes, output );
+    fclose( output );
 
     return 1;
 
 }
 
-int OpenclDevice::GeneratBinFromKernelSource(cl_program program,
-                                             const char * clFileName)
+int OpenclDevice::GeneratBinFromKernelSource( cl_program program, const char * clFileName )
 {
-     unsigned int i = 0;
-    cl_int status;
-    size_t *binarySizes;
-    cl_uint numDevices;
+    unsigned int i = 0;
+    cl_int clStatus;
+    size_t *binarySizes, numDevices;
     cl_device_id *mpArryDevsID;
     char **binaries, *str = NULL;
 
-    status = clGetProgramInfo(program, CL_PROGRAM_NUM_DEVICES,
-            sizeof(numDevices), &numDevices, NULL);
-    CHECK_OPENCL(status)
+    clStatus = clGetProgramInfo( program, CL_PROGRAM_NUM_DEVICES,
+                   sizeof(numDevices), &numDevices, NULL );
+    CHECK_OPENCL( clStatus, "clGetProgramInfo" );
 
-    mpArryDevsID = (cl_device_id*) malloc(sizeof(cl_device_id) * numDevices);
-    if (mpArryDevsID == NULL) {
+    mpArryDevsID = (cl_device_id*) malloc( sizeof(cl_device_id) * numDevices );
+    if ( mpArryDevsID == NULL )
+    {
         return 0;
     }
     /* grab the handles to all of the devices in the program. */
-    status = clGetProgramInfo(program, CL_PROGRAM_DEVICES,
-            sizeof(cl_device_id) * numDevices, mpArryDevsID, NULL);
-    CHECK_OPENCL(status)
+    clStatus = clGetProgramInfo( program, CL_PROGRAM_DEVICES,
+                   sizeof(cl_device_id) * numDevices, mpArryDevsID, NULL );
+    CHECK_OPENCL( clStatus, "clGetProgramInfo" );
 
     /* figure out the sizes of each of the binaries. */
-    binarySizes = (size_t*) malloc(sizeof(size_t) * numDevices);
+    binarySizes = (size_t*) malloc( sizeof(size_t) * numDevices );
 
-    status = clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES,
-            sizeof(size_t) * numDevices, binarySizes, NULL);
-    CHECK_OPENCL(status)
+    clStatus = clGetProgramInfo( program, CL_PROGRAM_BINARY_SIZES,
+                   sizeof(size_t) * numDevices, binarySizes, NULL );
+    CHECK_OPENCL( clStatus, "clGetProgramInfo" );
 
     /* copy over all of the generated binaries. */
-    binaries = (char**) malloc(sizeof(char *) * numDevices);
-    if (binaries == NULL) {
+    binaries = (char**) malloc( sizeof(char *) * numDevices );
+    if ( binaries == NULL )
+    {
         return 0;
     }
 
-    for (i = 0; i < numDevices; i++) {
-        if (binarySizes[i] != 0) {
-            binaries[i] = (char*) malloc(binarySizes[i]);
-            if (binaries[i] == NULL) {
+    for ( i = 0; i < numDevices; i++ )
+    {
+        if ( binarySizes[i] != 0 )
+        {
+            binaries[i] = (char*) malloc( sizeof(char) * binarySizes[i] );
+            if ( binaries[i] == NULL )
+            {
                 return 0;
             }
-        } else {
+        }
+        else
+        {
             binaries[i] = NULL;
         }
     }
 
-    status = clGetProgramInfo(program, CL_PROGRAM_BINARIES,
-            sizeof(char *) * numDevices, binaries, NULL);
-    CHECK_OPENCL(status)
+    clStatus = clGetProgramInfo( program, CL_PROGRAM_BINARIES,
+                   sizeof(char *) * numDevices, binaries, NULL );
+    CHECK_OPENCL(clStatus,"clGetProgramInfo");
 
     /* dump out each binary into its own separate file. */
-    for (i = 0; i < numDevices; i++) {
+    for ( i = 0; i < numDevices; i++ )
+    {
         char fileName[256] = { 0 }, cl_name[128] = { 0 };
 
-        if (binarySizes[i] != 0) {
+        if ( binarySizes[i] != 0 )
+        {
             char deviceName[1024];
-            status = clGetDeviceInfo(mpArryDevsID[i], CL_DEVICE_NAME,
-                    sizeof(deviceName), deviceName, NULL);
-            CHECK_OPENCL(status)
+            clStatus = clGetDeviceInfo(mpArryDevsID[i], CL_DEVICE_NAME,
+                           sizeof(deviceName), deviceName, NULL);
+            CHECK_OPENCL( clStatus, "clGetDeviceInfo" );
 
-            str = (char*) strstr(clFileName, (char*) ".cl");
-            memcpy(cl_name, clFileName, str - clFileName);
+            str = (char*) strstr( clFileName, (char*) ".cl" );
+            memcpy( cl_name, clFileName, str - clFileName );
             cl_name[str - clFileName] = '\0';
-            sprintf(fileName, "./%s-%s.bin", cl_name, deviceName);
+            sprintf( fileName, "./%s-%s.bin", cl_name, deviceName );
 
-            if (!WriteBinaryToFile(fileName, binaries[i], binarySizes[i])) {
+            if ( !WriteBinaryToFile( fileName, binaries[i], binarySizes[i] ) )
+            {
                 printf("opencl-wrapper: write binary[%s] failds\n", fileName);
                 return 0;
             } //else
@@ -310,110 +343,121 @@ int OpenclDevice::GeneratBinFromKernelSource(cl_program program,
     }
 
     // Release all resouces and memory
-    for (i = 0; i < numDevices; i++) {
-        if (binaries[i] != NULL) {
-            free(binaries[i]);
+    for ( i = 0; i < numDevices; i++ )
+    {
+        if ( binaries[i] != NULL )
+        {
+            free( binaries[i] );
             binaries[i] = NULL;
         }
     }
 
-    if (binaries != NULL) {
-        free(binaries);
+    if ( binaries != NULL )
+    {
+        free( binaries );
         binaries = NULL;
     }
 
-    if (binarySizes != NULL) {
-        free(binarySizes);
+    if ( binarySizes != NULL )
+    {
+        free( binarySizes );
         binarySizes = NULL;
     }
 
-    if (mpArryDevsID != NULL) {
-        free(mpArryDevsID);
+    if ( mpArryDevsID != NULL )
+    {
+        free( mpArryDevsID );
         mpArryDevsID = NULL;
     }
     return 1;
 }
 
-int OpenclDevice::InitOpenclAttr(OpenCLEnv * env)
+int OpenclDevice::InitOpenclAttr( OpenCLEnv * env )
 {
-    if (gpuEnv.mnIsUserCreated)
+    if ( gpuEnv.mnIsUserCreated )
         return 1;
 
-    gpuEnv.mpContext    = env->mpOclContext;
+    gpuEnv.mpContext = env->mpOclContext;
     gpuEnv.mpPlatformID = env->mpOclPlatformID;
-    gpuEnv.mpDevID        = env->mpOclDevsID;
-    gpuEnv.mpCmdQueue    = env->mpOclCmdQueue;
+    gpuEnv.mpDevID = env->mpOclDevsID;
+    gpuEnv.mpCmdQueue = env->mpOclCmdQueue;
 
     gpuEnv.mnIsUserCreated = 1;
 
     return 0;
 }
 
-int OpenclDevice::CreateKernel(char * kernelname, KernelEnv * env)
+int OpenclDevice::CreateKernel( char * kernelname, KernelEnv * env )
 {
-    int status;
+    int clStatus;
 
-    env->mpkKernel   = clCreateKernel(gpuEnv.mpArryPrograms[0], kernelname, &status);
-    env->mpkContext  = gpuEnv.mpContext;
+    env->mpkKernel = clCreateKernel( gpuEnv.mpArryPrograms[0], kernelname, &clStatus );
+    env->mpkContext = gpuEnv.mpContext;
     env->mpkCmdQueue = gpuEnv.mpCmdQueue;
-    return status != CL_SUCCESS ? 1 : 0;
+    return clStatus != CL_SUCCESS ? 1 : 0;
 }
 
-int OpenclDevice::ReleaseKernel(KernelEnv * env)
+int OpenclDevice::ReleaseKernel( KernelEnv * env )
 {
-    int status = clReleaseKernel(env->mpkKernel);
-    return status != CL_SUCCESS ? 1 : 0;
+    int clStatus = clReleaseKernel( env->mpkKernel );
+    return clStatus != CL_SUCCESS ? 1 : 0;
 }
 
-int OpenclDevice::ReleaseOpenclEnv(GPUEnv *gpuInfo)
+int OpenclDevice::ReleaseOpenclEnv( GPUEnv *gpuInfo )
 {
     int i = 0;
-    int status = 0;
+    int clStatus = 0;
 
-    if (!isInited) {
+    if ( !isInited )
+    {
         return 1;
     }
 
-    for (i = 0; i < gpuEnv.mnFileCount; i++) {
-        if (gpuEnv.mpArryPrograms[i]) {
-            status = clReleaseProgram(gpuEnv.mpArryPrograms[i]);
-            CHECK_OPENCL(status)
+    for ( i = 0; i < gpuEnv.mnFileCount; i++ )
+    {
+        if ( gpuEnv.mpArryPrograms[i] )
+        {
+            clStatus = clReleaseProgram( gpuEnv.mpArryPrograms[i] );
+            CHECK_OPENCL( clStatus, "clReleaseProgram" );
             gpuEnv.mpArryPrograms[i] = NULL;
         }
     }
-    if (gpuEnv.mpCmdQueue) {
-        clReleaseCommandQueue(gpuEnv.mpCmdQueue);
+    if ( gpuEnv.mpCmdQueue )
+    {
+        clReleaseCommandQueue( gpuEnv.mpCmdQueue );
         gpuEnv.mpCmdQueue = NULL;
     }
-    if (gpuEnv.mpContext) {
-        clReleaseContext(gpuEnv.mpContext);
+    if ( gpuEnv.mpContext )
+    {
+        clReleaseContext( gpuEnv.mpContext );
         gpuEnv.mpContext = NULL;
     }
     isInited = 0;
     gpuInfo->mnIsUserCreated = 0;
-    free(gpuInfo->mpArryDevsID);
+    free( gpuInfo->mpArryDevsID );
     return 1;
 }
 
-int OpenclDevice::RunKernelWrapper(cl_kernel_function function,
-                                   const char * kernelName, void **usrdata)
+int OpenclDevice::RunKernelWrapper( cl_kernel_function function, const char * kernelName, void **usrdata )
 {
     printf("oclwrapper:RunKernel_wrapper...\n");
-    if (RegisterKernelWrapper(kernelName, function) != 1) {
-        fprintf(stderr,
-                "Error:RunKernel_wrapper:RegisterKernelWrapper fail!\n");
+    if ( RegisterKernelWrapper( kernelName, function ) != 1 )
+    {
+        fprintf(stderr, "Error:RunKernel_wrapper:RegisterKernelWrapper fail!\n");
         return -1;
     }
-    return (RunKernel(kernelName, usrdata));
+    return ( RunKernel( kernelName, usrdata ) );
 }
 
-int OpenclDevice::CachedOfKernerPrg(const GPUEnv *gpuEnvCached,
-                                    const char * clFileName)
+int OpenclDevice::CachedOfKernerPrg( const GPUEnv *gpuEnvCached, const char * clFileName )
 {
     int i;
-    for (i = 0; i < gpuEnvCached->mnFileCount; i++) {
-        if (strcasecmp(gpuEnvCached->mArryKnelSrcFile[i], clFileName) == 0) {
-            if (gpuEnvCached->mpArryPrograms[i] != NULL) {
+    for ( i = 0; i < gpuEnvCached->mnFileCount; i++ )
+    {
+        if ( strcasecmp( gpuEnvCached->mArryKnelSrcFile[i], clFileName ) == 0 )
+        {
+            if ( gpuEnvCached->mpArryPrograms[i] != NULL )
+            {
                 return 1;
             }
         }
@@ -422,8 +466,9 @@ int OpenclDevice::CachedOfKernerPrg(const GPUEnv *gpuEnvCached,
     return 0;
 }
 
-int OpenclDevice::CompileKernelFile(GPUEnv *gpuInfo, const char *buildOption) {
-    cl_int status;
+int OpenclDevice::CompileKernelFile( GPUEnv *gpuInfo, const char *buildOption )
+{
+    cl_int clStatus;
     size_t length;
     char *buildLog = NULL, *binary;
     const char *source;
@@ -434,7 +479,8 @@ int OpenclDevice::CompileKernelFile(GPUEnv *gpuInfo, const char *buildOption) {
     FILE *fd, *fd1;
     const char* filename = "kernel.cl";
     fprintf(stderr, "CompileKernelFile ... \n");
-    if (CachedOfKernerPrg(gpuInfo, filename) == 1) {
+    if ( CachedOfKernerPrg(gpuInfo, filename) == 1 )
+    {
         return 1;
     }
 
@@ -442,133 +488,156 @@ int OpenclDevice::CompileKernelFile(GPUEnv *gpuInfo, const char *buildOption) {
 
     source = kernel_src;
 
-    source_size[0] = strlen(source);
+    source_size[0] = strlen( source );
     binaryExisted = 0;
-    if ((binaryExisted = BinaryGenerated(filename, &fd)) == 1) {
-        status = clGetContextInfo(gpuInfo->mpContext, CL_CONTEXT_NUM_DEVICES,
-                sizeof(numDevices), &numDevices, NULL);
-        CHECK_OPENCL(status)
+    if ( ( binaryExisted = BinaryGenerated( filename, &fd ) ) == 1 )
+    {
+        clStatus = clGetContextInfo( gpuInfo->mpContext, CL_CONTEXT_NUM_DEVICES,
+                       sizeof(numDevices), &numDevices, NULL );
+        CHECK_OPENCL( clStatus, "clGetContextInfo" );
 
-        mpArryDevsID = (cl_device_id*) malloc(sizeof(cl_device_id) * numDevices);
-        if (mpArryDevsID == NULL) {
+        mpArryDevsID = (cl_device_id*) malloc( sizeof(cl_device_id) * numDevices );
+        if ( mpArryDevsID == NULL )
+        {
             return 0;
         }
 
         b_error = 0;
         length = 0;
-        b_error |= fseek(fd, 0, SEEK_END) < 0;
-        b_error |= (length = ftell(fd)) <= 0;
-        b_error |= fseek(fd, 0, SEEK_SET) < 0;
-        if (b_error) {
+        b_error |= fseek( fd, 0, SEEK_END ) < 0;
+        b_error |= ( length = ftell(fd) ) <= 0;
+        b_error |= fseek( fd, 0, SEEK_SET ) < 0;
+        if ( b_error )
+        {
             return 0;
         }
 
-        binary = (char*) malloc(length);
-        if (!binary) {
+        binary = (char*) malloc( length + 2 );
+        if ( !binary )
+        {
             return 0;
         }
 
-        memset(binary, 0, length);
-        b_error |= fread(binary, 1, length, fd) != length;
+        memset( binary, 0, length + 2 );
+        b_error |= fread( binary, 1, length, fd ) != length;
+        if ( binary[length - 1] != '\n' )
+        {
+            binary[length++] = '\n';
+        }
 
-        fclose(fd);
+        fclose( fd );
         fd = NULL;
         // grab the handles to all of the devices in the context.
-        status = clGetContextInfo(gpuInfo->mpContext, CL_CONTEXT_DEVICES,
-                sizeof(cl_device_id) * numDevices, mpArryDevsID, NULL);
-        CHECK_OPENCL(status)
+        clStatus = clGetContextInfo( gpuInfo->mpContext, CL_CONTEXT_DEVICES,
+                       sizeof( cl_device_id ) * numDevices, mpArryDevsID, NULL );
+        CHECK_OPENCL( clStatus, "clGetContextInfo" );
 
         fprintf(stderr, "Create kernel from binary\n");
-        gpuInfo->mpArryPrograms[idx] = clCreateProgramWithBinary(gpuInfo->mpContext,
-                numDevices, mpArryDevsID, &length, (const unsigned char**) &binary,
-                &binary_status, &status);
-        CHECK_OPENCL(status)
+        gpuInfo->mpArryPrograms[idx] = clCreateProgramWithBinary( gpuInfo->mpContext,numDevices,
+                                           mpArryDevsID, &length, (const unsigned char**) &binary,
+                                           &binary_status, &clStatus );
+        CHECK_OPENCL( clStatus, "clCreateProgramWithBinary" );
 
-        free(binary);
-        free(mpArryDevsID);
+        free( binary );
+        free( mpArryDevsID );
         mpArryDevsID = NULL;
-    } else {
+    }
+    else
+    {
         // create a CL program using the kernel source
         fprintf(stderr, "Create kernel from source\n");
-        gpuEnv.mpArryPrograms[idx] = clCreateProgramWithSource(gpuEnv.mpContext,
-                1, &source, source_size, &status);
-        CHECK_OPENCL(status);
+        gpuEnv.mpArryPrograms[idx] = clCreateProgramWithSource( gpuEnv.mpContext, 1, &source,
+                                         source_size, &clStatus);
+        CHECK_OPENCL( clStatus, "clCreateProgramWithSource" );
     }
 
-    if (gpuInfo->mpArryPrograms[idx] == (cl_program) NULL) {
+    if ( gpuInfo->mpArryPrograms[idx] == (cl_program) NULL )
+    {
         return 0;
     }
 
     //char options[512];
     // create a cl program executable for all the devices specified
     printf("BuildProgram.\n");
-    if (!gpuInfo->mnIsUserCreated) {
-        status = clBuildProgram(gpuInfo->mpArryPrograms[idx], 1, gpuInfo->mpArryDevsID,
-                                buildOption, NULL, NULL);
-    } else {
-        status = clBuildProgram(gpuInfo->mpArryPrograms[idx], 1, &(gpuInfo->mpDevID),
-                                buildOption, NULL, NULL);
+    if (!gpuInfo->mnIsUserCreated)
+    {
+        clStatus = clBuildProgram(gpuInfo->mpArryPrograms[idx], 1, gpuInfo->mpArryDevsID,
+                       buildOption, NULL, NULL);
+    }
+    else
+    {
+        clStatus = clBuildProgram(gpuInfo->mpArryPrograms[idx], 1, &(gpuInfo->mpDevID),
+                       buildOption, NULL, NULL);
     }
 
-    if (status != CL_SUCCESS) {
+    if ( clStatus != CL_SUCCESS )
+    {
         printf ("BuildProgram error!\n");
-        if (!gpuInfo->mnIsUserCreated) {
-            status = clGetProgramBuildInfo(gpuInfo->mpArryPrograms[idx],
-                                           gpuInfo->mpArryDevsID[0], CL_PROGRAM_BUILD_LOG, 0, NULL,
-                                           &length);
-        } else {
-            status = clGetProgramBuildInfo(gpuInfo->mpArryPrograms[idx],
-                                           gpuInfo->mpDevID, CL_PROGRAM_BUILD_LOG, 0, NULL, &length);
+        if ( !gpuInfo->mnIsUserCreated )
+        {
+            clStatus = clGetProgramBuildInfo( gpuInfo->mpArryPrograms[idx], gpuInfo->mpArryDevsID[0],
+                           CL_PROGRAM_BUILD_LOG, 0, NULL, &length );
+        }
+        else
+        {
+            clStatus = clGetProgramBuildInfo( gpuInfo->mpArryPrograms[idx], gpuInfo->mpDevID,
+                           CL_PROGRAM_BUILD_LOG, 0, NULL, &length);
         }
-        if (status != CL_SUCCESS) {
+        if ( clStatus != CL_SUCCESS )
+        {
             printf("opencl create build log fail\n");
             return 0;
         }
-        buildLog = (char*) malloc(length);
-        if (buildLog == (char*) NULL) {
+        buildLog = (char*) malloc( length );
+        if ( buildLog == (char*) NULL )
+        {
             return 0;
         }
-        if (!gpuInfo->mnIsUserCreated) {
-            status = clGetProgramBuildInfo(gpuInfo->mpArryPrograms[idx],
-                    gpuInfo->mpArryDevsID[0], CL_PROGRAM_BUILD_LOG, length,
-                    buildLog, &length);
-        } else {
-            status = clGetProgramBuildInfo(gpuInfo->mpArryPrograms[idx],
-                    gpuInfo->mpDevID, CL_PROGRAM_BUILD_LOG, length, buildLog,
-                    &length);
+        if ( !gpuInfo->mnIsUserCreated )
+        {
+            clStatus = clGetProgramBuildInfo( gpuInfo->mpArryPrograms[idx], gpuInfo->mpArryDevsID[0],
+                           CL_PROGRAM_BUILD_LOG, length, buildLog, &length );
         }
-        if (status != CL_SUCCESS) {
+        else
+        {
+            clStatus = clGetProgramBuildInfo( gpuInfo->mpArryPrograms[idx], gpuInfo->mpDevID,
+                           CL_PROGRAM_BUILD_LOG, length, buildLog, &length );
+        }
+        if ( clStatus != CL_SUCCESS )
+        {
             printf("opencl program build info fail\n");
             return 0;
         }
 
-        fd1 = fopen("kernel-build.log", "w+");
-        if (fd1 != NULL) {
-            fwrite(buildLog, 1, length, fd1);
-            fclose(fd1);
+        fd1 = fopen( "kernel-build.log", "w+" );
+        if ( fd1 != NULL )
+        {
+            fwrite( buildLog, sizeof(char), length, fd1 );
+            fclose( fd1 );
         }
 
-        free(buildLog);
+        free( buildLog );
         return 0;
     }
 
-    strcpy(gpuEnv.mArryKnelSrcFile[idx], filename);
+    strcpy( gpuEnv.mArryKnelSrcFile[idx], filename );
 
-    if (binaryExisted == 0)
-        GeneratBinFromKernelSource(gpuEnv.mpArryPrograms[idx], filename);
+    if ( binaryExisted == 0 )
+        GeneratBinFromKernelSource( gpuEnv.mpArryPrograms[idx], filename );
 
     gpuInfo->mnFileCount += 1;
 
     return 1;
-
-
 }
-int OpenclDevice::GetKernelEnvAndFunc(const char *kernelName,
-        KernelEnv *env, cl_kernel_function *function) {
-    int i; //,program_idx ;
+
+int OpenclDevice::GetKernelEnvAndFunc( const char *kernelName, KernelEnv *env, cl_kernel_function *function)
+{
+    int i;
     //printf("----------------OpenclDevice::GetKernelEnvAndFunc\n");
-    for (i = 0; i < gpuEnv.mnKernelCount; i++) {
-        if (strcasecmp(kernelName, gpuEnv.mArrykernelNames[i]) == 0) {
+    for ( i = 0; i < gpuEnv.mnKernelCount; i++ )
+    {
+        if ( strcasecmp( kernelName, gpuEnv.mArrykernelNames[i]) == 0 )
+        {
             env->mpkContext = gpuEnv.mpContext;
             env->mpkCmdQueue = gpuEnv.mpCmdQueue;
             env->mpkProgram = gpuEnv.mpArryPrograms[0];
@@ -580,48 +649,68 @@ int OpenclDevice::GetKernelEnvAndFunc(const char *kernelName,
     return 0;
 }
 
-int OpenclDevice::RunKernel(const char *kernelName, void **userdata) {
+int OpenclDevice::RunKernel( const char *kernelName, void **userdata)
+{
     KernelEnv kEnv;
-
     cl_kernel_function function;
-
     int status;
 
-    memset(&kEnv, 0, sizeof(KernelEnv));
-    status = GetKernelEnvAndFunc(kernelName, &kEnv, &function);
-    strcpy(kEnv.mckKernelName, kernelName);
-    if (status == 1) {
-        if (&kEnv == (KernelEnv *) NULL
-                || &function == (cl_kernel_function *) NULL) {
+    memset( &kEnv, 0, sizeof( KernelEnv ) );
+    status = GetKernelEnvAndFunc( kernelName, &kEnv, &function );
+    strcpy( kEnv.mckKernelName, kernelName );
+    if ( status == 1 )
+    {
+        if ( &kEnv == (KernelEnv *) NULL || &function == (cl_kernel_function *) NULL)
             return 0;
-        }
-        return (function(userdata, &kEnv));
+        return ( function( userdata, &kEnv ) );
     }
     return 0;
 }
 
-int OpenclDevice::InitOpenclRunEnv(int argc, const char *buildOptionKernelfiles)
+int OpenclDevice::InitOpenclRunEnv( int argc )
 {
     int status = 0;
-    if (MAX_CLKERNEL_NUM <= 0) {
+    if ( MAX_CLKERNEL_NUM <= 0 )
+    {
         return 1;
     }
-    if ((argc > MAX_CLFILE_NUM) || (argc < 0)) {
+    if ( ( argc > MAX_CLFILE_NUM ) || ( argc < 0 ) )
         return 1;
-    }
 
-    if (!isInited) {
+    if ( !isInited )
+    {
         RegistOpenclKernel();
         //initialize devices, context, comand_queue
-        status = InitOpenclRunEnv(&gpuEnv);
-        if (status) {
+        status = InitOpenclRunEnv( &gpuEnv );
+        if ( status )
+        {
             printf("init_opencl_env failed.\n");
             return 1;
         }
         printf("init_opencl_env successed.\n");
         //initialize program, kernelName, kernelCount
-        status = CompileKernelFile( &gpuEnv, buildOptionKernelfiles);
-        if (status == 0 || gpuEnv.mnKernelCount == 0) {
+        if( getenv( "SC_FLOAT" ) )
+        {
+            gpuEnv.mnKhrFp64Flag = 0;
+            gpuEnv.mnAmdFp64Flag = 0;
+        }
+        if( gpuEnv.mnKhrFp64Flag )
+        {
+            printf("----use khr double type in kernel----\n");
+            status = CompileKernelFile( &gpuEnv, "-D KHR_DP_EXTENSION -Dfp_t=double" );
+        }
+        else if( gpuEnv.mnAmdFp64Flag )
+        {
+            printf("----use amd double type in kernel----\n");
+            status = CompileKernelFile( &gpuEnv, "-D AMD_DP_EXTENSION -Dfp_t=double" );
+        }
+        else
+        {
+            printf("----use float type in kernel----\n");
+            status = CompileKernelFile( &gpuEnv, "-Dfp_t=float" );
+        }
+        if ( status == 0 || gpuEnv.mnKernelCount == 0 )
+        {
             printf("CompileKernelFile failed.\n");
             return 1;
         }
@@ -631,10 +720,10 @@ int OpenclDevice::InitOpenclRunEnv(int argc, const char *buildOptionKernelfiles)
     return 0;
 }
 
-int OpenclDevice::InitOpenclRunEnv(GPUEnv *gpuInfo)
+int OpenclDevice::InitOpenclRunEnv( GPUEnv *gpuInfo )
 {
     size_t length;
-    cl_int status;
+    cl_int clStatus;
     cl_uint numPlatforms, numDevices;
     cl_platform_id *platforms;
     cl_context_properties cps[3];
@@ -643,30 +732,36 @@ int OpenclDevice::InitOpenclRunEnv(GPUEnv *gpuInfo)
 
     // Have a look at the available platforms.
 
-    if (!gpuInfo->mnIsUserCreated) {
-        status = clGetPlatformIDs(0, NULL, &numPlatforms);
-        if (status != CL_SUCCESS) {
+    if ( !gpuInfo->mnIsUserCreated )
+    {
+        clStatus = clGetPlatformIDs( 0, NULL, &numPlatforms );
+        if ( clStatus != CL_SUCCESS )
+        {
             return 1;
         }
         gpuInfo->mpPlatformID = NULL;
 
-        if (0 < numPlatforms) {
-            platforms = (cl_platform_id*) malloc(
-                    numPlatforms * sizeof(cl_platform_id));
-            if (platforms == (cl_platform_id*) NULL) {
+        if ( 0 < numPlatforms )
+        {
+            platforms = (cl_platform_id*) malloc( numPlatforms * sizeof( cl_platform_id ) );
+            if ( platforms == (cl_platform_id*) NULL )
+            {
                 return 1;
             }
-            status = clGetPlatformIDs(numPlatforms, platforms, NULL);
+            clStatus = clGetPlatformIDs( numPlatforms, platforms, NULL );
 
-            if (status != CL_SUCCESS) {
+            if ( clStatus != CL_SUCCESS )
+            {
                 return 1;
             }
 
-            for (i = 0; i < numPlatforms; i++) {
-                status = clGetPlatformInfo(platforms[i], CL_PLATFORM_VENDOR,
-                        sizeof(platformName), platformName, NULL);
+            for ( i = 0; i < numPlatforms; i++ )
+            {
+                clStatus = clGetPlatformInfo( platforms[i], CL_PLATFORM_VENDOR,
+                    sizeof( platformName ), platformName, NULL );
 
-                if (status != CL_SUCCESS) {
+                if ( clStatus != CL_SUCCESS )
+                {
                     return 1;
                 }
                 gpuInfo->mpPlatformID = platforms[i];
@@ -676,96 +771,105 @@ int OpenclDevice::InitOpenclRunEnv(GPUEnv *gpuInfo)
                 {
                     gpuInfo->mpPlatformID = platforms[i];
 
-                    status = clGetDeviceIDs(gpuInfo->mpPlatformID, // platform
-                                            CL_DEVICE_TYPE_ALL,    // device_type
-                                            0,                       // num_entries
-                                            NULL,                   // devices
-                                            &numDevices);
+                    clStatus = clGetDeviceIDs(gpuInfo->mpPlatformID, // platform
+                                              CL_DEVICE_TYPE_GPU,    // device_type
+                                              0,                     // num_entries
+                                              NULL,                  // devices
+                                              &numDevices);
 
-                    if (status != CL_SUCCESS) {
+                    if ( clStatus != CL_SUCCESS )
                         continue;
-                    }
 
-                    if (numDevices) {
+                    if ( numDevices )
                         break;
-                    }
                 }
             }
-            if(status!=CL_SUCCESS)
+            if ( clStatus != CL_SUCCESS )
                 return 1;
-            free(platforms);
+            free( platforms );
         }
-        if (NULL == gpuInfo->mpPlatformID) {
+        if ( NULL == gpuInfo->mpPlatformID )
             return 1;
-        }
 
         // Use available platform.
-
         cps[0] = CL_CONTEXT_PLATFORM;
         cps[1] = (cl_context_properties) gpuInfo->mpPlatformID;
         cps[2] = 0;
         // Check for GPU.
         gpuInfo->mDevType = CL_DEVICE_TYPE_GPU;
-        gpuInfo->mpContext = clCreateContextFromType(cps, gpuInfo->mDevType, NULL,
-                NULL, &status);
+        gpuInfo->mpContext = clCreateContextFromType( cps, gpuInfo->mDevType, NULL, NULL, &clStatus );
 
-        // If no GPU, check for CPU.
-        if ((gpuInfo->mpContext == (cl_context) NULL)
-                || (status != CL_SUCCESS)) {
+        if ( ( gpuInfo->mpContext == (cl_context) NULL) || ( clStatus != CL_SUCCESS ) )
+        {
             gpuInfo->mDevType = CL_DEVICE_TYPE_CPU;
-            gpuInfo->mpContext = clCreateContextFromType(cps, gpuInfo->mDevType,
-                    NULL, NULL, &status);
+            gpuInfo->mpContext = clCreateContextFromType( cps, gpuInfo->mDevType, NULL, NULL, &clStatus );
         }
-
-        // If no GPU or CPU, check for a "default" type.
-        if ((gpuInfo->mpContext == (cl_context) NULL)
-                || (status != CL_SUCCESS)) {
+        if ( ( gpuInfo->mpContext == (cl_context) NULL) || ( clStatus != CL_SUCCESS ) )
+        {
             gpuInfo->mDevType = CL_DEVICE_TYPE_DEFAULT;
-            gpuInfo->mpContext = clCreateContextFromType(cps, gpuInfo->mDevType,
-                    NULL, NULL, &status);
+            gpuInfo->mpContext = clCreateContextFromType( cps, gpuInfo->mDevType, NULL, NULL, &clStatus );
         }
-        if ((gpuInfo->mpContext == (cl_context) NULL)
-                || (status != CL_SUCCESS)) {
+        if ( ( gpuInfo->mpContext == (cl_context) NULL) || ( clStatus != CL_SUCCESS ) )
             return 1;
-        }
         // Detect OpenCL devices.
         // First, get the size of device list data
-        status = clGetContextInfo(gpuInfo->mpContext, CL_CONTEXT_DEVICES, 0,
-                NULL, &length);
-        if ((status != CL_SUCCESS) || (length == 0)) {
+        clStatus = clGetContextInfo( gpuInfo->mpContext, CL_CONTEXT_DEVICES, 0, NULL, &length );
+        if ( ( clStatus != CL_SUCCESS ) || ( length == 0 ) )
             return 1;
-        }
         // Now allocate memory for device list based on the size we got earlier
-        gpuInfo->mpArryDevsID = (cl_device_id*) malloc(length);
-        if (gpuInfo->mpArryDevsID == (cl_device_id*) NULL) {
+        gpuInfo->mpArryDevsID = (cl_device_id*) malloc( length );
+        if ( gpuInfo->mpArryDevsID == (cl_device_id*) NULL )
             return 1;
-        }
         // Now, get the device list data
-        status = clGetContextInfo(gpuInfo->mpContext, CL_CONTEXT_DEVICES, length,
-                gpuInfo->mpArryDevsID, NULL);
-        if (status != CL_SUCCESS) {
+        clStatus = clGetContextInfo( gpuInfo->mpContext, CL_CONTEXT_DEVICES, length,
+                       gpuInfo->mpArryDevsID, NULL );
+        if ( clStatus != CL_SUCCESS )
             return 1;
-        }
 
         // Create OpenCL command queue.
-        gpuInfo->mpCmdQueue = clCreateCommandQueue(gpuInfo->mpContext,
-                gpuInfo->mpArryDevsID[0], 0, &status);
+        gpuInfo->mpCmdQueue = clCreateCommandQueue( gpuInfo->mpContext, gpuInfo->mpArryDevsID[0], 0, &clStatus );
 
-        if (status != CL_SUCCESS) {
+        if ( clStatus != CL_SUCCESS )
             return 1;
-        }
     }
 
-    return 0;
+    clStatus = clGetCommandQueueInfo( gpuInfo->mpCmdQueue, CL_QUEUE_THREAD_HANDLE_AMD, 0, NULL, NULL );
+    // Check device extensions for double type
+    size_t aDevExtInfoSize = 0;
+
+    clStatus = clGetDeviceInfo( gpuInfo->mpArryDevsID[0], CL_DEVICE_EXTENSIONS, 0, NULL, &aDevExtInfoSize );
+    CHECK_OPENCL( clStatus, "clGetDeviceInfo" );
+
+    char *aExtInfo = new char[aDevExtInfoSize];
+
+    clStatus = clGetDeviceInfo( gpuInfo->mpArryDevsID[0], CL_DEVICE_EXTENSIONS,
+                   sizeof(char) * aDevExtInfoSize, aExtInfo, NULL);
+    CHECK_OPENCL( clStatus, "clGetDeviceInfo" );
+
+    gpuInfo->mnKhrFp64Flag = 0;
+    gpuInfo->mnAmdFp64Flag = 0;
 
+    if ( strstr( aExtInfo, "cl_khr_fp64" ) )
+    {
+        gpuInfo->mnKhrFp64Flag = 1;
+    }
+    else
+    {
+        // Check if cl_amd_fp64 extension is supported
+        if ( strstr( aExtInfo, "cl_amd_fp64" ) )
+            gpuInfo->mnAmdFp64Flag = 1;
+    }
+    delete []aExtInfo;
+
+    return 0;
 }
-int OpenclDevice::RegisterKernelWrapper(const char *kernelName,cl_kernel_function function)
+int OpenclDevice::RegisterKernelWrapper( const char *kernelName, cl_kernel_function function )
 {
     int i;
     //printf("oclwrapper:RegisterKernelWrapper...%d\n", gpuEnv.mnKernelCount);
-    for (i = 0; i < gpuEnv.mnKernelCount; i++)
+    for ( i = 0; i < gpuEnv.mnKernelCount; i++ )
     {
-        if (strcasecmp(kernelName, gpuEnv.mArrykernelNames[i]) == 0)
+        if ( strcasecmp( kernelName, gpuEnv.mArrykernelNames[i]) == 0 )
         {
             gpuEnv.mpArryKnelFuncs[i] = function;
             return 1;
@@ -774,190 +878,22 @@ int OpenclDevice::RegisterKernelWrapper(const char *kernelName,cl_kernel_functio
     return 0;
 }
 
-
-void OpenclDevice::SetOpenclState(int state)
+void OpenclDevice::SetOpenclState( int state )
 {
-     //printf("OpenclDevice::setOpenclState...\n");
-     isInited = state;
+    //printf("OpenclDevice::setOpenclState...\n");
+    isInited = state;
 }
 
 int OpenclDevice::GetOpenclState()
 {
     return isInited;
 }
-//ocldbg
-
-int OclFormulax(void ** usrdata, KernelEnv *env) {
-    fprintf(stderr, "In OpenclDevice,...Formula_proc\n");
-    cl_int clStatus;
-    int status;
-    size_t global_work_size[1];
-    float tdata[NUM];
-
-    double *data = (double *) usrdata[0];
-    const formulax type = *((const formulax *) usrdata[1]);
-    double ret = 0.0;
-
-    for (int i = 0; i < NUM; i++) {
-        tdata[i] = (float) data[i];
-    }
-
-    env->mpkKernel = clCreateKernel(env->mpkProgram, "oclformula", &clStatus);
-    //printf("ScInterpreter::IterateParameters...after clCreateKernel.\n");
-    //fprintf(stderr, "\nIn OpenclDevice,...after clCreateKernel\n");
-    int size = NUM;
-
-    cl_mem formula_data = clCreateBuffer(env->mpkContext,
-            (cl_mem_flags) (CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR),
-            size * sizeof(float), (void *) tdata, &clStatus);
-    //fprintf(stderr, "\nIn OpenclDevice,...after clCreateBuffer\n");
-
-    status = clSetKernelArg(env->mpkKernel, 0, sizeof(cl_mem),
-            (void *) &formula_data);
-    CHECK_OPENCL(status)
-    status = clSetKernelArg(env->mpkKernel, 1, sizeof(unsigned int),
-            (void *) &type);
-    CHECK_OPENCL(status)
-
-    global_work_size[0] = size;
-    //fprintf(stderr, "\nIn OpenclDevice,...after global_work_size\n");
-    //PPAStartCpuEvent(ppa_proc);
-
-    while (global_work_size[0] != 1) {
-        global_work_size[0] = global_work_size[0] / 2;
-        status = clEnqueueNDRangeKernel(env->mpkCmdQueue, env->mpkKernel, 1,
-                NULL, global_work_size, NULL, 0, NULL, NULL);
-        CHECK_OPENCL(status)
-
-    }
-    //fprintf(stderr, "\nIn OpenclDevice,...before clEnqueueReadBuffer\n");
-    status = clEnqueueReadBuffer(env->mpkCmdQueue, formula_data, CL_FALSE, 0,
-            sizeof(float), (void *) &tdata, 0, NULL, NULL);
-    CHECK_OPENCL(status)
-    status = clFinish(env->mpkCmdQueue);
-    CHECK_OPENCL(status)
-
-    //PPAStopCpuEvent(ppa_proc);
-    //fprintf(stderr, "\nIn OpenclDevice,...before clReleaseKernel\n");
-    status = clReleaseKernel(env->mpkKernel);
-    CHECK_OPENCL(status)
-    status = clReleaseMemObject(formula_data);
-    CHECK_OPENCL(status)
-
-    if (type == AVG)
-        ret = (double) tdata[0] / NUM;
-    else
-        ret = (double) tdata[0];
-
-    printf("size = %d ret = %f.\n\n", NUM, ret);
-
-    return 0;
-}
-
-int OclFormulaxDll(void ** usrdata, KernelEnv *env) {
-
-    fprintf(stderr, "In OclFormulaxDll...\n");
-    cl_int clStatus;
-    int status;
-    size_t global_work_size[1];
-    float tdata[NUM];
-
-    double *data = (double *) usrdata[0];
-    const formulax type = *((const formulax *) usrdata[1]);
-    double ret = 0.0;
-
-    for (int i = 0; i < NUM; i++) {
-        tdata[i] = (float) data[i];
-    }
-
-    env->mpkKernel = clCreateKernel(env->mpkProgram, "oclformula", &clStatus);
-    //printf("ScInterpreter::IterateParameters...after clCreateKernel.\n");
-    //fprintf(stderr, "\nIn OpenclDevice,...after clCreateKernel\n");
-    int size = NUM;
-
-    cl_mem formula_data = clCreateBuffer(env->mpkContext,
-            (cl_mem_flags) (CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR),
-            size * sizeof(float), (void *) tdata, &clStatus);
-    //fprintf(stderr, "\nIn OpenclDevice,...after clCreateBuffer\n");
-
-    status = clSetKernelArg(env->mpkKernel, 0, sizeof(cl_mem),
-            (void *) &formula_data);
-    CHECK_OPENCL(status)
-    status = clSetKernelArg(env->mpkKernel, 1, sizeof(unsigned int),
-            (void *) &type);
-    CHECK_OPENCL(status)
-
-    global_work_size[0] = size;
-    //fprintf(stderr, "\nIn OpenclDevice,...after global_work_size\n");
-    //PPAStartCpuEvent(ppa_proc);
-
-    while (global_work_size[0] != 1) {
-        global_work_size[0] = global_work_size[0] / 2;
-        status = clEnqueueNDRangeKernel(env->mpkCmdQueue, env->mpkKernel, 1,
-                NULL, global_work_size, NULL, 0, NULL, NULL);
-        CHECK_OPENCL(status)
-
-    }
-    //fprintf(stderr, "\nIn OpenclDevice,...before clEnqueueReadBuffer\n");
-    status = clEnqueueReadBuffer(env->mpkCmdQueue, formula_data, CL_FALSE, 0,
-            sizeof(float), (void *) &tdata, 0, NULL, NULL);
-    CHECK_OPENCL(status)
-    status = clFinish(env->mpkCmdQueue);
-    CHECK_OPENCL(status)
-
-    //PPAStopCpuEvent(ppa_proc);
-    //fprintf(stderr, "\nIn OpenclDevice,...before clReleaseKernel\n");
-    status = clReleaseKernel(env->mpkKernel);
-    CHECK_OPENCL(status)
-    status = clReleaseMemObject(formula_data);
-    CHECK_OPENCL(status)
-
-    if (type == AVG)
-        ret = (double) tdata[0] / NUM;
-    else
-        ret = (double) tdata[0];
-
-    printf("OclFormulaxDllxx:size = %d ret = %f.\n\n", NUM, ret);
-
-    return 0;
-}
-
-double OclCalc::OclProcess(cl_kernel_function function, double *data, formulax type)
-{
-    fprintf(stderr, "\n OpenclDevice, proc...begin\n");
-    double ret = 0;
-    void *usrdata[2];
-    usrdata[0] = (void *) data;
-    usrdata[1] = (void *) &type;
-    RunKernelWrapper(function, "oclformula", usrdata);
-    return ret;
-}
-
-double OclCalc::OclTest() {
-    double data[NUM];
-
-    for (int i = 0; i < NUM; i++) {
-        data[i] = sc::rng::uniform();
-        fprintf(stderr, "%f\t", data[i]);
-    }
-    OclProcess(&OclFormulax, data, AVG);
-    return 0.0;
-}
-
-double OclCalc::OclTestDll() {
-    double data[NUM];
-
-    for (int i = 0; i < NUM; i++) {
-        data[i] = sc::rng::uniform();
-        fprintf(stderr, "%f\t", data[i]);
-    }
-    OclProcess(&OclFormulaxDll, data, AVG);
-    return 0.0;
-}
 
 OclCalc::OclCalc()
 {
     fprintf(stderr,"OclCalc:: init opencl ...\n");
+    nFormulaColSize = 0;
+    nFormulaRowSize = 0;
 }
 
 OclCalc::~OclCalc()
@@ -966,1168 +902,686 @@ OclCalc::~OclCalc()
 }
 
 /////////////////////////////////////////////////////////////////////////////
-#ifdef GPU_64BITS
-int OclCalc::OclHostFormulaMax(double *srcData,int *start,int *end,double *output,int size)
+int OclCalc::CreateBuffer64Bits( double *&dpSrcData, uint *&npStartPos, uint *&npEndPos, int nBufferSize )
 {
-    KernelEnv kEnv;
-    const char *kernelName = "oclFormulaMax";
-    CheckKernelName(&kEnv,kernelName);
-    cl_int clStatus;
-    size_t global_work_size[1];
-    int alignSize = size + end[0]-start[0];
-
-    kEnv.mpkKernel = clCreateKernel(kEnv.mpkProgram,kernelName, &clStatus);
-    cl_int ret=0;
-    cl_mem inputCl = clCreateBuffer(kEnv.mpkContext,(cl_mem_flags) (CL_MEM_READ_WRITE),
-        alignSize * sizeof(float), NULL, &clStatus);
-    cl_mem startCl = clCreateBuffer(kEnv.mpkContext, (cl_mem_flags) (CL_MEM_READ_WRITE),
-        size * sizeof(unsigned int), NULL, &ret);
-    cl_mem endCl = clCreateBuffer(kEnv.mpkContext, (cl_mem_flags) (CL_MEM_READ_WRITE),
-        size * sizeof(unsigned int), NULL, &ret);
-    cl_mem outputCl = clCreateBuffer(kEnv.mpkContext, (cl_mem_flags) (CL_MEM_READ_WRITE),
-    size* sizeof(float), NULL, &ret);
-
-    float * hostMapSrc = (float *)clEnqueueMapBuffer(kEnv.mpkCmdQueue,inputCl,CL_TRUE,CL_MAP_WRITE,0,alignSize * sizeof(float),0,NULL,NULL,NULL);
-    int * hostMapStart = (int *)clEnqueueMapBuffer(kEnv.mpkCmdQueue,startCl,CL_TRUE,CL_MAP_WRITE,0,size * sizeof(unsigned int),0,NULL,NULL,NULL);
-    int * hostMapEnd   = (int *)clEnqueueMapBuffer(kEnv.mpkCmdQueue,endCl,CL_TRUE,CL_MAP_WRITE,0,size * sizeof(unsigned int),0,NULL,NULL,NULL);
-    for(int i=0;i<size;i++)
-    {
-        hostMapStart[i] = start[i];
-        hostMapEnd[i]    = end[i];
-    }
-    for(int i=0;i<alignSize;i++)
-        hostMapSrc[i] = (float)srcData[i];
-    clEnqueueUnmapMemObject(kEnv.mpkCmdQueue,inputCl,hostMapSrc,0,NULL,NULL);
-    clEnqueueUnmapMemObject(kEnv.mpkCmdQueue,startCl,hostMapStart,0,NULL,NULL);
-    clEnqueueUnmapMemObject(kEnv.mpkCmdQueue,endCl,hostMapEnd,0,NULL,NULL);
-
-    clStatus = clSetKernelArg(kEnv.mpkKernel, 0, sizeof(cl_mem),
-        (void *)&inputCl);
-    clStatus = clSetKernelArg(kEnv.mpkKernel, 1, sizeof(cl_mem),
-        (void *)&startCl);
-    clStatus = clSetKernelArg(kEnv.mpkKernel, 2, sizeof(cl_mem),
-        (void *)&endCl);
-    clStatus = clSetKernelArg(kEnv.mpkKernel, 3, sizeof(cl_mem),
-        (void *)&outputCl);
-    CHECK_OPENCL(clStatus);
-
-    global_work_size[0] = size;
-    clStatus = clEnqueueNDRangeKernel(kEnv.mpkCmdQueue, kEnv.mpkKernel, 1,
-        NULL, global_work_size, NULL, 0, NULL, NULL);
-    CHECK_OPENCL(clStatus);
-
-    float * outPutMap = (float *)clEnqueueMapBuffer(kEnv.mpkCmdQueue,outputCl,CL_TRUE,CL_MAP_READ,0,size*sizeof(float),0,NULL,NULL,NULL);
-    for(int i=0;i<size;i++)
-        output[i]=outPutMap[i];
-
-    clEnqueueUnmapMemObject(kEnv.mpkCmdQueue,outputCl,outPutMap,0,NULL,NULL);
-    clStatus = clFinish(kEnv.mpkCmdQueue);
-    CHECK_OPENCL(clStatus);
-    clStatus = clReleaseKernel(kEnv.mpkKernel);
-    CHECK_OPENCL(clStatus);
-    clStatus = clReleaseMemObject(inputCl);
-    CHECK_OPENCL(clStatus);
-    clStatus = clReleaseMemObject(startCl);
-    CHECK_OPENCL(clStatus);
-    clStatus = clReleaseMemObject(endCl);
-    CHECK_OPENCL(clStatus);
-    clStatus = clReleaseMemObject(outputCl);
-    CHECK_OPENCL(clStatus);
+    cl_int clStatus = 0;
+    SetKernelEnv( &kEnv );
+
+    mpClmemSrcData = clCreateBuffer( kEnv.mpkContext, (cl_mem_flags) (CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR),
+                            nBufferSize * sizeof(double), NULL, &clStatus );
+    CHECK_OPENCL( clStatus, "clCreateBuffer" );
+    mpClmemStartPos = clCreateBuffer( kEnv.mpkContext, (cl_mem_flags) (CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR),
+                             nBufferSize * sizeof(unsigned int), NULL, &clStatus );
+    CHECK_OPENCL( clStatus, "clCreateBuffer" );
+    mpClmemEndPos = clCreateBuffer( kEnv.mpkContext, (cl_mem_flags) (CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR),
+                             nBufferSize * sizeof(unsigned int), NULL, &clStatus );
+    CHECK_OPENCL( clStatus, "clCreateBuffer" );
+
+    dpSrcData = (double *)clEnqueueMapBuffer( kEnv.mpkCmdQueue, mpClmemSrcData, CL_TRUE, CL_MAP_WRITE_INVALIDATE_REGION, 0,
+                               nBufferSize * sizeof(double), 0, NULL, NULL, &clStatus );
+    CHECK_OPENCL(clStatus,"clEnqueueMapBuffer");
+    clFinish(kEnv.mpkCmdQueue);
+    npStartPos = (uint *)clEnqueueMapBuffer( kEnv.mpkCmdQueue, mpClmemStartPos, CL_TRUE, CL_MAP_WRITE_INVALIDATE_REGION, 0,
+                             nBufferSize * sizeof(uint), 0, NULL, NULL, &clStatus );
+    CHECK_OPENCL( clStatus, "clEnqueueMapBuffer" );
+    clFinish( kEnv.mpkCmdQueue );
+    npEndPos = (uint *)clEnqueueMapBuffer( kEnv.mpkCmdQueue, mpClmemEndPos, CL_TRUE, CL_MAP_WRITE_INVALIDATE_REGION, 0,
+                             nBufferSize * sizeof(uint), 0, NULL, NULL, &clStatus );
+    CHECK_OPENCL( clStatus,"clEnqueueMapBuffer" );
+    clFinish( kEnv.mpkCmdQueue );
+    //printf("In CreateBuffer, pptrr is %d,%d,%d\n",fpSrcData,npStartPos,npEndPos);
     return 0;
 }
 
-int OclCalc::OclHostFormulaMin(double *srcData,int *start,int *end,double *output,int size)
+int OclCalc::CreateBuffer64Bits( double *&dpLeftData, double *&dpRightData, int nBufferSize )
 {
-    KernelEnv kEnv;
-    const char *kernelName = "oclFormulaMin";
-    CheckKernelName(&kEnv,kernelName);
-    cl_int clStatus;
-    size_t global_work_size[1];
-    int alignSize = size + end[0]-start[0];
-
-    kEnv.mpkKernel = clCreateKernel(kEnv.mpkProgram,kernelName, &clStatus);
-    cl_int ret=0;
-    cl_mem inputCl = clCreateBuffer(kEnv.mpkContext,(cl_mem_flags) (CL_MEM_READ_WRITE),
-        alignSize * sizeof(float), NULL, &clStatus);
-    cl_mem startCl = clCreateBuffer(kEnv.mpkContext, (cl_mem_flags) (CL_MEM_READ_WRITE),
-        size * sizeof(unsigned int), NULL, &ret);
-    cl_mem endCl = clCreateBuffer(kEnv.mpkContext, (cl_mem_flags) (CL_MEM_READ_WRITE),
-        size * sizeof(unsigned int), NULL, &ret);
-    cl_mem outputCl = clCreateBuffer(kEnv.mpkContext, (cl_mem_flags) (CL_MEM_READ_WRITE),
-    size* sizeof(float), NULL, &ret);
-
-    float * hostMapSrc = (float *)clEnqueueMapBuffer(kEnv.mpkCmdQueue,inputCl,CL_TRUE,CL_MAP_WRITE,0,alignSize * sizeof(float),0,NULL,NULL,NULL);
-    int * hostMapStart = (int *)clEnqueueMapBuffer(kEnv.mpkCmdQueue,startCl,CL_TRUE,CL_MAP_WRITE,0,size * sizeof(unsigned int),0,NULL,NULL,NULL);
-    int * hostMapEnd   = (int *)clEnqueueMapBuffer(kEnv.mpkCmdQueue,endCl,CL_TRUE,CL_MAP_WRITE,0,size * sizeof(unsigned int),0,NULL,NULL,NULL);
-    for(int i=0;i<size;i++)
-    {
-        hostMapStart[i] = start[i];
-        hostMapEnd[i]    = end[i];
-    }
-    for(int i=0;i<alignSize;i++)
-        hostMapSrc[i] = (float)srcData[i];
-    clEnqueueUnmapMemObject(kEnv.mpkCmdQueue,inputCl,hostMapSrc,0,NULL,NULL);
-    clEnqueueUnmapMemObject(kEnv.mpkCmdQueue,startCl,hostMapStart,0,NULL,NULL);
-    clEnqueueUnmapMemObject(kEnv.mpkCmdQueue,endCl,hostMapEnd,0,NULL,NULL);
-
-    clStatus = clSetKernelArg(kEnv.mpkKernel, 0, sizeof(cl_mem),
-        (void *)&inputCl);
-    clStatus = clSetKernelArg(kEnv.mpkKernel, 1, sizeof(cl_mem),
-        (void *)&startCl);
-    clStatus = clSetKernelArg(kEnv.mpkKernel, 2, sizeof(cl_mem),
-        (void *)&endCl);
-    clStatus = clSetKernelArg(kEnv.mpkKernel, 3, sizeof(cl_mem),
-        (void *)&outputCl);
-    CHECK_OPENCL(clStatus);
-
-    global_work_size[0] = size;
-    clStatus = clEnqueueNDRangeKernel(kEnv.mpkCmdQueue, kEnv.mpkKernel, 1,
-        NULL, global_work_size, NULL, 0, NULL, NULL);
-    CHECK_OPENCL(clStatus);
-
-    float * outPutMap = (float *)clEnqueueMapBuffer(kEnv.mpkCmdQueue,outputCl,CL_TRUE,CL_MAP_READ,0,size*sizeof(float),0,NULL,NULL,NULL);
-    for(int i=0;i<size;i++)
-        output[i]=outPutMap[i];
-
-    clEnqueueUnmapMemObject(kEnv.mpkCmdQueue,outputCl,outPutMap,0,NULL,NULL);
-    clStatus = clFinish(kEnv.mpkCmdQueue);
-    CHECK_OPENCL(clStatus);
-    clStatus = clReleaseKernel(kEnv.mpkKernel);
-    CHECK_OPENCL(clStatus);
-    clStatus = clReleaseMemObject(inputCl);
-    CHECK_OPENCL(clStatus);
-    clStatus = clReleaseMemObject(startCl);
-    CHECK_OPENCL(clStatus);
-    clStatus = clReleaseMemObject(endCl);
-    CHECK_OPENCL(clStatus);
-    clStatus = clReleaseMemObject(outputCl);
-    CHECK_OPENCL(clStatus);
-    return 0;
-}
-int OclCalc::OclHostFormulaAverage(double *srcData,int *start,int *end,double *output,int size)
-{
-    KernelEnv kEnv;
-    const char *kernelName = "oclFormulaAverage";
-    CheckKernelName(&kEnv,kernelName);
-    cl_int clStatus;
-    size_t global_work_size[1];
-    int alignSize = size + end[0]-start[0];
-
-    kEnv.mpkKernel = clCreateKernel(kEnv.mpkProgram,kernelName, &clStatus);
-    cl_int ret=0;
-    cl_mem inputCl    = clCreateBuffer(kEnv.mpkContext, (cl_mem_flags) (CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR),
-                                     alignSize * sizeof(float), NULL, &clStatus);
-    cl_mem startCl    = clCreateBuffer(kEnv.mpkContext, (cl_mem_flags) (CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR),
-                                     size * sizeof(unsigned int), NULL, &ret);
-    cl_mem endCl    = clCreateBuffer(kEnv.mpkContext, (cl_mem_flags) (CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR),
-                                     size * sizeof(unsigned int), NULL, &ret);
-    cl_mem outputCl = clCreateBuffer(kEnv.mpkContext, (cl_mem_flags) (CL_MEM_READ_WRITE),
-                                     size* sizeof(float), NULL, &ret);
-
-    float * hostMapSrc = (float *)clEnqueueMapBuffer(kEnv.mpkCmdQueue,inputCl,CL_TRUE,OPENCLWRAPPER_CL_MAP_WRITE_FLAG,0,alignSize * sizeof(float),0,NULL,NULL,NULL);
-    int * hostMapStart = (int *)clEnqueueMapBuffer(kEnv.mpkCmdQueue,startCl,CL_TRUE,OPENCLWRAPPER_CL_MAP_WRITE_FLAG,0,size * sizeof(unsigned int),0,NULL,NULL,NULL);
-    int * hostMapEnd   = (int *)clEnqueueMapBuffer(kEnv.mpkCmdQueue,endCl,CL_TRUE,OPENCLWRAPPER_CL_MAP_WRITE_FLAG,0,size * sizeof(unsigned int),0,NULL,NULL,NULL);
-#if 1
-    for(int i=0;i<size;i++)
-    {
-        hostMapStart[i] = start[i];
-        hostMapEnd[i]    = end[i];
-    }
-    for(int i=0;i<alignSize;i++)
-        hostMapSrc[i] = (float)srcData[i];
-    //memcpy(hostMapSrc,srcData,alignSize * sizeof(float));
-#endif
-    for(sal_Int32 i = 0; i < alignSize; ++i){//dbg
-                       fprintf(stderr,"In avg host,hostMapSrc[%d] is ...%f\n",i,hostMapSrc[i]);
-                   }
-
-
-    clEnqueueUnmapMemObject(kEnv.mpkCmdQueue,inputCl,hostMapSrc,  0,NULL,NULL);
-    clEnqueueUnmapMemObject(kEnv.mpkCmdQueue,startCl,hostMapStart,0,NULL,NULL);
-    clEnqueueUnmapMemObject(kEnv.mpkCmdQueue,endCl,  hostMapEnd,  0,NULL,NULL);
-
-
-
-    clStatus = clSetKernelArg(kEnv.mpkKernel, 0, sizeof(cl_mem),
-        (void *)&inputCl);
-    clStatus = clSetKernelArg(kEnv.mpkKernel, 1, sizeof(cl_mem),
-        (void *)&startCl);
-    clStatus = clSetKernelArg(kEnv.mpkKernel, 2, sizeof(cl_mem),
-        (void *)&endCl);
-    clStatus = clSetKernelArg(kEnv.mpkKernel, 3, sizeof(cl_mem),
-        (void *)&outputCl);
-    CHECK_OPENCL(clStatus);
-
-    global_work_size[0] = size;
-    clStatus = clEnqueueNDRangeKernel(kEnv.mpkCmdQueue, kEnv.mpkKernel, 1,
-        NULL, global_work_size, NULL, 0, NULL, NULL);
-    CHECK_OPENCL(clStatus);
-
-    float * outPutMap = (float *)clEnqueueMapBuffer(kEnv.mpkCmdQueue,outputCl,CL_TRUE,CL_MAP_READ,0,size*sizeof(float),0,NULL,NULL,NULL);
-
-    for(int i=0;i<size;i++){
-       //fprintf(stderr,"In avg host,outPutMap[%d] is ...%f\n",i,outPutMap[i]);
-        output[i]=outPutMap[i];
-    }
-
-    clEnqueueUnmapMemObject(kEnv.mpkCmdQueue,outputCl,outPutMap,0,NULL,NULL);
-    clStatus = clFinish(kEnv.mpkCmdQueue);
-
-    CHECK_OPENCL(clStatus);
-    clStatus = clReleaseKernel(kEnv.mpkKernel);
-    CHECK_OPENCL(clStatus);
-    clStatus = clReleaseMemObject(inputCl);
-    CHECK_OPENCL(clStatus);
-    clStatus = clReleaseMemObject(startCl);
-    CHECK_OPENCL(clStatus);
-    clStatus = clReleaseMemObject(endCl);
-    CHECK_OPENCL(clStatus);
-    clStatus = clReleaseMemObject(outputCl);
-    CHECK_OPENCL(clStatus);
-    return 0;
-
-
-}
-
-int OclCalc::OclHostSignedAdd(double *lData,double *rData,double *rResult,int dSize) {
-
-    KernelEnv kEnv;
-    int status;
-    const char *kernelName = "oclSignedAdd";
-    CheckKernelName(&kEnv,kernelName);
-
-
-    cl_int clStatus;
-    size_t global_work_size[1];
-
-    kEnv.mpkKernel = clCreateKernel(kEnv.mpkProgram,kernelName, &clStatus);
-    cl_mem clLiftData = clCreateBuffer(kEnv.mpkContext,
-        (cl_mem_flags) (CL_MEM_READ_WRITE),
-        dSize * sizeof(float), NULL, &clStatus);
-    cl_mem clRightData = clCreateBuffer(kEnv.mpkContext,
-        (cl_mem_flags) (CL_MEM_READ_WRITE),
-        dSize * sizeof(float), NULL, &clStatus);
-    cl_mem clResult = clCreateBuffer(kEnv.mpkContext,
-        (cl_mem_flags) (CL_MEM_READ_WRITE),
-        dSize * sizeof(float), NULL, &clStatus);
-
-    float * hostMapLeftData     = (float *)clEnqueueMapBuffer(kEnv.mpkCmdQueue,clLiftData,CL_TRUE,CL_MAP_WRITE,0,dSize*sizeof(float),0,NULL,NULL,NULL);
-    float * hostMapRightData    = (float *)clEnqueueMapBuffer(kEnv.mpkCmdQueue,clRightData,CL_TRUE,CL_MAP_WRITE,0,dSize*sizeof(float),0,NULL,NULL,NULL);
-    for(int i=0;i<dSize;i++)
-    {
-        hostMapLeftData[i]    = (float)lData[i];
-        hostMapRightData[i] = (float)rData[i];
-    }
-    clEnqueueUnmapMemObject(kEnv.mpkCmdQueue,clLiftData,hostMapLeftData,0,NULL,NULL);
-    clEnqueueUnmapMemObject(kEnv.mpkCmdQueue,clRightData,hostMapRightData,0,NULL,NULL);
-
-    status = clSetKernelArg(kEnv.mpkKernel, 0, sizeof(cl_mem),
-        (void *)&clLiftData);
-    status = clSetKernelArg(kEnv.mpkKernel, 1, sizeof(cl_mem),
-        (void *)&clRightData);
-    status = clSetKernelArg(kEnv.mpkKernel, 2, sizeof(cl_mem),
-        (void *)&clResult);
-    CHECK_OPENCL(status)
-    global_work_size[0] = dSize;
-    status = clEnqueueNDRangeKernel(kEnv.mpkCmdQueue, kEnv.mpkKernel, 1,
-        NULL, global_work_size, NULL, 0, NULL, NULL);
-    CHECK_OPENCL(status);
-
-    float * hostMapResult = (float *)clEnqueueMapBuffer(kEnv.mpkCmdQueue,clResult,CL_TRUE,CL_MAP_READ,0,dSize*sizeof(float),0,NULL,NULL,NULL);
-    for(int i=0;i<dSize;i++)
-        rResult[i]=hostMapResult[i];
-    clEnqueueUnmapMemObject(kEnv.mpkCmdQueue,clResult,hostMapResult,0,NULL,NULL);
-
-    CHECK_OPENCL(status);
-    status = clFinish(kEnv.mpkCmdQueue);
-    CHECK_OPENCL(status);
-    status = clReleaseKernel(kEnv.mpkKernel);
-    CHECK_OPENCL(status);
-    status = clReleaseMemObject(clLiftData);
-    CHECK_OPENCL(status);
-    status = clReleaseMemObject(clRightData);
-    CHECK_OPENCL(status);
-    status = clReleaseMemObject(clResult);
-    CHECK_OPENCL(status);
-    return 0;
-}
-
-int OclCalc::OclHostSignedSub(double *lData,double *rData,double *rResult,int dSize) {
-    KernelEnv kEnv;
-    int status;
-    const char *kernelName = "oclSignedSub";
-    CheckKernelName(&kEnv,kernelName);
-
-    cl_int clStatus;
-    size_t global_work_size[1];
-    kEnv.mpkKernel = clCreateKernel(kEnv.mpkProgram,kernelName, &clStatus);
-    cl_mem clLiftData = clCreateBuffer(kEnv.mpkContext,
-        (cl_mem_flags) (CL_MEM_READ_WRITE),
-        dSize * sizeof(float), NULL, &clStatus);
-    cl_mem clRightData = clCreateBuffer(kEnv.mpkContext,
-        (cl_mem_flags) (CL_MEM_READ_WRITE),
-        dSize * sizeof(float), NULL, &clStatus);
-    cl_mem clResult = clCreateBuffer(kEnv.mpkContext,
-        (cl_mem_flags) (CL_MEM_READ_WRITE),
-        dSize * sizeof(float), NULL, &clStatus);
-
-    float * hostMapLeftData     = (float *)clEnqueueMapBuffer(kEnv.mpkCmdQueue,clLiftData,CL_TRUE,CL_MAP_WRITE,0,dSize*sizeof(float),0,NULL,NULL,NULL);
-    float * hostMapRightData    = (float *)clEnqueueMapBuffer(kEnv.mpkCmdQueue,clRightData,CL_TRUE,CL_MAP_WRITE,0,dSize*sizeof(float),0,NULL,NULL,NULL);
-    for(int i=0;i<dSize;i++)
-    {
-        hostMapLeftData[i]    = (float)lData[i];
-        hostMapRightData[i] = (float)rData[i];
-    }
-    clEnqueueUnmapMemObject(kEnv.mpkCmdQueue,clLiftData,hostMapLeftData,0,NULL,NULL);
-    clEnqueueUnmapMemObject(kEnv.mpkCmdQueue,clRightData,hostMapRightData,0,NULL,NULL);
-
-    status = clSetKernelArg(kEnv.mpkKernel, 0, sizeof(cl_mem),
-        (void *)&clLiftData);
-    status = clSetKernelArg(kEnv.mpkKernel, 1, sizeof(cl_mem),
-        (void *)&clRightData);
-    status = clSetKernelArg(kEnv.mpkKernel, 2, sizeof(cl_mem),
-        (void *)&clResult);
-    CHECK_OPENCL(status)
-    global_work_size[0] = dSize;
-    status = clEnqueueNDRangeKernel(kEnv.mpkCmdQueue, kEnv.mpkKernel, 1,
-        NULL, global_work_size, NULL, 0, NULL, NULL);
-    CHECK_OPENCL(status);
-
-    float * hostMapResult = (float *)clEnqueueMapBuffer(kEnv.mpkCmdQueue,clResult,CL_TRUE,CL_MAP_READ,0,dSize*sizeof(float),0,NULL,NULL,NULL);
-    for(int i=0;i<dSize;i++)
-        rResult[i]=hostMapResult[i];
-    clEnqueueUnmapMemObject(kEnv.mpkCmdQueue,clResult,hostMapResult,0,NULL,NULL);
-
-    CHECK_OPENCL(status);
-    status = clFinish(kEnv.mpkCmdQueue);
-    CHECK_OPENCL(status);
-    status = clReleaseKernel(kEnv.mpkKernel);
-    CHECK_OPENCL(status);
-    status = clReleaseMemObject(clLiftData);
-    CHECK_OPENCL(status);
-    status = clReleaseMemObject(clRightData);
-    CHECK_OPENCL(status);
-    status = clReleaseMemObject(clResult);
-    CHECK_OPENCL(status);
-    return 0;
-}
-
-int OclCalc::OclHostSignedMul(double *lData,double *rData,double *rResult,int dSize) {
-    KernelEnv kEnv;
-    int status;
-    const char *kernelName = "oclSignedMul";
-    CheckKernelName(&kEnv,kernelName);
-
-
-    size_t global_work_size[1];
-    cl_int clStatus;
-    kEnv.mpkKernel = clCreateKernel(kEnv.mpkProgram, kernelName, &clStatus);
-    cl_mem clLiftData = clCreateBuffer(kEnv.mpkContext,
-        (cl_mem_flags) (CL_MEM_READ_WRITE),
-        dSize * sizeof(float), NULL, &clStatus);
-    cl_mem clRightData = clCreateBuffer(kEnv.mpkContext,
-        (cl_mem_flags) (CL_MEM_READ_WRITE),
-        dSize * sizeof(float), NULL, &clStatus);
-    cl_mem clResult = clCreateBuffer(kEnv.mpkContext,
-        (cl_mem_flags) (CL_MEM_READ_WRITE),
-        dSize * sizeof(float), NULL, &clStatus);
-
-    float * hostMapLeftData     = (float *)clEnqueueMapBuffer(kEnv.mpkCmdQueue,clLiftData,CL_TRUE,CL_MAP_WRITE,0,dSize*sizeof(float),0,NULL,NULL,NULL);
-    float * hostMapRightData    = (float *)clEnqueueMapBuffer(kEnv.mpkCmdQueue,clRightData,CL_TRUE,CL_MAP_WRITE,0,dSize*sizeof(float),0,NULL,NULL,NULL);
-    for(int i=0;i<dSize;i++)
-    {
-        hostMapLeftData[i]    = (float)lData[i];
-        hostMapRightData[i] = (float)rData[i];
-    }
-    clEnqueueUnmapMemObject(kEnv.mpkCmdQueue,clLiftData,hostMapLeftData,0,NULL,NULL);
-    clEnqueueUnmapMemObject(kEnv.mpkCmdQueue,clRightData,hostMapRightData,0,NULL,NULL);
-
-    status = clSetKernelArg(kEnv.mpkKernel, 0, sizeof(cl_mem),
-        (void *)&clLiftData);
-    status = clSetKernelArg(kEnv.mpkKernel, 1, sizeof(cl_mem),
-        (void *)&clRightData);
-    status = clSetKernelArg(kEnv.mpkKernel, 2, sizeof(cl_mem),
-        (void *)&clResult);
-    CHECK_OPENCL(status)
-    global_work_size[0] = dSize;
-    status = clEnqueueNDRangeKernel(kEnv.mpkCmdQueue, kEnv.mpkKernel, 1,
-        NULL, global_work_size, NULL, 0, NULL, NULL);
-    CHECK_OPENCL(status);
-
-    float * hostMapResult = (float *)clEnqueueMapBuffer(kEnv.mpkCmdQueue,clResult,CL_TRUE,CL_MAP_READ,0,dSize*sizeof(float),0,NULL,NULL,NULL);
-    for(int i=0;i<dSize;i++)
-        rResult[i]=hostMapResult[i];
-    clEnqueueUnmapMemObject(kEnv.mpkCmdQueue,clResult,hostMapResult,0,NULL,NULL);
-
-    CHECK_OPENCL(status);
-    status = clFinish(kEnv.mpkCmdQueue);
-    CHECK_OPENCL(status);
-    status = clReleaseKernel(kEnv.mpkKernel);
-    CHECK_OPENCL(status);
-    status = clReleaseMemObject(clLiftData);
-    CHECK_OPENCL(status);
-    status = clReleaseMemObject(clRightData);
-    CHECK_OPENCL(status);
-    status = clReleaseMemObject(clResult);
-    CHECK_OPENCL(status);
-    return 0;
-}
-
-int OclCalc::OclHostSignedDiv(double *lData,double *rData,double *rResult,int dSize) {
-    KernelEnv kEnv;
-    int status;
-    const char *kernelName = "oclSignedDiv";
-    CheckKernelName(&kEnv,kernelName);
-
-
-    cl_int clStatus;
-    size_t global_work_size[1];
-    kEnv.mpkKernel = clCreateKernel(kEnv.mpkProgram,kernelName, &clStatus);
-    cl_mem clLiftData = clCreateBuffer(kEnv.mpkContext,
-        (cl_mem_flags) (CL_MEM_READ_WRITE),
-        dSize * sizeof(float), NULL, &clStatus);
-    cl_mem clRightData = clCreateBuffer(kEnv.mpkContext,
-        (cl_mem_flags) (CL_MEM_READ_WRITE),
-        dSize * sizeof(float), NULL, &clStatus);
-    cl_mem clResult = clCreateBuffer(kEnv.mpkContext,
-        (cl_mem_flags) (CL_MEM_READ_WRITE),
-        dSize * sizeof(float), NULL, &clStatus);
-
-    float * hostMapLeftData     = (float *)clEnqueueMapBuffer(kEnv.mpkCmdQueue,clLiftData,CL_TRUE,CL_MAP_WRITE,0,dSize*sizeof(float),0,NULL,NULL,NULL);
-    float * hostMapRightData    = (float *)clEnqueueMapBuffer(kEnv.mpkCmdQueue,clRightData,CL_TRUE,CL_MAP_WRITE,0,dSize*sizeof(float),0,NULL,NULL,NULL);
-    for(int i=0;i<dSize;i++)
-    {
-        hostMapLeftData[i]    = (float)lData[i];
-        hostMapRightData[i] = (float)rData[i];
-    }
-    clEnqueueUnmapMemObject(kEnv.mpkCmdQueue,clLiftData,hostMapLeftData,0,NULL,NULL);
-    clEnqueueUnmapMemObject(kEnv.mpkCmdQueue,clRightData,hostMapRightData,0,NULL,NULL);
-
-    status = clSetKernelArg(kEnv.mpkKernel, 0, sizeof(cl_mem),
-        (void *)&clLiftData);
-    status = clSetKernelArg(kEnv.mpkKernel, 1, sizeof(cl_mem),
-        (void *)&clRightData);
-    status = clSetKernelArg(kEnv.mpkKernel, 2, sizeof(cl_mem),
-        (void *)&clResult);
-    CHECK_OPENCL(status);
-    global_work_size[0] = dSize;
-    status = clEnqueueNDRangeKernel(kEnv.mpkCmdQueue, kEnv.mpkKernel, 1,
-        NULL, global_work_size, NULL, 0, NULL, NULL);
-    CHECK_OPENCL(status);
-
-    float * hostMapResult = (float *)clEnqueueMapBuffer(kEnv.mpkCmdQueue,clResult,CL_TRUE,CL_MAP_READ,0,dSize*sizeof(float),0,NULL,NULL,NULL);
-    for(int i=0;i<dSize;i++)
-        rResult[i]=hostMapResult[i];
-    clEnqueueUnmapMemObject(kEnv.mpkCmdQueue,clResult,hostMapResult,0,NULL,NULL);
-
-    CHECK_OPENCL(status);
-    status = clFinish(kEnv.mpkCmdQueue);
-    CHECK_OPENCL(status);
-    status = clReleaseKernel(kEnv.mpkKernel);
-    CHECK_OPENCL(status);
-    status = clReleaseMemObject(clLiftData);
-    CHECK_OPENCL(status);
-    status = clReleaseMemObject(clRightData);
-    CHECK_OPENCL(status);
-    status = clReleaseMemObject(clResult);
-    CHECK_OPENCL(status);
+    cl_int clStatus = 0;
+    SetKernelEnv( &kEnv );
+
+    mpClmemLeftData = clCreateBuffer( kEnv.mpkContext, (cl_mem_flags) (CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR),
+                          nBufferSize * sizeof(double), NULL, &clStatus );
+    CHECK_OPENCL( clStatus, "clCreateBuffer" );
+    mpClmemRightData = clCreateBuffer( kEnv.mpkContext, (cl_mem_flags) (CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR),
+                           nBufferSize * sizeof(double), NULL, &clStatus );
+    CHECK_OPENCL( clStatus,"clCreateBuffer" );
+    dpLeftData = (double *)clEnqueueMapBuffer( kEnv.mpkCmdQueue,mpClmemLeftData,CL_TRUE,CL_MAP_WRITE_INVALIDATE_REGION,0,
+                               nBufferSize * sizeof(double),0,NULL,NULL,&clStatus );
+    CHECK_OPENCL( clStatus, "clCreateBuffer" );
+    clFinish(kEnv.mpkCmdQueue);
+    dpRightData = (double *)clEnqueueMapBuffer( kEnv.mpkCmdQueue,mpClmemRightData,CL_TRUE,CL_MAP_WRITE_INVALIDATE_REGION,0,
+                                nBufferSize * sizeof(double),0,NULL,NULL,&clStatus );
+    CHECK_OPENCL( clStatus, "clCreateBuffer" );
+    clFinish( kEnv.mpkCmdQueue );
+    //printf("In CreateBuffer, pptrr is %d,%d,%d\n",fpSrcData,npStartPos,npEndPos);
     return 0;
 }
-#endif // GPU_64BITS
-int OclCalc::CreateBuffer(float *&fpSrcData,uint *&npStartPos,uint *&npEndPos,int nBufferSize)
+int OclCalc::CreateBuffer64Bits( double *&dpSumProMergeLfData, double *&dpSumProMergeRtData, uint *&npSumSize, int nMatixSize, int nBufferSize )
 {
     cl_int clStatus = 0;
-    SetKernelEnv(&kEnv);
-
-    mpClmemSrcData    = clCreateBuffer(kEnv.mpkContext, (cl_mem_flags) (CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR),
-                             nBufferSize * sizeof(float), NULL, &clStatus);
-    CHECK_OPENCL(clStatus);
-    mpClmemStartPos    = clCreateBuffer(kEnv.mpkContext, (cl_mem_flags) (CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR),
-                             nBufferSize * sizeof(unsigned int), NULL, &clStatus);
-    CHECK_OPENCL(clStatus);
-    mpClmemEndPos    = clCreateBuffer(kEnv.mpkContext, (cl_mem_flags) (CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR),
-                             nBufferSize * sizeof(unsigned int), NULL, &clStatus);
-    CHECK_OPENCL(clStatus);
-
-    fpSrcData  = (float *)clEnqueueMapBuffer(kEnv.mpkCmdQueue,mpClmemSrcData,CL_TRUE,OPENCLWRAPPER_CL_MAP_WRITE_FLAG,0,nBufferSize * sizeof(float),0,NULL,NULL,&clStatus);
-    CHECK_OPENCL(clStatus);
-    npStartPos = (uint *)clEnqueueMapBuffer(kEnv.mpkCmdQueue,mpClmemStartPos,CL_TRUE,OPENCLWRAPPER_CL_MAP_WRITE_FLAG,0,nBufferSize * sizeof(uint),0,NULL,NULL,&clStatus);
-    CHECK_OPENCL(clStatus);
-    npEndPos   = (uint *)clEnqueueMapBuffer(kEnv.mpkCmdQueue,mpClmemEndPos,CL_TRUE,OPENCLWRAPPER_CL_MAP_WRITE_FLAG,0,nBufferSize * sizeof(uint),0,NULL,NULL,&clStatus);
-    CHECK_OPENCL(clStatus);

... etc. - the rest is truncated


More information about the Libreoffice-commits mailing list