[Libreoffice-commits] core.git: Branch 'feature/formula-core-rework' - 6 commits - sc/source

Michael Meeks michael.meeks at suse.com
Wed Jul 10 09:11:13 PDT 2013


 sc/source/core/data/formulacell.cxx      |    8 
 sc/source/core/opencl/formulagroupcl.cxx |  264 ++-
 sc/source/core/opencl/oclkernels.hxx     |  207 +-
 sc/source/core/opencl/openclwrapper.cxx  | 2159 +++++++++++++++++++++----------
 sc/source/core/opencl/openclwrapper.hxx  |  145 +-
 sc/source/core/tool/formulagroup.cxx     |    1 
 6 files changed, 1894 insertions(+), 890 deletions(-)

New commits:
commit 342e6908e22edf3b9443b2997d82d4ff75ad495d
Author: Michael Meeks <michael.meeks at suse.com>
Date:   Tue Jul 9 12:02:43 2013 +0100

    better opencl error reporting / diagnostics.

diff --git a/sc/source/core/opencl/openclwrapper.cxx b/sc/source/core/opencl/openclwrapper.cxx
index 31f1589..0c94721 100755
--- a/sc/source/core/opencl/openclwrapper.cxx
+++ b/sc/source/core/opencl/openclwrapper.cxx
@@ -2131,6 +2131,7 @@ double *OclCalc::OclSimpleDeltaOperation(OpCode eOp, const double *pOpArray,
     size_t global_work_size[1];
 
     kEnv.mpkKernel = clCreateKernel(kEnv.mpkProgram,kernelName, &clStatus);
+    CHECK_OPENCL(clStatus);
 
     // Ugh - horrible redundant copying ...
     cl_mem valuesCl   = allocateDoubleBuffer(kEnv, pOpArray, nElements, &clStatus);
@@ -2147,12 +2148,16 @@ double *OclCalc::OclSimpleDeltaOperation(OpCode eOp, const double *pOpArray,
 
     clStatus = clSetKernelArg(kEnv.mpkKernel, 0, sizeof(cl_mem),
                               (void *)&valuesCl);
+    CHECK_OPENCL(clStatus);
     clStatus = clSetKernelArg(kEnv.mpkKernel, 1, sizeof(cl_mem),
                               (void *)&subtractCl);
-    clStatus = clSetKernelArg(kEnv.mpkKernel, 2, sizeof(cl_mem),
+    CHECK_OPENCL(clStatus);
+    clStatus = clSetKernelArg(kEnv.mpkKernel, 2, sizeof(cl_int),
                               (void *)&start);
-    clStatus = clSetKernelArg(kEnv.mpkKernel, 3, sizeof(cl_mem),
+    CHECK_OPENCL(clStatus);
+    clStatus = clSetKernelArg(kEnv.mpkKernel, 3, sizeof(cl_int),
                               (void *)&end);
+    CHECK_OPENCL(clStatus);
     clStatus = clSetKernelArg(kEnv.mpkKernel, 4, sizeof(cl_mem),
                               (void *)&outputCl);
     CHECK_OPENCL(clStatus);
diff --git a/sc/source/core/opencl/openclwrapper.hxx b/sc/source/core/opencl/openclwrapper.hxx
index a0c132a..fe62554 100755
--- a/sc/source/core/opencl/openclwrapper.hxx
+++ b/sc/source/core/opencl/openclwrapper.hxx
@@ -12,6 +12,7 @@
 
 #include <config_features.h>
 #include <formula/opcode.hxx>
+#include <sal/detail/log.h>
 #include <cassert>
 #include <CL/cl.h>
 #endif
@@ -55,7 +56,7 @@ typedef int (*cl_kernel_function)(void **userdata, KernelEnv *kenv);
 #define CHECK_OPENCL(status)              \
 if(status != CL_SUCCESS)                  \
 {                                          \
-    printf ("error code is %d.\n",status);    \
+    printf ("OpenCL error code is %d at " SAL_DETAIL_WHERE "\n", status);    \
     return 0;                            \
 }
 
commit 8e60c2319b1f959200033bc3d89dca7d92815988
Author: Michael Meeks <michael.meeks at suse.com>
Date:   Mon Jul 8 21:49:31 2013 +0100

    try harder to setup the kernel environment.

diff --git a/sc/source/core/opencl/openclwrapper.cxx b/sc/source/core/opencl/openclwrapper.cxx
index 3030a2e..31f1589 100755
--- a/sc/source/core/opencl/openclwrapper.cxx
+++ b/sc/source/core/opencl/openclwrapper.cxx
@@ -2111,6 +2111,7 @@ double *OclCalc::OclSimpleDeltaOperation(OpCode eOp, const double *pOpArray,
                                          const double *pSubtractSingle, size_t nElements)
 {
     KernelEnv kEnv;
+    SetKernelEnv(&kEnv);
 
     // select a kernel: cut & paste coding is utterly evil.
     const char *kernelName;
commit 7fe2888b5a4f8d434d2d662bbf948bac2501b695
Author: Michael Meeks <michael.meeks at suse.com>
Date:   Mon Jul 8 21:35:26 2013 +0100

    cleanup formulagroupcl and add opencl kernel for averagedelta.
    
    Conflicts:
    	sc/source/core/opencl/openclwrapper.hxx
    
    Change-Id: Id4777d3854d34ab34dd29b050cd329a803023a39

diff --git a/sc/source/core/opencl/formulagroupcl.cxx b/sc/source/core/opencl/formulagroupcl.cxx
index d92a471..8bc0224 100755
--- a/sc/source/core/opencl/formulagroupcl.cxx
+++ b/sc/source/core/opencl/formulagroupcl.cxx
@@ -310,11 +310,8 @@ bool FormulaGroupInterpreterGroundwater::interpret(ScDocument& rDoc, const ScAdd
     OpCode eOp; // type of operation: ocAverage, ocMax, ocMin
     const double *pArrayToSubtractOneElementFrom;
     const double *pGroundWaterDataArray;
-    size_t        nGroundWaterDataArrayLen;
 
     // Output:
-    double *pResult = new double[xGroup->mnLength];
-    RETURN_IF_FAIL(pResult != NULL, "buffer alloc failed");
     std::vector<double> aMatrixContent;
 
     const formula::FormulaToken *p;
@@ -338,36 +335,13 @@ bool FormulaGroupInterpreterGroundwater::interpret(ScDocument& rDoc, const ScAdd
 
             p = rCode.NextNoSpaces();
             RETURN_IF_FAIL(p != NULL, "no function argument");
-            if (p->GetType() == formula::svDoubleVectorRef)
-            {
-                // FIXME: this is what I would expect; but table1.cxx's
-                // ScColumn::ResolveStaticReference as called from
-                // GroupTokenConverter::convert returns an ScMatrixToken un-conditionally
-                const formula::DoubleVectorRefToken* pDvr = static_cast<const formula::DoubleVectorRefToken*>(p);
-                const std::vector<const double*>& rArrays = pDvr->GetArrays();
-                RETURN_IF_FAIL(rArrays.size() == 1, "unexpectedly large double ref array");
-                RETURN_IF_FAIL(pDvr->GetArrayLength() == (size_t)xGroup->mnLength, "wrong double ref length");
-                RETURN_IF_FAIL(pDvr->IsStartFixed() && pDvr->IsEndFixed(), "non-fixed ranges )");
-                pGroundWaterDataArray = rArrays[0];
-                nGroundWaterDataArrayLen = xGroup->mnLength;
-            }
-            else
-            {
-                RETURN_IF_FAIL(p->GetType() == formula::svMatrix, "unexpected fn. param type");
-                const ScMatrixToken *pMatTok = static_cast<const ScMatrixToken *>(p);
-                pMatTok->GetMatrix()->GetDoubleArray( aMatrixContent );
-                // FIXME: horrible hackery: the legacy / excel shared formula oddness,
-                // such that the 1st entry is not truly shared, making these a different
-                // shape.
-                if (aMatrixContent.size() > (size_t)xGroup->mnLength + 1)
-                {
-                    fprintf(stderr, "Error size range mismatch: %ld vs %ld\n",
-                            (long)aMatrixContent.size(), (long)xGroup->mnLength);
-                    return false;
-                }
-                pGroundWaterDataArray = &aMatrixContent[0];
-                nGroundWaterDataArrayLen = aMatrixContent.size();
-            }
+            RETURN_IF_FAIL(p->GetType() == formula::svDoubleVectorRef, "wrong type of fn argument");
+            const formula::DoubleVectorRefToken* pDvr = static_cast<const formula::DoubleVectorRefToken*>(p);
+            const std::vector<const double*>& rArrays = pDvr->GetArrays();
+            RETURN_IF_FAIL(rArrays.size() == 1, "unexpectedly large double ref array");
+            RETURN_IF_FAIL(pDvr->GetArrayLength() == (size_t)xGroup->mnLength, "wrong double ref length");
+            RETURN_IF_FAIL(pDvr->IsStartFixed() && pDvr->IsEndFixed(), "non-fixed ranges )");
+            pGroundWaterDataArray = rArrays[0];
 
             p = rCode.NextNoSpaces();
             RETURN_IF_FAIL(p != NULL && p->GetOpCode() == ocClose, "missing closing )");
@@ -400,17 +374,22 @@ bool FormulaGroupInterpreterGroundwater::interpret(ScDocument& rDoc, const ScAdd
     //   =AVERAGE(L$6:L$7701) - L6
     // we would get:
     //   eOp => ocAverage
-    //   pGroundWaterDataArray => contains L$6:L$7701
-    //   pGroundWaterDataArrayLen => 7701 - 6 + 1
-    //   pArrayToSubtractOneElementFrom => contains L$5:L$7701 (overlapping)
+    //   pGroundWaterDataArray => contains L$5:L$7701
+    //   pArrayToSubtractOneElementFrom => contains L$5:L$7701 (ie. a copy)
     //   length of this array -> xGroup->mnLength
 
     fprintf (stderr, "Calculate !\n");
 
+    double *pResult = ocl_calc.OclSimpleDeltaOperation( eOp, pGroundWaterDataArray,
+                                                        pArrayToSubtractOneElementFrom,
+                                                        (size_t) xGroup->mnLength );
+    RETURN_IF_FAIL(pResult != NULL, "buffer alloc / calculaton failed");
+
     // Insert the double data, in rResult[i] back into the document
     rDoc.SetFormulaResults(rTopPos, pResult, xGroup->mnLength);
 
     delete [] pResult;
+
     SAL_DEBUG ("exit cleanly !");
     return true;
 }
diff --git a/sc/source/core/opencl/oclkernels.hxx b/sc/source/core/opencl/oclkernels.hxx
index 6c90126..e13c24a 100755
--- a/sc/source/core/opencl/oclkernels.hxx
+++ b/sc/source/core/opencl/oclkernels.hxx
@@ -142,7 +142,6 @@ __kernel void oclFormulaAverage(__global float *input,__global int *start,__glob
     for(i = start[id];i<=end[id];i++)
         sum += input[i];
     output[id] = sum / (end[id]-start[id]+1);
-
 }
 
 //Sumproduct
@@ -162,6 +161,33 @@ __kernel void oclFormulaMinverse(__global float *data,
 
 }
 
+// Double precision is a requirement of spreadsheets
+#if 0
+#if defined(cl_khr_fp64)  // Khronos extension
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+#elif defined(cl_amd_fp64)  // AMD extension
+#pragma OPENCL EXTENSION cl_amd_fp64 : enable
+#endif
+typedef double fp_t;
+#else
+typedef float fp_t;
+#endif
+
+__kernel void oclAverageDelta(__global fp_t *values, __global fp_t *subtract, __global int start, __global int end, __global fp_t *output)
+{
+    const unsigned int id = get_global_id(0);
+
+    // Average
+    int i;
+    fp_t sum = 0.0;
+    for(i = start; i < end; i++)
+        sum += values[i];
+    fp_t val = sum/(end-start);
+
+    // Subtract & output
+    output[id] = val - subtract[id];
+}
+
 );
 
 #endif // USE_EXTERNAL_KERNEL
diff --git a/sc/source/core/opencl/openclwrapper.cxx b/sc/source/core/opencl/openclwrapper.cxx
index 597f370..3030a2e 100755
--- a/sc/source/core/opencl/openclwrapper.cxx
+++ b/sc/source/core/opencl/openclwrapper.cxx
@@ -76,16 +76,17 @@ int OpenclDevice::ReleaseOpenclRunEnv() {
 }
 ///////////////////////////////////////////////////////
 ///////////////////////////////////////////////////////
-inline int OpenclDevice::AddKernelConfig(int kCount, const char *kName) {
+inline int OpenclDevice::AddKernelConfig(int kCount, const char *kName)
+{
     strcpy(gpuEnv.mArrykernelNames[kCount], kName);
     gpuEnv.mnKernelCount++;
     return 0;
 }
 
-int OpenclDevice::RegistOpenclKernel() {
-    if (!gpuEnv.mnIsUserCreated) {
+int OpenclDevice::RegistOpenclKernel()
+{
+    if (!gpuEnv.mnIsUserCreated)
         memset(&gpuEnv, 0, sizeof(gpuEnv));
-    }
 
     gpuEnv.mnFileCount = 0; //argc;
     gpuEnv.mnKernelCount = 0UL;
@@ -100,17 +101,22 @@ int OpenclDevice::RegistOpenclKernel() {
     AddKernelConfig(7, (const char*) "oclFormulaSumproduct");
     AddKernelConfig(8, (const char*) "oclFormulaMinverse");
 
-    AddKernelConfig(9,    (const char*) "oclSignedAdd");
+    AddKernelConfig(9,  (const char*) "oclSignedAdd");
     AddKernelConfig(10, (const char*) "oclSignedSub");
     AddKernelConfig(11, (const char*) "oclSignedMul");
     AddKernelConfig(12, (const char*) "oclSignedDiv");
+    AddKernelConfig(13, (const char*) "oclAverageDelta");
+
     return 0;
 }
-OpenclDevice::OpenclDevice(){
+
+OpenclDevice::OpenclDevice()
+{
     //InitEnv();
 }
 
-OpenclDevice::~OpenclDevice() {
+OpenclDevice::~OpenclDevice()
+{
     //ReleaseOpenclRunEnv();
 }
 
@@ -122,13 +128,15 @@ int OpenclDevice::SetKernelEnv(KernelEnv *envInfo)
 
     return 1;
 }
-int OpenclDevice::CheckKernelName(KernelEnv *envInfo,const char *kernelName){
+
+int OpenclDevice::CheckKernelName(KernelEnv *envInfo,const char *kernelName)
+{
     //printf("CheckKernelName,total count of kernels...%d\n", gpuEnv.kernelCount);
     int kCount;
     for(kCount=0; kCount < gpuEnv.mnKernelCount; kCount++) {
         if(strcasecmp(kernelName, gpuEnv.mArrykernelNames[kCount]) == 0) {
-        printf("match  %s kernel right\n",kernelName);
-        break;
+            printf("match %s kernel right\n",kernelName);
+            break;
         }
     }
     envInfo->mpkKernel = gpuEnv.mpArryKernels[kCount];
@@ -141,7 +149,8 @@ int OpenclDevice::CheckKernelName(KernelEnv *envInfo,const char *kernelName){
     return 1;
 }
 
-int OpenclDevice::ConvertToString(const char *filename, char **source) {
+int OpenclDevice::ConvertToString(const char *filename, char **source)
+{
     int file_size;
     size_t result;
     FILE *file = NULL;
@@ -174,8 +183,9 @@ int OpenclDevice::ConvertToString(const char *filename, char **source) {
     return 0;
 }
 
-int OpenclDevice::BinaryGenerated(const char * clFileName, FILE ** fhandle) {
-        unsigned int i = 0;
+int OpenclDevice::BinaryGenerated(const char * clFileName, FILE ** fhandle)
+{
+    unsigned int i = 0;
     cl_int status;
     char *str = NULL;
     FILE *fd = NULL;
@@ -208,7 +218,8 @@ int OpenclDevice::BinaryGenerated(const char * clFileName, FILE ** fhandle) {
 }
 
 int OpenclDevice::WriteBinaryToFile(const char* fileName, const char* birary,
-        size_t numBytes) {
+        size_t numBytes)
+{
     FILE *output = NULL;
     output = fopen(fileName, "wb");
     if (output == NULL) {
@@ -223,7 +234,8 @@ int OpenclDevice::WriteBinaryToFile(const char* fileName, const char* birary,
 }
 
 int OpenclDevice::GeneratBinFromKernelSource(cl_program program,
-        const char * clFileName) {
+                                             const char * clFileName)
+{
      unsigned int i = 0;
     cl_int status;
     size_t *binarySizes, numDevices;
@@ -319,10 +331,10 @@ int OpenclDevice::GeneratBinFromKernelSource(cl_program program,
     return 1;
 }
 
-int OpenclDevice::InitOpenclAttr(OpenCLEnv * env) {
-    if (gpuEnv.mnIsUserCreated) {
+int OpenclDevice::InitOpenclAttr(OpenCLEnv * env)
+{
+    if (gpuEnv.mnIsUserCreated)
         return 1;
-    }
 
     gpuEnv.mpContext    = env->mpOclContext;
     gpuEnv.mpPlatformID = env->mpOclPlatformID;
@@ -334,21 +346,24 @@ int OpenclDevice::InitOpenclAttr(OpenCLEnv * env) {
     return 0;
 }
 
-int OpenclDevice::CreateKernel(char * kernelname, KernelEnv * env) {
+int OpenclDevice::CreateKernel(char * kernelname, KernelEnv * env)
+{
     int status;
 
-    env->mpkKernel     = clCreateKernel(gpuEnv.mpArryPrograms[0], kernelname, &status);
+    env->mpkKernel   = clCreateKernel(gpuEnv.mpArryPrograms[0], kernelname, &status);
     env->mpkContext  = gpuEnv.mpContext;
     env->mpkCmdQueue = gpuEnv.mpCmdQueue;
     return status != CL_SUCCESS ? 1 : 0;
 }
 
-int OpenclDevice::ReleaseKernel(KernelEnv * env) {
+int OpenclDevice::ReleaseKernel(KernelEnv * env)
+{
     int status = clReleaseKernel(env->mpkKernel);
     return status != CL_SUCCESS ? 1 : 0;
 }
 
-int OpenclDevice::ReleaseOpenclEnv(GPUEnv *gpuInfo) {
+int OpenclDevice::ReleaseOpenclEnv(GPUEnv *gpuInfo)
+{
     int i = 0;
     int status = 0;
 
@@ -378,7 +393,8 @@ int OpenclDevice::ReleaseOpenclEnv(GPUEnv *gpuInfo) {
 }
 
 int OpenclDevice::RunKernelWrapper(cl_kernel_function function,
-        const char * kernelName, void **usrdata) {
+                                   const char * kernelName, void **usrdata)
+{
     printf("oclwrapper:RunKernel_wrapper...\n");
     if (RegisterKernelWrapper(kernelName, function) != 1) {
         fprintf(stderr,
@@ -389,8 +405,9 @@ int OpenclDevice::RunKernelWrapper(cl_kernel_function function,
 }
 
 int OpenclDevice::CachedOfKernerPrg(const GPUEnv *gpuEnvCached,
-        const char * clFileName) {
-  int i;
+                                    const char * clFileName)
+{
+    int i;
     for (i = 0; i < gpuEnvCached->mnFileCount; i++) {
         if (strcasecmp(gpuEnvCached->mArryKnelSrcFile[i], clFileName) == 0) {
             if (gpuEnvCached->mpArryPrograms[i] != NULL) {
@@ -574,6 +591,7 @@ int OpenclDevice::RunKernel(const char *kernelName, void **userdata) {
     }
     return 0;
 }
+
 int OpenclDevice::InitOpenclRunEnv(int argc, const char *buildOptionKernelfiles)
 {
     int status = 0;
@@ -1007,6 +1025,7 @@ int OclCalc::OclHostFormulaMax(double *srcData,int *start,int *end,double *outpu
     CHECK_OPENCL(clStatus);
     return 0;
 }
+
 int OclCalc::OclHostFormulaMin(double *srcData,int *start,int *end,double *output,int size)
 {
     KernelEnv kEnv;
@@ -1590,7 +1609,6 @@ int OclCalc::OclHostFormulaMin32Bits(float *fpSrcData,uint *npStartPos,uint *npE
     clStatus = clReleaseMemObject(outputCl);
     CHECK_OPENCL(clStatus);
     return 0;
-
 }
 
 int OclCalc::OclHostFormulaAverage32Bits(float *fpSrcData,uint *npStartPos,uint *npEndPos,double *output,int size)
@@ -2066,4 +2084,111 @@ int OclCalc::OclHostFormulaSumProduct(float *dpSrcData,int *npStart,int *npEnd,f
 }
 #endif
 
+#if 0
+typedef double fp_;
+#else
+typedef float fp_t;
+#endif
+
+// FIXME: should be templatised in <double> - double buffering [sic] rocks
+static cl_mem allocateDoubleBuffer(KernelEnv &rEnv, const double *_pValues,
+                                   size_t nElements, cl_int *pStatus)
+{
+    // Ugh - horrible redundant copying ...
+    cl_mem xValues = clCreateBuffer(rEnv.mpkContext,(cl_mem_flags) (CL_MEM_READ_WRITE),
+                                    nElements * sizeof(double), NULL, pStatus);
+    fp_t *pValues = (fp_t *)clEnqueueMapBuffer(rEnv.mpkCmdQueue,xValues,CL_TRUE,CL_MAP_WRITE,0,
+                                                   nElements * sizeof(fp_t),0,NULL,NULL,NULL);
+    for(int i=0;i<nElements;i++)
+        pValues[i] = (fp_t)_pValues[i];
+
+    clEnqueueUnmapMemObject(rEnv.mpkCmdQueue,xValues,pValues,0,NULL,NULL);
+
+    return xValues;
+}
+
+double *OclCalc::OclSimpleDeltaOperation(OpCode eOp, const double *pOpArray,
+                                         const double *pSubtractSingle, size_t nElements)
+{
+    KernelEnv kEnv;
+
+    // select a kernel: cut & paste coding is utterly evil.
+    const char *kernelName;
+    switch (eOp) {
+    case ocMax:
+    case ocMin:
+        ; // FIXME: fallthrough for now
+    case ocAverage:
+        kernelName = "oclAverageDelta";
+        break;
+    default:
+        assert(false);
+    }
+    CheckKernelName(&kEnv,kernelName);
+
+    cl_int clStatus;
+    size_t global_work_size[1];
+
+    kEnv.mpkKernel = clCreateKernel(kEnv.mpkProgram,kernelName, &clStatus);
+
+    // Ugh - horrible redundant copying ...
+    cl_mem valuesCl   = allocateDoubleBuffer(kEnv, pOpArray, nElements, &clStatus);
+    cl_mem subtractCl = allocateDoubleBuffer(kEnv, pSubtractSingle, nElements, &clStatus);
+
+    cl_int start = 0;
+    cl_int end = (cl_int) nElements;
+    cl_mem outputCl = clCreateBuffer(kEnv.mpkContext,
+                                     CL_MEM_READ_WRITE,
+                                     nElements * sizeof(fp_t),
+                                     NULL,
+                                     &clStatus);
+    CHECK_OPENCL(clStatus);
+
+    clStatus = clSetKernelArg(kEnv.mpkKernel, 0, sizeof(cl_mem),
+                              (void *)&valuesCl);
+    clStatus = clSetKernelArg(kEnv.mpkKernel, 1, sizeof(cl_mem),
+                              (void *)&subtractCl);
+    clStatus = clSetKernelArg(kEnv.mpkKernel, 2, sizeof(cl_mem),
+                              (void *)&start);
+    clStatus = clSetKernelArg(kEnv.mpkKernel, 3, sizeof(cl_mem),
+                              (void *)&end);
+    clStatus = clSetKernelArg(kEnv.mpkKernel, 4, sizeof(cl_mem),
+                              (void *)&outputCl);
+    CHECK_OPENCL(clStatus);
+
+    fprintf(stderr, "prior to enqueue range kernel\n");
+
+    global_work_size[0] = nElements;
+    clStatus = clEnqueueNDRangeKernel(kEnv.mpkCmdQueue, kEnv.mpkKernel, 1,
+                                      NULL, global_work_size, NULL, 0, NULL, NULL);
+    CHECK_OPENCL(clStatus);
+
+    double *pResult = new double[nElements];
+    if(!pResult)
+        return NULL; // leak.
+
+    fp_t *pOutput = (fp_t *)clEnqueueMapBuffer(kEnv.mpkCmdQueue,outputCl,CL_TRUE,
+                                               CL_MAP_READ,0,nElements*sizeof(fp_t),
+                                               0,NULL,NULL,NULL);
+    for(int i = 0; i < nElements; i++)
+        pResult[i] = (double)pOutput[i];
+
+    clEnqueueUnmapMemObject(kEnv.mpkCmdQueue,outputCl,pOutput,0,NULL,NULL);
+
+    clStatus = clFinish(kEnv.mpkCmdQueue);
+    CHECK_OPENCL(clStatus);
+    clStatus = clReleaseKernel(kEnv.mpkKernel);
+    CHECK_OPENCL(clStatus);
+    clStatus = clReleaseMemObject(valuesCl);
+    CHECK_OPENCL(clStatus);
+    clStatus = clReleaseMemObject(subtractCl);
+    CHECK_OPENCL(clStatus);
+    clStatus = clReleaseMemObject(outputCl);
+    CHECK_OPENCL(clStatus);
+
+    fprintf(stderr, "completed opencl delta operation\n");
+
+    return pResult;
+}
+
 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */
diff --git a/sc/source/core/opencl/openclwrapper.hxx b/sc/source/core/opencl/openclwrapper.hxx
index 3e87f84..a0c132a 100755
--- a/sc/source/core/opencl/openclwrapper.hxx
+++ b/sc/source/core/opencl/openclwrapper.hxx
@@ -11,10 +11,8 @@
 #define _OPENCL_WRAPPER_H_
 
 #include <config_features.h>
-
-#ifdef __APPLE__
-#include <OpenCL/cl.h>
-#else
+#include <formula/opcode.hxx>
+#include <cassert>
 #include <CL/cl.h>
 #endif
 
@@ -212,6 +210,7 @@ public:
     int OclHostFormulaMax32Bits(float *fpSrcData,uint *npStartPos,uint *npEndPos,double *output,int outputSize);
     int OclHostFormulaMin32Bits(float *fpSrcData,uint *npStartPos,uint *npEndPos,double *output,int outputSize);
     int OclHostFormulaAverage32Bits(float *fpSrcData,uint *npStartPos,uint *npEndPos,double *output,int outputSize);
+    double *OclSimpleDeltaOperation(OpCode eOp, const double *pOpArray, const double *pSubtractSingle, size_t nElements);
 
     //int OclHostFormulaCount(int *startPos,int *endPos,float *output,int outputSize);
     //int OclHostFormulaSum(float *srcData,int *startPos,int *endPos,float *output,int outputSize);
commit ca05948520906052d8bd608231970873ebf01414
Author: Michael Meeks <michael.meeks at suse.com>
Date:   Mon Jul 8 14:17:35 2013 +0100

    Add new opencl placeholder backend for specific scenarios.

diff --git a/sc/source/core/data/formulacell.cxx b/sc/source/core/data/formulacell.cxx
index 17653c9..55f5010 100644
--- a/sc/source/core/data/formulacell.cxx
+++ b/sc/source/core/data/formulacell.cxx
@@ -3064,6 +3064,14 @@ public:
 
     bool convert(ScTokenArray& rCode)
     {
+        { // debug to start with:
+            ScCompiler aComp( &mrDoc, mrPos, rCode);
+            aComp.SetGrammar(formula::FormulaGrammar::GRAM_NATIVE_XL_R1C1);
+            OUStringBuffer aAsString;
+            aComp.CreateStringFromTokenArray(aAsString);
+            SAL_DEBUG("interpret formula: " << aAsString.makeStringAndClear());
+        }
+
         rCode.Reset();
         for (const formula::FormulaToken* p = rCode.First(); p; p = rCode.Next())
         {
diff --git a/sc/source/core/opencl/formulagroupcl.cxx b/sc/source/core/opencl/formulagroupcl.cxx
index 857f045..d92a471 100755
--- a/sc/source/core/opencl/formulagroupcl.cxx
+++ b/sc/source/core/opencl/formulagroupcl.cxx
@@ -275,10 +275,153 @@ bool FormulaGroupInterpreterOpenCL::interpret(ScDocument& rDoc, const ScAddress&
     return true;
 }
 
+/// Special case of formula compiler for groundwatering
+class FormulaGroupInterpreterGroundwater : public FormulaGroupInterpreterSoftware
+{
+public:
+    FormulaGroupInterpreterGroundwater() :
+        FormulaGroupInterpreterSoftware()
+    {
+        fprintf(stderr,"\n\n ***** Groundwater Backend *****\n\n\n");
+        OclCalc::InitEnv();
+    }
+    virtual ~FormulaGroupInterpreterGroundwater()
+    {
+        OclCalc::ReleaseOpenclRunEnv();
+    }
+
+    virtual ScMatrixRef inverseMatrix(const ScMatrix& /* rMat */) { return ScMatrixRef(); }
+    virtual bool interpret(ScDocument& rDoc, const ScAddress& rTopPos,
+                           const ScFormulaCellGroupRef& xGroup, ScTokenArray& rCode);
+};
+
+#define RETURN_IF_FAIL(a,b) do { if (!(a)) { fprintf (stderr,b); return false; } } while (0)
+
+#include "compiler.hxx"
+
+// FIXME: really we should compile the formula and operate on the
+// RPN representation which -should- be more compact and have no Open / Close
+// or precedence issues; cf. rCode.FirstRPN() etc.
+bool FormulaGroupInterpreterGroundwater::interpret(ScDocument& rDoc, const ScAddress& rTopPos,
+                                                   const ScFormulaCellGroupRef& xGroup,
+                                                   ScTokenArray& rCode)
+{
+    // Inputs: both of length xGroup->mnLength
+    OpCode eOp; // type of operation: ocAverage, ocMax, ocMin
+    const double *pArrayToSubtractOneElementFrom;
+    const double *pGroundWaterDataArray;
+    size_t        nGroundWaterDataArrayLen;
+
+    // Output:
+    double *pResult = new double[xGroup->mnLength];
+    RETURN_IF_FAIL(pResult != NULL, "buffer alloc failed");
+    std::vector<double> aMatrixContent;
+
+    const formula::FormulaToken *p;
+
+    // special cased formula parser:
+
+    p = rCode.FirstNoSpaces();
+    RETURN_IF_FAIL(p != NULL && p->GetOpCode() == ocOpen, "no opening (");
+
+    {
+        p = rCode.NextNoSpaces();
+        RETURN_IF_FAIL(p != NULL, "no operator");
+
+        // Function:
+        eOp = p->GetOpCode();
+        RETURN_IF_FAIL(eOp == ocAverage || eOp == ocMax || eOp == ocMin, "unexpected opcode");
+
+        { // function arguments
+            p = rCode.NextNoSpaces();
+            RETURN_IF_FAIL(p != NULL && p->GetOpCode() == ocOpen, "missing opening (");
+
+            p = rCode.NextNoSpaces();
+            RETURN_IF_FAIL(p != NULL, "no function argument");
+            if (p->GetType() == formula::svDoubleVectorRef)
+            {
+                // FIXME: this is what I would expect; but table1.cxx's
+                // ScColumn::ResolveStaticReference as called from
+                // GroupTokenConverter::convert returns an ScMatrixToken un-conditionally
+                const formula::DoubleVectorRefToken* pDvr = static_cast<const formula::DoubleVectorRefToken*>(p);
+                const std::vector<const double*>& rArrays = pDvr->GetArrays();
+                RETURN_IF_FAIL(rArrays.size() == 1, "unexpectedly large double ref array");
+                RETURN_IF_FAIL(pDvr->GetArrayLength() == (size_t)xGroup->mnLength, "wrong double ref length");
+                RETURN_IF_FAIL(pDvr->IsStartFixed() && pDvr->IsEndFixed(), "non-fixed ranges )");
+                pGroundWaterDataArray = rArrays[0];
+                nGroundWaterDataArrayLen = xGroup->mnLength;
+            }
+            else
+            {
+                RETURN_IF_FAIL(p->GetType() == formula::svMatrix, "unexpected fn. param type");
+                const ScMatrixToken *pMatTok = static_cast<const ScMatrixToken *>(p);
+                pMatTok->GetMatrix()->GetDoubleArray( aMatrixContent );
+                // FIXME: horrible hackery: the legacy / excel shared formula oddness,
+                // such that the 1st entry is not truly shared, making these a different
+                // shape.
+                if (aMatrixContent.size() > (size_t)xGroup->mnLength + 1)
+                {
+                    fprintf(stderr, "Error size range mismatch: %ld vs %ld\n",
+                            (long)aMatrixContent.size(), (long)xGroup->mnLength);
+                    return false;
+                }
+                pGroundWaterDataArray = &aMatrixContent[0];
+                nGroundWaterDataArrayLen = aMatrixContent.size();
+            }
+
+            p = rCode.NextNoSpaces();
+            RETURN_IF_FAIL(p != NULL && p->GetOpCode() == ocClose, "missing closing )");
+        }
+
+        // Subtract operator
+        p = rCode.NextNoSpaces();
+        RETURN_IF_FAIL(p != NULL && p->GetOpCode() == ocSub, "missing subtract opcode");
+
+        { // subtract parameter
+            p = rCode.NextNoSpaces();
+            RETURN_IF_FAIL(p != NULL, "no tokens");
+            RETURN_IF_FAIL(p->GetType() == formula::svSingleVectorRef, "not a single ref");
+            const formula::SingleVectorRefToken* pSvr = static_cast<const formula::SingleVectorRefToken*>(p);
+            pArrayToSubtractOneElementFrom = pSvr->GetArray();
+            RETURN_IF_FAIL(pSvr->GetArrayLength() == (size_t)xGroup->mnLength, "wrong single ref length");
+        }
+
+        p = rCode.NextNoSpaces();
+        RETURN_IF_FAIL(p != NULL && p->GetOpCode() == ocClose, "missing closing )");
+    }
+
+    p = rCode.NextNoSpaces();
+    RETURN_IF_FAIL(p == NULL, "has 5th");
+
+    static OclCalc ocl_calc;
+
+    // Here we have all the data we need to dispatch our openCL kernel [ I hope ]
+    // so for:
+    //   =AVERAGE(L$6:L$7701) - L6
+    // we would get:
+    //   eOp => ocAverage
+    //   pGroundWaterDataArray => contains L$6:L$7701
+    //   pGroundWaterDataArrayLen => 7701 - 6 + 1
+    //   pArrayToSubtractOneElementFrom => contains L$5:L$7701 (overlapping)
+    //   length of this array -> xGroup->mnLength
+
+    fprintf (stderr, "Calculate !\n");
+
+    // Insert the double data, in rResult[i] back into the document
+    rDoc.SetFormulaResults(rTopPos, pResult, xGroup->mnLength);
+
+    delete [] pResult;
+    SAL_DEBUG ("exit cleanly !");
+    return true;
+}
+
 namespace opencl {
     sc::FormulaGroupInterpreter *createFormulaGroupInterpreter()
     {
-        return new sc::FormulaGroupInterpreterOpenCL();
+        if (getenv("SC_GROUNDWATER"))
+            return new sc::FormulaGroupInterpreterGroundwater();
+        else
+            return new sc::FormulaGroupInterpreterOpenCL();
     }
 } // namespace opencl
 
commit b13900ec67eca34307d8294a874b5946bffa110d
Author: Michael Meeks <michael.meeks at suse.com>
Date:   Mon Jul 8 11:57:39 2013 +0100

    avoid regular re-creation of the formulagroup interpreter.

diff --git a/sc/source/core/tool/formulagroup.cxx b/sc/source/core/tool/formulagroup.cxx
index 221a768..627c5f5 100644
--- a/sc/source/core/tool/formulagroup.cxx
+++ b/sc/source/core/tool/formulagroup.cxx
@@ -172,6 +172,7 @@ FormulaGroupInterpreter *FormulaGroupInterpreter::getStatic()
     if ( msInstance &&
          bOpenCLEnabled != ScInterpreter::GetGlobalConfig().mbOpenCLEnabled )
     {
+        bOpenCLEnabled = ScInterpreter::GetGlobalConfig().mbOpenCLEnabled;
         delete msInstance;
         msInstance = NULL;
     }
commit 65dcb6eba8009fef722a5cd714088107d6d37016
Author: Michael Meeks <michael.meeks at suse.com>
Date:   Mon Jul 8 10:49:05 2013 +0100

    Latest cleanup and improvements of opencl backend.
    
    Conflicts:
    	sc/source/core/opencl/openclwrapper.cxx
    
    Change-Id: I3fdc90570e90a156ccecb511fc04b473752018bd

diff --git a/sc/source/core/opencl/formulagroupcl.cxx b/sc/source/core/opencl/formulagroupcl.cxx
old mode 100644
new mode 100755
index 6a96129..857f045
--- a/sc/source/core/opencl/formulagroupcl.cxx
+++ b/sc/source/core/opencl/formulagroupcl.cxx
@@ -65,35 +65,32 @@ ScMatrixRef FormulaGroupInterpreterOpenCL::inverseMatrix(const ScMatrix& /* rMat
 bool FormulaGroupInterpreterOpenCL::interpret(ScDocument& rDoc, const ScAddress& rTopPos,
                                               const ScFormulaCellGroupRef& xGroup, ScTokenArray& rCode)
 {
-    size_t rowSize = xGroup->mnLength; //, srcSize = 0;
+    size_t rowSize = xGroup->mnLength;
     fprintf(stderr,"rowSize at begin is ...%ld.\n",(long)rowSize);
-    int *rangeStart =NULL; // The first position for calculation,for example,the A1 in (=MAX(A1:A100))
-    int *rangeEnd = NULL; // The last position for calculation,for example, the A100 in (=MAX(A1:A100))
     // The row quantity can be gotten from p2->GetArrayLength()
-    int count1 =0,count2 =0,count3=0;
-    int oclOp=0;
-    double *srcData = NULL; // Point to the input data from CPU
-    double *rResult=NULL; // Point to the output data from GPU
-    double *leftData=NULL; // Left input for binary operator(+,-,*,/),for example,(=leftData+rightData)
-    double *rightData=NULL; // Right input for binary operator(+,-,*,/),for example,(=leftData/rightData)
-                            // The rightData can't be zero for "/"
-
-    leftData  = (double *)malloc(sizeof(double) * rowSize);
-    rightData = (double *)malloc(sizeof(double) * rowSize);
-    rResult   = (double *)malloc(sizeof(double) * rowSize*2);// For 2 columns(B,C)
-    srcData = (double *)calloc(rowSize,sizeof(double));
-
-    rangeStart =(int *)malloc(sizeof(int) * rowSize);
-    rangeEnd   =(int *)malloc(sizeof(int) * rowSize);
-
-    memset(rResult,0,rowSize);
-    if(NULL==leftData||NULL==rightData||
-           NULL==rResult||NULL==rangeStart||NULL==rangeEnd)
+    int nCount1 = 0, nCount2 = 0, nCount3 = 0;
+    int nOclOp = 0;
+    double *rResult = NULL; // Point to the output data from GPU
+    rResult = (double *)malloc(sizeof(double) * rowSize*2);// For 2 columns(B,C)
+    if(NULL==rResult)
     {
         printf("malloc err\n");
         return false;
     }
-    // printf("rowSize is %d.\n",rowsize);
+    memset(rResult,0,rowSize);
+    float * fpOclSrcData = NULL; // Point to the input data from CPU
+    uint * npOclStartPos = NULL; // The first position for calculation,for example,the A1 in (=MAX(A1:A100))
+    uint * npOclEndPos     = NULL; // The last position for calculation,for example, the A100 in (=MAX(A1:A100))
+    float * fpLeftData     = NULL; // Left input for binary operator(+,-,*,/),for example,(=leftData+rightData)
+    float * fpRightData  = NULL; // Right input for binary operator(+,-,*,/),for example,(=leftData/rightData)
+                                 // The rightData can't be zero for "/"
+    static OclCalc ocl_calc;
+    // Don't know how large the size will be applied previously, so create them as the rowSize or 65536
+    // Don't know which formulae will be used previously, so create buffers for different formulae used probably
+    ocl_calc.CreateBuffer(fpOclSrcData,npOclStartPos,npOclEndPos,rowSize);
+    ocl_calc.CreateBuffer(fpLeftData,fpRightData,rowSize);
+    //printf("pptrr is %d,%d,%d\n",fpOclSrcData,npOclStartPos,npOclEndPos);
+///////////////////////////////////////////////////////////////////////////////////////////
 
     // Until we implement group calculation for real, decompose the group into
     // individual formula token arrays for individual calculation.
@@ -125,26 +122,23 @@ bool FormulaGroupInterpreterOpenCL::interpret(ScDocument& rDoc, const ScAddress&
                     size_t nRowSize = nRowEnd - nRowStart + 1;
                     ScMatrixRef pMat(new ScMatrix(nColSize, nRowSize, 0.0));
 
-                    //srcSize = rowSize+nRowSize-rowSize%nRowSize;//align as nRowSize
-                    //srcData = (double *)calloc(srcSize,sizeof(double));
-                    rangeStart[i] = nRowStart;//record the start position
-                    rangeEnd[i] = nRowEnd;//record the end position
+                    npOclStartPos[i] = nRowStart; // record the start position
+                    npOclEndPos[i]     = nRowEnd;   // record the end position
 
                     for (size_t nCol = 0; nCol < nColSize; ++nCol)
                     {
                         const double* pArray = rArrays[nCol];
-
-                        //printf("pArray is %p.\n",pArray);
                         if( NULL==pArray )
                         {
                             fprintf(stderr,"Error: pArray is NULL!\n");
                             return false;
                         }
-                        //fprintf(stderr,"(rowSize+nRowSize-1) is %d.\n",rowSize+nRowSize-1);
+
                         for( size_t u=0; u<rowSize; u++ )
                         {
-                            srcData[u] = pArray[u];// note:rowSize<=srcSize
-                            //fprintf(stderr,"srcData[%d] is %f.\n",u,srcData[u]);
+                            // Many video cards can't support double type in kernel, so need transfer the double to float
+                            fpOclSrcData[u] = (float)pArray[u];
+                            //fprintf(stderr,"fpOclSrcData[%d] is %f.\n",u,fpOclSrcData[u]);
                         }
 
                         for (size_t nRow = 0; nRow < nRowSize; ++nRow)
@@ -177,26 +171,26 @@ bool FormulaGroupInterpreterOpenCL::interpret(ScDocument& rDoc, const ScAddress&
             OpCode eOp = pCur->GetOpCode();
             if(eOp==0)
             {
-                  if(count3%2==0)
-                    leftData[count1++] = pCur->GetDouble();
-                   else
-                    rightData[count2++] = pCur->GetDouble();
-                count3++;
-               }
-               else if( eOp!=ocOpen && eOp!=ocClose )
-                oclOp = eOp;
-
-//            if(count1>0){//dbg
-//                fprintf(stderr,"leftData is %f.\n",leftData[count1-1]);
-//                count1--;
-//            }
-//            if(count2>0){//dbg
-//                fprintf(stderr,"rightData is %f.\n",rightData[count2-1]);
-//                count2--;
-//            }
+                 if(nCount3%2==0)
+                     fpLeftData[nCount1++] = (float)pCur->GetDouble();
+                 else
+                     fpRightData[nCount2++] = (float)pCur->GetDouble();
+                 nCount3++;
+            }
+            else if( eOp!=ocOpen && eOp!=ocClose )
+                nOclOp = eOp;
+
+//              if(count1>0){//dbg
+//                  fprintf(stderr,"leftData is %f.\n",leftData[count1-1]);
+//                  count1--;
+//              }
+//              if(count2>0){//dbg
+//                  fprintf(stderr,"rightData is %f.\n",rightData[count2-1]);
+//                  count2--;
+//              }
         }
 
-        if(!getenv("SC_GPU"))
+        if(!getenv("SC_GPU")||!ocl_calc.GetOpenclState())
         {
             fprintf(stderr,"ccCPU flow...\n\n");
             ScCompiler aComp(&rDoc, aTmpPos, aCode2);
@@ -211,34 +205,42 @@ bool FormulaGroupInterpreterOpenCL::interpret(ScDocument& rDoc, const ScAddress&
     } // for loop end (xGroup->mnLength)
 
     // For GPU calculation
-    if(getenv("SC_GPU"))
+    if(getenv("SC_GPU")&&ocl_calc.GetOpenclState())
     {
             fprintf(stderr,"ggGPU flow...\n\n");
-            printf(" oclOp is... %d\n",oclOp);
+            printf(" oclOp is... %d\n",nOclOp);
             osl_getSystemTime(&aTimeBefore); //timer
-            static OclCalc ocl_calc;
-            switch(oclOp)
+            switch(nOclOp)
             {
                 case ocAdd:
-                       ocl_calc.OclHostSignedAdd(leftData,rightData,rResult,count1);
+                    ocl_calc.OclHostSignedAdd32Bits(fpLeftData,fpRightData,rResult,nCount1);
                     break;
                 case ocSub:
-                    ocl_calc.OclHostSignedSub(leftData,rightData,rResult,count1);
+                    ocl_calc.OclHostSignedSub32Bits(fpLeftData,fpRightData,rResult,nCount1);
                     break;
                 case ocMul:
-                    ocl_calc.OclHostSignedMul(leftData,rightData,rResult,count1);
+                    ocl_calc.OclHostSignedMul32Bits(fpLeftData,fpRightData,rResult,nCount1);
                     break;
                 case ocDiv:
-                    ocl_calc.OclHostSignedDiv(leftData,rightData,rResult,count1);
+                    ocl_calc.OclHostSignedDiv32Bits(fpLeftData,fpRightData,rResult,nCount1);
                     break;
                 case ocMax:
-                    ocl_calc.OclHostFormulaMax(srcData,rangeStart,rangeEnd,rResult,rowSize);
+                    ocl_calc.OclHostFormulaMax32Bits(fpOclSrcData,npOclStartPos,npOclEndPos,rResult,rowSize);
                     break;
                 case ocMin:
-                    ocl_calc.OclHostFormulaMin(srcData,rangeStart,rangeEnd,rResult,rowSize);
+                    ocl_calc.OclHostFormulaMin32Bits(fpOclSrcData,npOclStartPos,npOclEndPos,rResult,rowSize);
                     break;
                 case ocAverage:
-                    ocl_calc.OclHostFormulaAverage(srcData,rangeStart,rangeEnd,rResult,rowSize);
+                    ocl_calc.OclHostFormulaAverage32Bits(fpOclSrcData,npOclStartPos,npOclEndPos,rResult,rowSize);
+                    break;
+                case ocSum:
+                    //ocl_calc.OclHostFormulaSum(srcData,rangeStart,rangeEnd,rResult,rowSize);
+                    break;
+                case ocCount:
+                    //ocl_calc.OclHostFormulaCount(rangeStart,rangeEnd,rResult,rowSize);
+                    break;
+                case ocSumProduct:
+                    //ocl_calc.OclHostFormulaSumProduct(srcData,rangeStart,rangeEnd,rResult,rowSize);
                     break;
                 default:
                     fprintf(stderr,"No OpenCL function for this calculation.\n");
@@ -254,26 +256,16 @@ bool FormulaGroupInterpreterOpenCL::interpret(ScDocument& rDoc, const ScAddress&
 /////////////////////////////////////////////////////
 
 //rResult[i];
-//            for(sal_Int32 i = 0; i < rowSize; ++i){//dbg output results
-//                fprintf(stderr,"After GPU,rRsults[%d] is ...%f\n",i,rResult[i]);
-//            }
+//           for(sal_Int32 i = 0; i < rowSize; ++i){//dbg output results
+//               fprintf(stderr,"After GPU,rRsults[%d] is ...%f\n",i,rResult[i]);
+//           }
 
             // Insert the double data, in rResult[i] back into the document
             rDoc.SetFormulaResults(rTopPos, rResult, xGroup->mnLength);
         }
 
-        if(leftData)
-            free(leftData);
-        if(rightData)
-            free(rightData);
-        if(rangeStart)
-            free(rangeStart);
-        if(rangeEnd)
-            free(rangeEnd);
         if(rResult)
             free(rResult);
-        if(srcData)
-            free(srcData);
 
         if(getenv("SC_GPUSAMPLE")){
             //fprintf(stderr,"FormulaGroupInterpreter::interpret(),iniflag...%d\n",ocl_calc.GetOpenclState());
diff --git a/sc/source/core/opencl/oclkernels.hxx b/sc/source/core/opencl/oclkernels.hxx
old mode 100644
new mode 100755
index 3269f3a..6c90126
--- a/sc/source/core/opencl/oclkernels.hxx
+++ b/sc/source/core/opencl/oclkernels.hxx
@@ -6,153 +6,158 @@
  * License, v. 2.0. If a copy of the MPL was not distributed with this
  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
  */
+
 #ifndef _OCL_KERNEL_H_
 #define _OCL_KERNEL_H_
 
 #ifndef USE_EXTERNAL_KERNEL
 #define KERNEL( ... )# __VA_ARGS__
 
-
 /////////////////////////////////////////////
 const char *kernel_src = KERNEL(
 __kernel void hello(__global uint *buffer)
 
 {
-size_t idx = get_global_id(0);
-
-buffer[idx]=idx;
-
+    size_t idx = get_global_id(0);
+    buffer[idx]=idx;
 }
 
 __kernel void oclformula(__global float *data,
-					   const uint type)
+                       const uint type)
 {
-	const unsigned int i = get_global_id(0);
-
-	switch (type)
-	{
-		case 0:          //MAX
-		{
-			//printf("%i ?%f>%f\n",i,data[2*i],data[2*i+1]);
-			if(data[2*i]>data[2*i+1])
-				data[i] = data[2*i];
-			else
-				data[i] = data[2*i+1];
-			break;
-		}
-		case 1:          //MIN
-		{
-			//printf("%d ?%d<%d\n",i,data[2*i],data[2*i+1]);
-			if(data[2*i]<data[2*i+1])
-				data[i] = data[2*i];
-			else
-				data[i] = data[2*i+1];
-			break;
-		}
-		case 2:          //SUM
-		case 3:          //AVG
-		{
-			//printf("%d %d+%d\n",i,data[2*i],data[2*i+1]);
-			data[i] = data[2*i] + data[2*i+1];
-			break;
-		}
-		default:
-			break;
-
-	}
+    const unsigned int i = get_global_id(0);
+
+    switch (type)
+    {
+        case 0:          //MAX
+        {
+            //printf("%i ?%f>%f\n",i,data[2*i],data[2*i+1]);
+            if(data[2*i]>data[2*i+1])
+                data[i] = data[2*i];
+            else
+                data[i] = data[2*i+1];
+            break;
+        }
+        case 1:          //MIN
+        {
+            //printf("%d ?%d<%d\n",i,data[2*i],data[2*i+1]);
+            if(data[2*i]<data[2*i+1])
+                data[i] = data[2*i];
+            else
+                data[i] = data[2*i+1];
+            break;
+        }
+        case 2:          //SUM
+        case 3:          //AVG
+        {
+            //printf("%d %d+%d\n",i,data[2*i],data[2*i+1]);
+            data[i] = data[2*i] + data[2*i+1];
+            break;
+        }
+        default:
+            break;
+
+    }
 }
 
 
 __kernel void oclSignedAdd(__global float *ltData,__global float *rtData,__global float *otData)
 {
-	const unsigned int id = get_global_id(0);
-	otData[id] = ltData[id] + rtData[id];
+    const unsigned int id = get_global_id(0);
+    otData[id] = ltData[id] + rtData[id];
 }
 
 
 __kernel void oclSignedSub(__global float *ltData,__global float *rtData,__global float *otData)
 {
-	const unsigned int id = get_global_id(0);
-	otData[id] = ltData[id] - rtData[id];
+    const unsigned int id = get_global_id(0);
+    otData[id] = ltData[id] - rtData[id];
 
 }
 
 __kernel void oclSignedMul(__global float *ltData,__global float *rtData,__global float *otData)
 {
-	int id = get_global_id(0);
-	otData[id] =ltData[id] * rtData[id];
+    int id = get_global_id(0);
+    otData[id] =ltData[id] * rtData[id];
 }
 
 __kernel void oclSignedDiv(__global float *ltData,__global float *rtData,__global float *otData)
 {
-	const unsigned int id = get_global_id(0);
-	otData[id] = ltData[id] / rtData[id];
+    const unsigned int id = get_global_id(0);
+    otData[id] = ltData[id] / rtData[id];
 }
 
 __kernel void oclFormulaMin(__global float *input,__global int *start,__global int *end,__global float *output)
 {
-	const unsigned int id = get_global_id(0);
-	int i=0;
-	unsigned int startFlag = start[id];
-	unsigned int endFlag = end[id];
-	float min = input[startFlag];
-	for(i=startFlag;i<=endFlag;i++)
-	{
-		if(input[i]<min)
-			min = input[i];
-	}
-	output[id] = min;
+    const unsigned int id = get_global_id(0);
+    int i=0;
+    unsigned int startFlag = start[id];
+    unsigned int endFlag = end[id];
+    float min = input[startFlag];
+    for(i=startFlag;i<=endFlag;i++)
+    {
+        if(input[i]<min)
+            min = input[i];
+    }
+    output[id] = min;
 
 }
 
 __kernel void oclFormulaMax(__global float *input,__global int *start,__global int *end,__global float *output)
 {
-	const unsigned int id = get_global_id(0);
-	int i=0;
-	unsigned int startFlag = start[id];
-	unsigned int endFlag = end[id];
-	float max = input[startFlag];
-	for(i=startFlag;i<=endFlag;i++)
-	{
-		if(input[i]>max)
-			max = input[i];
-	}
-	output[id] = max;
+    const unsigned int id = get_global_id(0);
+    int i=0;
+    unsigned int startFlag = start[id];
+    unsigned int endFlag = end[id];
+    float max = input[startFlag];
+    for(i=startFlag;i<=endFlag;i++)
+    {
+        if(input[i]>max)
+            max = input[i];
+    }
+    output[id] = max;
 
 }
-
-__kernel void oclFormulaSum(__global float *data,
-					   const uint type)
+//Sum
+__kernel void oclFormulaSum(__global float *input,__global int *start,__global int *end,__global float *output)
 {
-
+    const unsigned int nId = get_global_id(0);
+    float fSum = 0.0f;
+    for(int i = start[nId]; i<=end[nId]; i++)
+        fSum += input[i];
+    output[nId] = fSum ;
 }
-
-__kernel void oclFormulaCount(__global float *data,
-					   const uint type)
+//Count
+__kernel void oclFormulaCount(__global int *start,__global int *end,__global float *output)
 {
-
+    const unsigned int nId = get_global_id(0);
+    output[nId] = end[nId] - start[nId] + 1;
 }
 
 __kernel void oclFormulaAverage(__global float *input,__global int *start,__global int *end,__global float *output)
 {
-	const unsigned int id = get_global_id(0);
-	int i=0;
-	float sum=0;
-	for(i = start[id];i<=end[id];i++)
-		sum += input[i];
-	output[id] = sum / (end[id]-start[id]+1);
+    const unsigned int id = get_global_id(0);
+    int i=0;
+    float sum=0;
+    for(i = start[id];i<=end[id];i++)
+        sum += input[i];
+    output[id] = sum / (end[id]-start[id]+1);
 
 }
 
-
-__kernel void oclFormulaSumproduct(__global float *data,
-					   const uint type)
+//Sumproduct
+__kernel void oclFormulaSumproduct(__global float *firstCol,__global float *secondCol,__global int *start,__global int *end,__global float *output)
 {
-
+    const int nId = get_global_id(0);
+    int nCount     = start[nId] - end[nId] + 1;
+    int nStartA  = start[nId*2];
+    int nStartB  = start[nId*2+1];
+    for(int i = 0; i<nCount; i++)
+        output[nId] += firstCol[nStartA+i]*secondCol[nStartB+i];
 }
 
 __kernel void oclFormulaMinverse(__global float *data,
-					   const uint type)
+                       const uint type)
 {
 
 }
diff --git a/sc/source/core/opencl/openclwrapper.cxx b/sc/source/core/opencl/openclwrapper.cxx
old mode 100644
new mode 100755
index aa3ce69..597f370
--- a/sc/source/core/opencl/openclwrapper.cxx
+++ b/sc/source/core/opencl/openclwrapper.cxx
@@ -22,6 +22,7 @@
 GPUEnv OpenclDevice::gpuEnv;
 int OpenclDevice::isInited =0;
 
+
 #ifdef SAL_WIN32
 
 #define OPENCL_DLL_NAME "opencllo.dll"
@@ -32,62 +33,62 @@ HINSTANCE HOpenclDll = NULL;
 
 int OpenclDevice::LoadOpencl()
 {
-	//fprintf(stderr, " LoadOpenclDllxx... \n");
-	OpenclDll = static_cast<HINSTANCE>(HOpenclDll);
-	OpenclDll = LoadLibrary(OPENCL_DLL_NAME);
-	if (!static_cast<HINSTANCE>(OpenclDll))
-	{
-		fprintf(stderr, " Load opencllo.dll failed! \n");
-		FreeLibrary(static_cast<HINSTANCE>(OpenclDll));
-		return OCLERR;
-	}
-	fprintf(stderr, " Load opencllo.dll successfully!\n");
-	return OCLSUCCESS;
+    //fprintf(stderr, " LoadOpenclDllxx... \n");
+    OpenclDll = static_cast<HINSTANCE>(HOpenclDll);
+    OpenclDll = LoadLibrary(OPENCL_DLL_NAME);
+    if (!static_cast<HINSTANCE>(OpenclDll))
+    {
+        fprintf(stderr, " Load opencllo.dll failed! \n");
+        FreeLibrary(static_cast<HINSTANCE>(OpenclDll));
+        return OCLERR;
+    }
+    fprintf(stderr, " Load opencllo.dll successfully!\n");
+    return OCLSUCCESS;
 }
 
 void OpenclDevice::FreeOpenclDll()
 {
-	fprintf(stderr, " Free opencllo.dll ... \n");
-	if(!static_cast<HINSTANCE>(OpenclDll))
-		FreeLibrary(static_cast<HINSTANCE>(OpenclDll));
+    fprintf(stderr, " Free opencllo.dll ... \n");
+    if(!static_cast<HINSTANCE>(OpenclDll))
+        FreeLibrary(static_cast<HINSTANCE>(OpenclDll));
 }
 #endif
 
 int OpenclDevice::InitEnv()
 {
 #ifdef SAL_WIN32
-	while(1)
+    while(1)
     {
-	    if(1==LoadOpencl())
-			break;
-	}
+        if(1==LoadOpencl())
+        break;
+    }
 #endif
-	InitOpenclRunEnv(0,NULL);
-	return 1;
+    InitOpenclRunEnv(0,NULL);
+    return 1;
 }
 
 int OpenclDevice::ReleaseOpenclRunEnv() {
-	ReleaseOpenclEnv(&gpuEnv);
+    ReleaseOpenclEnv(&gpuEnv);
 #ifdef SAL_WIN32
-	FreeOpenclDll();
+    FreeOpenclDll();
 #endif
     return 1;
 }
 ///////////////////////////////////////////////////////
 ///////////////////////////////////////////////////////
 inline int OpenclDevice::AddKernelConfig(int kCount, const char *kName) {
-    strcpy(gpuEnv.kernelNames[kCount], kName);
-    gpuEnv.kernelCount++;
+    strcpy(gpuEnv.mArrykernelNames[kCount], kName);
+    gpuEnv.mnKernelCount++;
     return 0;
 }
 
 int OpenclDevice::RegistOpenclKernel() {
-    if (!gpuEnv.isUserCreated) {
+    if (!gpuEnv.mnIsUserCreated) {
         memset(&gpuEnv, 0, sizeof(gpuEnv));
     }
 
-    gpuEnv.fileCount = 0; //argc;
-    gpuEnv.kernelCount = 0UL;
+    gpuEnv.mnFileCount = 0; //argc;
+    gpuEnv.mnKernelCount = 0UL;
 
     AddKernelConfig(0, (const char*) "hello");
     AddKernelConfig(1, (const char*) "oclformula");
@@ -99,34 +100,39 @@ int OpenclDevice::RegistOpenclKernel() {
     AddKernelConfig(7, (const char*) "oclFormulaSumproduct");
     AddKernelConfig(8, (const char*) "oclFormulaMinverse");
 
-    AddKernelConfig(9,  (const char*) "oclSignedAdd");
+    AddKernelConfig(9,    (const char*) "oclSignedAdd");
     AddKernelConfig(10, (const char*) "oclSignedSub");
     AddKernelConfig(11, (const char*) "oclSignedMul");
     AddKernelConfig(12, (const char*) "oclSignedDiv");
-	return 0;
+    return 0;
 }
 OpenclDevice::OpenclDevice(){
-	//InitEnv();
+    //InitEnv();
 }
 
 OpenclDevice::~OpenclDevice() {
-	//ReleaseOpenclRunEnv();
+    //ReleaseOpenclRunEnv();
 }
 
+int OpenclDevice::SetKernelEnv(KernelEnv *envInfo)
+{
+    envInfo->mpkContext  = gpuEnv.mpContext;
+    envInfo->mpkCmdQueue = gpuEnv.mpCmdQueue;
+    envInfo->mpkProgram  = gpuEnv.mpArryPrograms[0];
+
+    return 1;
+}
 int OpenclDevice::CheckKernelName(KernelEnv *envInfo,const char *kernelName){
     //printf("CheckKernelName,total count of kernels...%d\n", gpuEnv.kernelCount);
     int kCount;
-    for(kCount=0; kCount < gpuEnv.kernelCount; kCount++) {
-        if(strcasecmp(kernelName, gpuEnv.kernelNames[kCount]) == 0) {
-	    printf("match  %s kernel right\n",kernelName);
-	    break;
+    for(kCount=0; kCount < gpuEnv.mnKernelCount; kCount++) {
+        if(strcasecmp(kernelName, gpuEnv.mArrykernelNames[kCount]) == 0) {
+        printf("match  %s kernel right\n",kernelName);
+        break;
         }
     }
-    envInfo->context      = gpuEnv.context;
-    envInfo->commandQueue = gpuEnv.commandQueue;
-    envInfo->program      = gpuEnv.programs[0];
-    envInfo->kernel       = gpuEnv.kernels[kCount];
-    strcpy(envInfo->kernelName, kernelName);
+    envInfo->mpkKernel = gpuEnv.mpArryKernels[kCount];
+    strcpy(envInfo->mckKernelName, kernelName);
     if (envInfo == (KernelEnv *) NULL)
     {
         printf("get err func and env\n");
@@ -145,7 +151,7 @@ int OpenclDevice::ConvertToString(const char *filename, char **source) {
     printf("open kernel file %s.\n",filename);
 
     if (file != NULL) {
-		printf("Open ok!\n");
+        printf("Open ok!\n");
         fseek(file, 0, SEEK_END);
 
         file_size = ftell(file);
@@ -169,35 +175,35 @@ int OpenclDevice::ConvertToString(const char *filename, char **source) {
 }
 
 int OpenclDevice::BinaryGenerated(const char * clFileName, FILE ** fhandle) {
-    unsigned int i = 0;
-	cl_int status;
-	char *str = NULL;
-	FILE *fd = NULL;
-	cl_uint numDevices=0;
-	status = clGetDeviceIDs(gpuEnv.platform, // platform
-                            CL_DEVICE_TYPE_ALL, // device_type
-							0, // num_entries
-							NULL, // devices
-							&numDevices);
-	for (i = 0; i <numDevices; i++) {
-		char fileName[256] = { 0 }, cl_name[128] = { 0 };
-		if (gpuEnv.devices[i] != 0) {
-			char deviceName[1024];
-			status = clGetDeviceInfo(gpuEnv.devices[i], CL_DEVICE_NAME,sizeof(deviceName), deviceName, NULL);
-			CHECK_OPENCL(status);
-			str = (char*) strstr(clFileName, (char*) ".cl");
-			memcpy(cl_name, clFileName, str - clFileName);
-			cl_name[str - clFileName] = '\0';
-			sprintf(fileName, "./%s-%s.bin", cl_name, deviceName);
-			fd = fopen(fileName, "rb");
-			status = (fd != NULL) ? 1 : 0;
-			}
-		}
-		if (fd != NULL) {
-			*fhandle = fd;
-			}
-
-		return status;
+        unsigned int i = 0;
+    cl_int status;
+    char *str = NULL;
+    FILE *fd = NULL;
+    cl_uint numDevices=0;
+    status = clGetDeviceIDs(gpuEnv.mpPlatformID, // platform
+                            CL_DEVICE_TYPE_GPU, // device_type
+                            0, // num_entries
+                            NULL, // devices ID
+                            &numDevices);
+    for (i = 0; i <numDevices; i++) {
+        char fileName[256] = { 0 }, cl_name[128] = { 0 };
+        if (gpuEnv.mpArryDevsID[i] != 0) {
+            char deviceName[1024];
+            status = clGetDeviceInfo(gpuEnv.mpArryDevsID[i], CL_DEVICE_NAME,sizeof(deviceName), deviceName, NULL);
+            CHECK_OPENCL(status);
+            str = (char*) strstr(clFileName, (char*) ".cl");
+            memcpy(cl_name, clFileName, str - clFileName);
+            cl_name[str - clFileName] = '\0';
+            sprintf(fileName, "./%s-%s.bin", cl_name, deviceName);
+            fd = fopen(fileName, "rb");
+            status = (fd != NULL) ? 1 : 0;
+            }
+        }
+        if (fd != NULL) {
+            *fhandle = fd;
+            }
+
+        return status;
 
 }
 
@@ -220,22 +226,21 @@ int OpenclDevice::GeneratBinFromKernelSource(cl_program program,
         const char * clFileName) {
      unsigned int i = 0;
     cl_int status;
-    size_t *binarySizes;
-    cl_uint numDevices;
-    cl_device_id *devices;
+    size_t *binarySizes, numDevices;
+    cl_device_id *mpArryDevsID;
     char **binaries, *str = NULL;
 
     status = clGetProgramInfo(program, CL_PROGRAM_NUM_DEVICES,
             sizeof(numDevices), &numDevices, NULL);
     CHECK_OPENCL(status)
 
-    devices = (cl_device_id*) malloc(sizeof(cl_device_id) * numDevices);
-    if (devices == NULL) {
+    mpArryDevsID = (cl_device_id*) malloc(sizeof(cl_device_id) * numDevices);
+    if (mpArryDevsID == NULL) {
         return 0;
     }
     /* grab the handles to all of the devices in the program. */
     status = clGetProgramInfo(program, CL_PROGRAM_DEVICES,
-            sizeof(cl_device_id) * numDevices, devices, NULL);
+            sizeof(cl_device_id) * numDevices, mpArryDevsID, NULL);
     CHECK_OPENCL(status)
 
     /* figure out the sizes of each of the binaries. */
@@ -272,7 +277,7 @@ int OpenclDevice::GeneratBinFromKernelSource(cl_program program,
 
         if (binarySizes[i] != 0) {
             char deviceName[1024];
-            status = clGetDeviceInfo(devices[i], CL_DEVICE_NAME,
+            status = clGetDeviceInfo(mpArryDevsID[i], CL_DEVICE_NAME,
                     sizeof(deviceName), deviceName, NULL);
             CHECK_OPENCL(status)
 
@@ -307,24 +312,24 @@ int OpenclDevice::GeneratBinFromKernelSource(cl_program program,
         binarySizes = NULL;
     }
 
-    if (devices != NULL) {
-        free(devices);
-        devices = NULL;
+    if (mpArryDevsID != NULL) {
+        free(mpArryDevsID);
+        mpArryDevsID = NULL;
     }
     return 1;
 }
 
 int OpenclDevice::InitOpenclAttr(OpenCLEnv * env) {
-    if (gpuEnv.isUserCreated) {
+    if (gpuEnv.mnIsUserCreated) {
         return 1;
     }
 
-    gpuEnv.context = env->context;
-    gpuEnv.platform = env->platform;
-    gpuEnv.dev = env->devices;
-    gpuEnv.commandQueue = env->commandQueue;
+    gpuEnv.mpContext    = env->mpOclContext;
+    gpuEnv.mpPlatformID = env->mpOclPlatformID;
+    gpuEnv.mpDevID        = env->mpOclDevsID;
+    gpuEnv.mpCmdQueue    = env->mpOclCmdQueue;
 
-    gpuEnv.isUserCreated = 1;
+    gpuEnv.mnIsUserCreated = 1;
 
     return 0;
 }
@@ -332,14 +337,14 @@ int OpenclDevice::InitOpenclAttr(OpenCLEnv * env) {
 int OpenclDevice::CreateKernel(char * kernelname, KernelEnv * env) {
     int status;
 
-    env->kernel = clCreateKernel(gpuEnv.programs[0], kernelname, &status);
-    env->context = gpuEnv.context;
-    env->commandQueue = gpuEnv.commandQueue;
+    env->mpkKernel     = clCreateKernel(gpuEnv.mpArryPrograms[0], kernelname, &status);
+    env->mpkContext  = gpuEnv.mpContext;
+    env->mpkCmdQueue = gpuEnv.mpCmdQueue;
     return status != CL_SUCCESS ? 1 : 0;
 }
 
 int OpenclDevice::ReleaseKernel(KernelEnv * env) {
-    int status = clReleaseKernel(env->kernel);
+    int status = clReleaseKernel(env->mpkKernel);
     return status != CL_SUCCESS ? 1 : 0;
 }
 
@@ -351,24 +356,24 @@ int OpenclDevice::ReleaseOpenclEnv(GPUEnv *gpuInfo) {
         return 1;
     }
 
-    for (i = 0; i < gpuEnv.fileCount; i++) {
-        if (gpuEnv.programs[i]) {
-            status = clReleaseProgram(gpuEnv.programs[i]);
+    for (i = 0; i < gpuEnv.mnFileCount; i++) {
+        if (gpuEnv.mpArryPrograms[i]) {
+            status = clReleaseProgram(gpuEnv.mpArryPrograms[i]);
             CHECK_OPENCL(status)
-            gpuEnv.programs[i] = NULL;
+            gpuEnv.mpArryPrograms[i] = NULL;
         }
     }
-    if (gpuEnv.commandQueue) {
-        clReleaseCommandQueue(gpuEnv.commandQueue);
-        gpuEnv.commandQueue = NULL;
+    if (gpuEnv.mpCmdQueue) {
+        clReleaseCommandQueue(gpuEnv.mpCmdQueue);
+        gpuEnv.mpCmdQueue = NULL;
     }
-    if (gpuEnv.context) {
-        clReleaseContext(gpuEnv.context);
-        gpuEnv.context = NULL;
+    if (gpuEnv.mpContext) {
+        clReleaseContext(gpuEnv.mpContext);
+        gpuEnv.mpContext = NULL;
     }
     isInited = 0;
-    gpuInfo->isUserCreated = 0;
-    free(gpuInfo->devices);
+    gpuInfo->mnIsUserCreated = 0;
+    free(gpuInfo->mpArryDevsID);
     return 1;
 }
 
@@ -386,9 +391,9 @@ int OpenclDevice::RunKernelWrapper(cl_kernel_function function,
 int OpenclDevice::CachedOfKernerPrg(const GPUEnv *gpuEnvCached,
         const char * clFileName) {
   int i;
-    for (i = 0; i < gpuEnvCached->fileCount; i++) {
-        if (strcasecmp(gpuEnvCached->kernelSrcFile[i], clFileName) == 0) {
-            if (gpuEnvCached->programs[i] != NULL) {
+    for (i = 0; i < gpuEnvCached->mnFileCount; i++) {
+        if (strcasecmp(gpuEnvCached->mArryKnelSrcFile[i], clFileName) == 0) {
+            if (gpuEnvCached->mpArryPrograms[i] != NULL) {
                 return 1;
             }
         }
@@ -404,31 +409,28 @@ int OpenclDevice::CompileKernelFile(GPUEnv *gpuInfo, const char *buildOption) {
     const char *source;
     size_t source_size[1];
     int b_error, binary_status, binaryExisted, idx;
-    cl_uint numDevices;
-    cl_device_id *devices;
+    size_t numDevices;
+    cl_device_id *mpArryDevsID;
     FILE *fd, *fd1;
     const char* filename = "kernel.cl";
-	fprintf(stderr, "CompileKernelFile ... \n");
+    fprintf(stderr, "CompileKernelFile ... \n");
     if (CachedOfKernerPrg(gpuInfo, filename) == 1) {
         return 1;
     }
 
-    idx = gpuInfo->fileCount;
+    idx = gpuInfo->mnFileCount;
 
     source = kernel_src;
 
     source_size[0] = strlen(source);
     binaryExisted = 0;
     if ((binaryExisted = BinaryGenerated(filename, &fd)) == 1) {
-#ifdef CL_CONTEXT_NUM_DEVICES
-        status = clGetContextInfo(gpuInfo->context, CL_CONTEXT_NUM_DEVICES,
+        status = clGetContextInfo(gpuInfo->mpContext, CL_CONTEXT_NUM_DEVICES,
                 sizeof(numDevices), &numDevices, NULL);
         CHECK_OPENCL(status)
-#else
-        numDevices = 1; // ???
-#endif
-        devices = (cl_device_id*) malloc(sizeof(cl_device_id) * numDevices);
-        if (devices == NULL) {
+
+        mpArryDevsID = (cl_device_id*) malloc(sizeof(cl_device_id) * numDevices);
+        if (mpArryDevsID == NULL) {
             return 0;
         }
 
@@ -452,50 +454,50 @@ int OpenclDevice::CompileKernelFile(GPUEnv *gpuInfo, const char *buildOption) {
         fclose(fd);
         fd = NULL;
         // grab the handles to all of the devices in the context.
-        status = clGetContextInfo(gpuInfo->context, CL_CONTEXT_DEVICES,
-                sizeof(cl_device_id) * numDevices, devices, NULL);
+        status = clGetContextInfo(gpuInfo->mpContext, CL_CONTEXT_DEVICES,
+                sizeof(cl_device_id) * numDevices, mpArryDevsID, NULL);
         CHECK_OPENCL(status)
 
-        gpuInfo->programs[idx] = clCreateProgramWithBinary(gpuInfo->context,
-                numDevices, devices, &length, (const unsigned char**) &binary,
+        gpuInfo->mpArryPrograms[idx] = clCreateProgramWithBinary(gpuInfo->mpContext,
+                numDevices, mpArryDevsID, &length, (const unsigned char**) &binary,
                 &binary_status, &status);
         CHECK_OPENCL(status)
 
         free(binary);
-        free(devices);
-        devices = NULL;
+        free(mpArryDevsID);
+        mpArryDevsID = NULL;
     } else {
         // create a CL program using the kernel source
-        gpuEnv.programs[idx] = clCreateProgramWithSource(gpuEnv.context,
+        gpuEnv.mpArryPrograms[idx] = clCreateProgramWithSource(gpuEnv.mpContext,
                 1, &source, source_size, &status);
         CHECK_OPENCL(status);
     }
 
-    if (gpuInfo->programs[idx] == (cl_program) NULL) {
+    if (gpuInfo->mpArryPrograms[idx] == (cl_program) NULL) {
         return 0;
     }
 
     //char options[512];
     // create a cl program executable for all the devices specified
-    if (!gpuInfo->isUserCreated) {
-        status = clBuildProgram(gpuInfo->programs[idx], 1, gpuInfo->devices,
+    if (!gpuInfo->mnIsUserCreated) {
+        status = clBuildProgram(gpuInfo->mpArryPrograms[idx], 1, gpuInfo->mpArryDevsID,
                 buildOption, NULL, NULL);
         CHECK_OPENCL(status)
     } else {
-        status = clBuildProgram(gpuInfo->programs[idx], 1, &(gpuInfo->dev),
+        status = clBuildProgram(gpuInfo->mpArryPrograms[idx], 1, &(gpuInfo->mpDevID),
                 buildOption, NULL, NULL);
         CHECK_OPENCL(status)
     }
     printf("BuildProgram.\n");
 
     if (status != CL_SUCCESS) {
-        if (!gpuInfo->isUserCreated) {
-            status = clGetProgramBuildInfo(gpuInfo->programs[idx],
-                    gpuInfo->devices[0], CL_PROGRAM_BUILD_LOG, 0, NULL,
+        if (!gpuInfo->mnIsUserCreated) {
+            status = clGetProgramBuildInfo(gpuInfo->mpArryPrograms[idx],
+                    gpuInfo->mpArryDevsID[0], CL_PROGRAM_BUILD_LOG, 0, NULL,
                     &length);
         } else {
-            status = clGetProgramBuildInfo(gpuInfo->programs[idx],
-                    gpuInfo->dev, CL_PROGRAM_BUILD_LOG, 0, NULL, &length);
+            status = clGetProgramBuildInfo(gpuInfo->mpArryPrograms[idx],
+                    gpuInfo->mpDevID, CL_PROGRAM_BUILD_LOG, 0, NULL, &length);
         }
         if (status != CL_SUCCESS) {
             printf("opencl create build log fail\n");
@@ -505,13 +507,13 @@ int OpenclDevice::CompileKernelFile(GPUEnv *gpuInfo, const char *buildOption) {
         if (buildLog == (char*) NULL) {
             return 0;
         }
-        if (!gpuInfo->isUserCreated) {
-            status = clGetProgramBuildInfo(gpuInfo->programs[idx],
-                    gpuInfo->devices[0], CL_PROGRAM_BUILD_LOG, length,
+        if (!gpuInfo->mnIsUserCreated) {
+            status = clGetProgramBuildInfo(gpuInfo->mpArryPrograms[idx],
+                    gpuInfo->mpArryDevsID[0], CL_PROGRAM_BUILD_LOG, length,
                     buildLog, &length);
         } else {
-            status = clGetProgramBuildInfo(gpuInfo->programs[idx],
-                    gpuInfo->dev, CL_PROGRAM_BUILD_LOG, length, buildLog,
+            status = clGetProgramBuildInfo(gpuInfo->mpArryPrograms[idx],
+                    gpuInfo->mpDevID, CL_PROGRAM_BUILD_LOG, length, buildLog,
                     &length);
         }
 
@@ -525,12 +527,12 @@ int OpenclDevice::CompileKernelFile(GPUEnv *gpuInfo, const char *buildOption) {
         return 0;
     }
 
-    strcpy(gpuEnv.kernelSrcFile[idx], filename);
+    strcpy(gpuEnv.mArryKnelSrcFile[idx], filename);
 
     if (binaryExisted == 0)
-        GeneratBinFromKernelSource(gpuEnv.programs[idx], filename);
+        GeneratBinFromKernelSource(gpuEnv.mpArryPrograms[idx], filename);
 
-    gpuInfo->fileCount += 1;
+    gpuInfo->mnFileCount += 1;
 
     return 1;
 
@@ -539,14 +541,14 @@ int OpenclDevice::CompileKernelFile(GPUEnv *gpuInfo, const char *buildOption) {
 int OpenclDevice::GetKernelEnvAndFunc(const char *kernelName,
         KernelEnv *env, cl_kernel_function *function) {
     int i; //,program_idx ;
-    printf("----------------OpenclDevice::GetKernelEnvAndFunc\n");
-    for (i = 0; i < gpuEnv.kernelCount; i++) {
-        if (strcasecmp(kernelName, gpuEnv.kernelNames[i]) == 0) {
-            env->context = gpuEnv.context;
-            env->commandQueue = gpuEnv.commandQueue;
-            env->program = gpuEnv.programs[0];
-            env->kernel = gpuEnv.kernels[i];
-            *function = gpuEnv.kernelFunctions[i];
+    //printf("----------------OpenclDevice::GetKernelEnvAndFunc\n");
+    for (i = 0; i < gpuEnv.mnKernelCount; i++) {
+        if (strcasecmp(kernelName, gpuEnv.mArrykernelNames[i]) == 0) {
+            env->mpkContext = gpuEnv.mpContext;
+            env->mpkCmdQueue = gpuEnv.mpCmdQueue;
+            env->mpkProgram = gpuEnv.mpArryPrograms[0];
+            env->mpkKernel = gpuEnv.mpArryKernels[i];
+            *function = gpuEnv.mpArryKnelFuncs[i];
             return 1;
         }
     }
@@ -554,21 +556,21 @@ int OpenclDevice::GetKernelEnvAndFunc(const char *kernelName,
 }
 
 int OpenclDevice::RunKernel(const char *kernelName, void **userdata) {
-    KernelEnv env;
+    KernelEnv kEnv;
 
     cl_kernel_function function;
 
     int status;
 
-    memset(&env, 0, sizeof(KernelEnv));
-    status = GetKernelEnvAndFunc(kernelName, &env, &function);
-    strcpy(env.kernelName, kernelName);
+    memset(&kEnv, 0, sizeof(KernelEnv));
+    status = GetKernelEnvAndFunc(kernelName, &kEnv, &function);
+    strcpy(kEnv.mckKernelName, kernelName);
     if (status == 1) {
-        if (&env == (KernelEnv *) NULL
+        if (&kEnv == (KernelEnv *) NULL
                 || &function == (cl_kernel_function *) NULL) {
             return 0;
         }
-        return (function(userdata, &env));
+        return (function(userdata, &kEnv));
     }
     return 0;
 }
@@ -593,7 +595,7 @@ int OpenclDevice::InitOpenclRunEnv(int argc, const char *buildOptionKernelfiles)
         printf("init_opencl_env successed.\n");
         //initialize program, kernelName, kernelCount
         status = CompileKernelFile( &gpuEnv, buildOptionKernelfiles);
-        if (status == 0 || gpuEnv.kernelCount == 0) {
+        if (status == 0 || gpuEnv.mnKernelCount == 0) {
             printf("CompileKernelFile failed.\n");
             return 1;
         }
@@ -615,12 +617,12 @@ int OpenclDevice::InitOpenclRunEnv(GPUEnv *gpuInfo)
 
     // Have a look at the available platforms.
 
-    if (!gpuInfo->isUserCreated) {
+    if (!gpuInfo->mnIsUserCreated) {
         status = clGetPlatformIDs(0, NULL, &numPlatforms);
         if (status != CL_SUCCESS) {
             return 1;
         }
-        gpuInfo->platform = NULL;
+        gpuInfo->mpPlatformID = NULL;
 
         if (0 < numPlatforms) {
             platforms = (cl_platform_id*) malloc(
@@ -641,18 +643,18 @@ int OpenclDevice::InitOpenclRunEnv(GPUEnv *gpuInfo)
                 if (status != CL_SUCCESS) {
                     return 1;
                 }
-                gpuInfo->platform = platforms[i];
+                gpuInfo->mpPlatformID = platforms[i];
 
                 //if (!strcmp(platformName, "Intel(R) Coporation"))
                 //if( !strcmp( platformName, "Advanced Micro Devices, Inc." ))
                 {
-                    gpuInfo->platform = platforms[i];
+                    gpuInfo->mpPlatformID = platforms[i];
 
-                    status = clGetDeviceIDs(gpuInfo->platform, // platform
-                                                CL_DEVICE_TYPE_ALL, // device_type
-												0, // num_entries
-												NULL, // devices
-												&numDevices);
+                    status = clGetDeviceIDs(gpuInfo->mpPlatformID, // platform
+                                            CL_DEVICE_TYPE_GPU,    // device_type
+                                            0,                       // num_entries
+                                            NULL,                   // devices
+                                            &numDevices);
 
                     if (status != CL_SUCCESS) {
                         return 1;
@@ -665,82 +667,82 @@ int OpenclDevice::InitOpenclRunEnv(GPUEnv *gpuInfo)
             }
             free(platforms);
         }
-        if (NULL == gpuInfo->platform) {
+        if (NULL == gpuInfo->mpPlatformID) {
             return 1;
         }
 
         // Use available platform.
 
         cps[0] = CL_CONTEXT_PLATFORM;
-        cps[1] = (cl_context_properties) gpuInfo->platform;
+        cps[1] = (cl_context_properties) gpuInfo->mpPlatformID;
         cps[2] = 0;
         // Check for GPU.
-        gpuInfo->dType = CL_DEVICE_TYPE_GPU;
-        gpuInfo->context = clCreateContextFromType(cps, gpuInfo->dType, NULL,
+        gpuInfo->mDevType = CL_DEVICE_TYPE_GPU;
+        gpuInfo->mpContext = clCreateContextFromType(cps, gpuInfo->mDevType, NULL,
                 NULL, &status);
 
-        // If no GPU, check for CPU.
-        if ((gpuInfo->context == (cl_context) NULL)
+        if ((gpuInfo->mpContext == (cl_context) NULL)
                 || (status != CL_SUCCESS)) {
-            gpuInfo->dType = CL_DEVICE_TYPE_CPU;
-            gpuInfo->context = clCreateContextFromType(cps, gpuInfo->dType,
+            gpuInfo->mDevType = CL_DEVICE_TYPE_CPU;
+            gpuInfo->mpContext = clCreateContextFromType(cps, gpuInfo->mDevType,
                     NULL, NULL, &status);
         }
-
-        // If no GPU or CPU, check for a "default" type.
-        if ((gpuInfo->context == (cl_context) NULL)
+        if ((gpuInfo->mpContext == (cl_context) NULL)
                 || (status != CL_SUCCESS)) {
-            gpuInfo->dType = CL_DEVICE_TYPE_DEFAULT;
-            gpuInfo->context = clCreateContextFromType(cps, gpuInfo->dType,
+            gpuInfo->mDevType = CL_DEVICE_TYPE_DEFAULT;
+            gpuInfo->mpContext = clCreateContextFromType(cps, gpuInfo->mDevType,
                     NULL, NULL, &status);
         }
-        if ((gpuInfo->context == (cl_context) NULL)
+        if ((gpuInfo->mpContext == (cl_context) NULL)
                 || (status != CL_SUCCESS)) {
             return 1;
         }
         // Detect OpenCL devices.
         // First, get the size of device list data
-        status = clGetContextInfo(gpuInfo->context, CL_CONTEXT_DEVICES, 0,
+        status = clGetContextInfo(gpuInfo->mpContext, CL_CONTEXT_DEVICES, 0,
                 NULL, &length);
         if ((status != CL_SUCCESS) || (length == 0)) {
             return 1;
         }
         // Now allocate memory for device list based on the size we got earlier
-        gpuInfo->devices = (cl_device_id*) malloc(length);
-        if (gpuInfo->devices == (cl_device_id*) NULL) {
+        gpuInfo->mpArryDevsID = (cl_device_id*) malloc(length);
+        if (gpuInfo->mpArryDevsID == (cl_device_id*) NULL) {
             return 1;
         }
         // Now, get the device list data
-        status = clGetContextInfo(gpuInfo->context, CL_CONTEXT_DEVICES, length,
-                gpuInfo->devices, NULL);
+        status = clGetContextInfo(gpuInfo->mpContext, CL_CONTEXT_DEVICES, length,
+                gpuInfo->mpArryDevsID, NULL);
         if (status != CL_SUCCESS) {
             return 1;
         }
 
         // Create OpenCL command queue.
-        gpuInfo->commandQueue = clCreateCommandQueue(gpuInfo->context,
-                gpuInfo->devices[0], 0, &status);
+        gpuInfo->mpCmdQueue = clCreateCommandQueue(gpuInfo->mpContext,
+                gpuInfo->mpArryDevsID[0], 0, &status);
 
         if (status != CL_SUCCESS) {
             return 1;
         }
     }
 
+    status = clGetCommandQueueInfo(gpuInfo->mpCmdQueue,
+            CL_QUEUE_THREAD_HANDLE_AMD, 0, NULL, NULL);
+
     return 0;
 
 }
 int OpenclDevice::RegisterKernelWrapper(const char *kernelName,cl_kernel_function function)
 {
-	int i;
-	printf("oclwrapper:RegisterKernelWrapper...%d\n", gpuEnv.kernelCount);
-	for (i = 0; i < gpuEnv.kernelCount; i++)
-	{
-		if (strcasecmp(kernelName, gpuEnv.kernelNames[i]) == 0)
-		{
-			gpuEnv.kernelFunctions[i] = function;
-			return 1;
-		}
-	}
+    int i;
+    //printf("oclwrapper:RegisterKernelWrapper...%d\n", gpuEnv.mnKernelCount);
+    for (i = 0; i < gpuEnv.mnKernelCount; i++)
+    {
+        if (strcasecmp(kernelName, gpuEnv.mArrykernelNames[i]) == 0)
+        {
+            gpuEnv.mpArryKnelFuncs[i] = function;
+            return 1;
+        }
+    }
     return 0;
 }
 
@@ -772,20 +774,20 @@ int OclFormulax(void ** usrdata, KernelEnv *env) {
         tdata[i] = (float) data[i];
     }
 
-    env->kernel = clCreateKernel(env->program, "oclformula", &clStatus);
+    env->mpkKernel = clCreateKernel(env->mpkProgram, "oclformula", &clStatus);
     //printf("ScInterpreter::IterateParameters...after clCreateKernel.\n");
     //fprintf(stderr, "\nIn OpenclDevice,...after clCreateKernel\n");
     int size = NUM;
 
-    cl_mem formula_data = clCreateBuffer(env->context,
+    cl_mem formula_data = clCreateBuffer(env->mpkContext,
             (cl_mem_flags) (CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR),
             size * sizeof(float), (void *) tdata, &clStatus);
     //fprintf(stderr, "\nIn OpenclDevice,...after clCreateBuffer\n");
 
-    status = clSetKernelArg(env->kernel, 0, sizeof(cl_mem),
+    status = clSetKernelArg(env->mpkKernel, 0, sizeof(cl_mem),
             (void *) &formula_data);
     CHECK_OPENCL(status)
-    status = clSetKernelArg(env->kernel, 1, sizeof(unsigned int),
+    status = clSetKernelArg(env->mpkKernel, 1, sizeof(unsigned int),
             (void *) &type);
     CHECK_OPENCL(status)
 
@@ -795,21 +797,21 @@ int OclFormulax(void ** usrdata, KernelEnv *env) {
 
     while (global_work_size[0] != 1) {
         global_work_size[0] = global_work_size[0] / 2;
-        status = clEnqueueNDRangeKernel(env->commandQueue, env->kernel, 1,
+        status = clEnqueueNDRangeKernel(env->mpkCmdQueue, env->mpkKernel, 1,
                 NULL, global_work_size, NULL, 0, NULL, NULL);
         CHECK_OPENCL(status)
 
     }
     //fprintf(stderr, "\nIn OpenclDevice,...before clEnqueueReadBuffer\n");
-    status = clEnqueueReadBuffer(env->commandQueue, formula_data, CL_FALSE, 0,
+    status = clEnqueueReadBuffer(env->mpkCmdQueue, formula_data, CL_FALSE, 0,
             sizeof(float), (void *) &tdata, 0, NULL, NULL);
     CHECK_OPENCL(status)
-    status = clFinish(env->commandQueue);
+    status = clFinish(env->mpkCmdQueue);
     CHECK_OPENCL(status)
 
     //PPAStopCpuEvent(ppa_proc);
     //fprintf(stderr, "\nIn OpenclDevice,...before clReleaseKernel\n");
-    status = clReleaseKernel(env->kernel);
+    status = clReleaseKernel(env->mpkKernel);
     CHECK_OPENCL(status)
     status = clReleaseMemObject(formula_data);
     CHECK_OPENCL(status)
@@ -840,20 +842,20 @@ int OclFormulaxDll(void ** usrdata, KernelEnv *env) {
         tdata[i] = (float) data[i];
     }
 
-    env->kernel = clCreateKernel(env->program, "oclformula", &clStatus);
+    env->mpkKernel = clCreateKernel(env->mpkProgram, "oclformula", &clStatus);
     //printf("ScInterpreter::IterateParameters...after clCreateKernel.\n");
     //fprintf(stderr, "\nIn OpenclDevice,...after clCreateKernel\n");
     int size = NUM;
 
-    cl_mem formula_data = clCreateBuffer(env->context,
+    cl_mem formula_data = clCreateBuffer(env->mpkContext,
             (cl_mem_flags) (CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR),
             size * sizeof(float), (void *) tdata, &clStatus);
     //fprintf(stderr, "\nIn OpenclDevice,...after clCreateBuffer\n");
 
-    status = clSetKernelArg(env->kernel, 0, sizeof(cl_mem),
+    status = clSetKernelArg(env->mpkKernel, 0, sizeof(cl_mem),
             (void *) &formula_data);
     CHECK_OPENCL(status)
-    status = clSetKernelArg(env->kernel, 1, sizeof(unsigned int),
+    status = clSetKernelArg(env->mpkKernel, 1, sizeof(unsigned int),
             (void *) &type);
     CHECK_OPENCL(status)
 
@@ -863,21 +865,21 @@ int OclFormulaxDll(void ** usrdata, KernelEnv *env) {
 
     while (global_work_size[0] != 1) {
         global_work_size[0] = global_work_size[0] / 2;
-        status = clEnqueueNDRangeKernel(env->commandQueue, env->kernel, 1,
+        status = clEnqueueNDRangeKernel(env->mpkCmdQueue, env->mpkKernel, 1,
                 NULL, global_work_size, NULL, 0, NULL, NULL);
         CHECK_OPENCL(status)
 
     }
     //fprintf(stderr, "\nIn OpenclDevice,...before clEnqueueReadBuffer\n");
-    status = clEnqueueReadBuffer(env->commandQueue, formula_data, CL_FALSE, 0,
+    status = clEnqueueReadBuffer(env->mpkCmdQueue, formula_data, CL_FALSE, 0,
             sizeof(float), (void *) &tdata, 0, NULL, NULL);
     CHECK_OPENCL(status)
-    status = clFinish(env->commandQueue);
+    status = clFinish(env->mpkCmdQueue);
     CHECK_OPENCL(status)
 
     //PPAStopCpuEvent(ppa_proc);
     //fprintf(stderr, "\nIn OpenclDevice,...before clReleaseKernel\n");
-    status = clReleaseKernel(env->kernel);
+    status = clReleaseKernel(env->mpkKernel);
     CHECK_OPENCL(status)
     status = clReleaseMemObject(formula_data);
     CHECK_OPENCL(status)
@@ -894,13 +896,13 @@ int OclFormulaxDll(void ** usrdata, KernelEnv *env) {
 
 double OclCalc::OclProcess(cl_kernel_function function, double *data, formulax type)
 {
-	fprintf(stderr, "\n OpenclDevice, proc...begin\n");
-	double ret = 0;
-	void *usrdata[2];
-	usrdata[0] = (void *) data;
-	usrdata[1] = (void *) &type;
-	RunKernelWrapper(function, "oclformula", usrdata);
-	return ret;
+    fprintf(stderr, "\n OpenclDevice, proc...begin\n");
+    double ret = 0;
+    void *usrdata[2];
+    usrdata[0] = (void *) data;
+    usrdata[1] = (void *) &type;
+    RunKernelWrapper(function, "oclformula", usrdata);
+    return ret;
 }
 
 double OclCalc::OclTest() {
@@ -927,467 +929,1141 @@ double OclCalc::OclTestDll() {
 
 OclCalc::OclCalc()
 {
-    OpenclDevice::SetOpenclState(1);
-    fprintf(stderr,"OclCalc:: init opencl ok.\n");
+    fprintf(stderr,"OclCalc:: init opencl ...\n");
 }
 
 OclCalc::~OclCalc()
 {
-    OpenclDevice::SetOpenclState(0);
-    fprintf(stderr,"OclCalc:: opencl end ok.\n");
+    fprintf(stderr,"OclCalc:: opencl end ...\n");
 }
 
 /////////////////////////////////////////////////////////////////////////////
-int OclCalc::OclHostFormulaMax(double *srcData,int *start,int *end,double *output,int size) {
-	KernelEnv env;
-	const char *kernelName = "oclFormulaMax";
-	CheckKernelName(&env,kernelName);
-	cl_int clStatus;
-	size_t global_work_size[1];
-	int alignSize = size + end[0]-start[0];
-
-	env.kernel = clCreateKernel(env.program,kernelName, &clStatus);
-	cl_int ret=0;
-	cl_mem inputCl = clCreateBuffer(env.context,(cl_mem_flags) (CL_MEM_READ_WRITE),
-		alignSize * sizeof(float), NULL, &clStatus);
-	cl_mem startCl = clCreateBuffer(env.context, (cl_mem_flags) (CL_MEM_READ_WRITE),
-		size * sizeof(unsigned int), NULL, &ret);
-	cl_mem endCl = clCreateBuffer(env.context, (cl_mem_flags) (CL_MEM_READ_WRITE),
-		size * sizeof(unsigned int), NULL, &ret);
-	cl_mem outputCl = clCreateBuffer(env.context, (cl_mem_flags) (CL_MEM_READ_WRITE),
-	size* sizeof(float), NULL, &ret);
-
-	float * hostMapSrc = (float *)clEnqueueMapBuffer(env.commandQueue,inputCl,CL_TRUE,CL_MAP_WRITE,0,alignSize * sizeof(float),0,NULL,NULL,NULL);
-	int * hostMapStart = (int *)clEnqueueMapBuffer(env.commandQueue,startCl,CL_TRUE,CL_MAP_WRITE,0,size * sizeof(unsigned int),0,NULL,NULL,NULL);
-	int * hostMapEnd   = (int *)clEnqueueMapBuffer(env.commandQueue,endCl,CL_TRUE,CL_MAP_WRITE,0,size * sizeof(unsigned int),0,NULL,NULL,NULL);
-	for(int i=0;i<size;i++)
-	{
-		hostMapStart[i] = start[i];
-		hostMapEnd[i]	= end[i];
-	}
-	for(int i=0;i<alignSize;i++)
-		hostMapSrc[i] = (float)srcData[i];
-	clEnqueueUnmapMemObject(env.commandQueue,inputCl,hostMapSrc,0,NULL,NULL);
-	clEnqueueUnmapMemObject(env.commandQueue,startCl,hostMapStart,0,NULL,NULL);
-	clEnqueueUnmapMemObject(env.commandQueue,endCl,hostMapEnd,0,NULL,NULL);
-
-	clStatus = clSetKernelArg(env.kernel, 0, sizeof(cl_mem),
-		(void *)&inputCl);
-	clStatus = clSetKernelArg(env.kernel, 1, sizeof(cl_mem),
-		(void *)&startCl);
-	clStatus = clSetKernelArg(env.kernel, 2, sizeof(cl_mem),
-		(void *)&endCl);
-	clStatus = clSetKernelArg(env.kernel, 3, sizeof(cl_mem),
-		(void *)&outputCl);
-	CHECK_OPENCL(clStatus);
-
-	global_work_size[0] = size;
-	clStatus = clEnqueueNDRangeKernel(env.commandQueue, env.kernel, 1,
-		NULL, global_work_size, NULL, 0, NULL, NULL);
-	CHECK_OPENCL(clStatus)
-
-	float * outPutMap = (float *)clEnqueueMapBuffer(env.commandQueue,outputCl,CL_TRUE,CL_MAP_READ,0,size*sizeof(float),0,NULL,NULL,NULL);
-	for(int i=0;i<size;i++)
-		output[i]=outPutMap[i];
-
-	clEnqueueUnmapMemObject(env.commandQueue,outputCl,outPutMap,0,NULL,NULL);
-	clStatus = clFinish(env.commandQueue);
-
-	CHECK_OPENCL(clStatus);
-	clStatus = clReleaseKernel(env.kernel);
-	CHECK_OPENCL(clStatus);
-	clStatus = clReleaseMemObject(inputCl);
-	CHECK_OPENCL(clStatus);
-	clStatus = clReleaseMemObject(startCl);
-	CHECK_OPENCL(clStatus);
-	clStatus = clReleaseMemObject(endCl);
-	CHECK_OPENCL(clStatus);
-	clStatus = clReleaseMemObject(outputCl);
-	CHECK_OPENCL(clStatus);
-	return 0;
+#ifdef GPU_64BITS
+int OclCalc::OclHostFormulaMax(double *srcData,int *start,int *end,double *output,int size)
+{
+    KernelEnv kEnv;
+    const char *kernelName = "oclFormulaMax";
+    CheckKernelName(&kEnv,kernelName);
+    cl_int clStatus;
+    size_t global_work_size[1];
+    int alignSize = size + end[0]-start[0];
+
+    kEnv.mpkKernel = clCreateKernel(kEnv.mpkProgram,kernelName, &clStatus);
+    cl_int ret=0;
+    cl_mem inputCl = clCreateBuffer(kEnv.mpkContext,(cl_mem_flags) (CL_MEM_READ_WRITE),
+        alignSize * sizeof(float), NULL, &clStatus);
+    cl_mem startCl = clCreateBuffer(kEnv.mpkContext, (cl_mem_flags) (CL_MEM_READ_WRITE),
+        size * sizeof(unsigned int), NULL, &ret);
+    cl_mem endCl = clCreateBuffer(kEnv.mpkContext, (cl_mem_flags) (CL_MEM_READ_WRITE),
+        size * sizeof(unsigned int), NULL, &ret);
+    cl_mem outputCl = clCreateBuffer(kEnv.mpkContext, (cl_mem_flags) (CL_MEM_READ_WRITE),
+    size* sizeof(float), NULL, &ret);
+
+    float * hostMapSrc = (float *)clEnqueueMapBuffer(kEnv.mpkCmdQueue,inputCl,CL_TRUE,CL_MAP_WRITE,0,alignSize * sizeof(float),0,NULL,NULL,NULL);
+    int * hostMapStart = (int *)clEnqueueMapBuffer(kEnv.mpkCmdQueue,startCl,CL_TRUE,CL_MAP_WRITE,0,size * sizeof(unsigned int),0,NULL,NULL,NULL);
+    int * hostMapEnd   = (int *)clEnqueueMapBuffer(kEnv.mpkCmdQueue,endCl,CL_TRUE,CL_MAP_WRITE,0,size * sizeof(unsigned int),0,NULL,NULL,NULL);
+    for(int i=0;i<size;i++)
+    {
+        hostMapStart[i] = start[i];
+        hostMapEnd[i]    = end[i];
+    }
+    for(int i=0;i<alignSize;i++)
+        hostMapSrc[i] = (float)srcData[i];
+    clEnqueueUnmapMemObject(kEnv.mpkCmdQueue,inputCl,hostMapSrc,0,NULL,NULL);
+    clEnqueueUnmapMemObject(kEnv.mpkCmdQueue,startCl,hostMapStart,0,NULL,NULL);
+    clEnqueueUnmapMemObject(kEnv.mpkCmdQueue,endCl,hostMapEnd,0,NULL,NULL);
+
+    clStatus = clSetKernelArg(kEnv.mpkKernel, 0, sizeof(cl_mem),
+        (void *)&inputCl);
+    clStatus = clSetKernelArg(kEnv.mpkKernel, 1, sizeof(cl_mem),
+        (void *)&startCl);
+    clStatus = clSetKernelArg(kEnv.mpkKernel, 2, sizeof(cl_mem),
+        (void *)&endCl);
+    clStatus = clSetKernelArg(kEnv.mpkKernel, 3, sizeof(cl_mem),
+        (void *)&outputCl);
+    CHECK_OPENCL(clStatus);
+
+    global_work_size[0] = size;
+    clStatus = clEnqueueNDRangeKernel(kEnv.mpkCmdQueue, kEnv.mpkKernel, 1,
+        NULL, global_work_size, NULL, 0, NULL, NULL);
+    CHECK_OPENCL(clStatus);
+
+    float * outPutMap = (float *)clEnqueueMapBuffer(kEnv.mpkCmdQueue,outputCl,CL_TRUE,CL_MAP_READ,0,size*sizeof(float),0,NULL,NULL,NULL);
+    for(int i=0;i<size;i++)
+        output[i]=outPutMap[i];
+
+    clEnqueueUnmapMemObject(kEnv.mpkCmdQueue,outputCl,outPutMap,0,NULL,NULL);
+    clStatus = clFinish(kEnv.mpkCmdQueue);
+    CHECK_OPENCL(clStatus);
+    clStatus = clReleaseKernel(kEnv.mpkKernel);
+    CHECK_OPENCL(clStatus);
+    clStatus = clReleaseMemObject(inputCl);
+    CHECK_OPENCL(clStatus);
+    clStatus = clReleaseMemObject(startCl);
+    CHECK_OPENCL(clStatus);
+    clStatus = clReleaseMemObject(endCl);
+    CHECK_OPENCL(clStatus);
+    clStatus = clReleaseMemObject(outputCl);
+    CHECK_OPENCL(clStatus);
+    return 0;
 }
-int OclCalc::OclHostFormulaMin(double *srcData,int *start,int *end,double *output,int size) {
-	KernelEnv env;
-	const char *kernelName = "oclFormulaMin";
-	CheckKernelName(&env,kernelName);
-
-	cl_int clStatus;
-	size_t global_work_size[1];
-	int alignSize = size + end[0]-start[0];
-
-	env.kernel = clCreateKernel(env.program,kernelName, &clStatus);
-	cl_int ret=0;
-	cl_mem inputCl = clCreateBuffer(env.context,(cl_mem_flags) (CL_MEM_READ_WRITE),
-		alignSize * sizeof(float), NULL, &clStatus);
-	cl_mem startCl = clCreateBuffer(env.context, (cl_mem_flags) (CL_MEM_READ_WRITE),
-		size * sizeof(unsigned int), NULL, &ret);
-	cl_mem endCl = clCreateBuffer(env.context, (cl_mem_flags) (CL_MEM_READ_WRITE),
-		size * sizeof(unsigned int), NULL, &ret);
-	cl_mem outputCl = clCreateBuffer(env.context, (cl_mem_flags) (CL_MEM_READ_WRITE),
-	size* sizeof(float), NULL, &ret);
-
-	float * hostMapSrc = (float *)clEnqueueMapBuffer(env.commandQueue,inputCl,CL_TRUE,CL_MAP_WRITE,0,alignSize * sizeof(float),0,NULL,NULL,NULL);
-	int * hostMapStart = (int *)clEnqueueMapBuffer(env.commandQueue,startCl,CL_TRUE,CL_MAP_WRITE,0,size * sizeof(unsigned int),0,NULL,NULL,NULL);
-	int * hostMapEnd   = (int *)clEnqueueMapBuffer(env.commandQueue,endCl,CL_TRUE,CL_MAP_WRITE,0,size * sizeof(unsigned int),0,NULL,NULL,NULL);
-	for(int i=0;i<size;i++)
-	{
-		hostMapStart[i] = start[i];
-		hostMapEnd[i]	= end[i];
-	}
-	for(int i=0;i<alignSize;i++)
-		hostMapSrc[i] = (float)srcData[i];
-	clEnqueueUnmapMemObject(env.commandQueue,inputCl,hostMapSrc,0,NULL,NULL);
-	clEnqueueUnmapMemObject(env.commandQueue,startCl,hostMapStart,0,NULL,NULL);
-	clEnqueueUnmapMemObject(env.commandQueue,endCl,hostMapEnd,0,NULL,NULL);
-
-	clStatus = clSetKernelArg(env.kernel, 0, sizeof(cl_mem),
-		(void *)&inputCl);
-	clStatus = clSetKernelArg(env.kernel, 1, sizeof(cl_mem),
-		(void *)&startCl);
-	clStatus = clSetKernelArg(env.kernel, 2, sizeof(cl_mem),
-		(void *)&endCl);
-	clStatus = clSetKernelArg(env.kernel, 3, sizeof(cl_mem),
-		(void *)&outputCl);
-	CHECK_OPENCL(clStatus);
-
-	global_work_size[0] = size;
-	clStatus = clEnqueueNDRangeKernel(env.commandQueue, env.kernel, 1,
-		NULL, global_work_size, NULL, 0, NULL, NULL);
-	CHECK_OPENCL(clStatus)
-
-	float * outPutMap = (float *)clEnqueueMapBuffer(env.commandQueue,outputCl,CL_TRUE,CL_MAP_READ,0,size*sizeof(float),0,NULL,NULL,NULL);
-	for(int i=0;i<size;i++)
-		output[i]=outPutMap[i];
-
-	clEnqueueUnmapMemObject(env.commandQueue,outputCl,outPutMap,0,NULL,NULL);
-	clStatus = clFinish(env.commandQueue);
-
-	CHECK_OPENCL(clStatus);
-	clStatus = clReleaseKernel(env.kernel);
-	CHECK_OPENCL(clStatus);
-	clStatus = clReleaseMemObject(inputCl);
-	CHECK_OPENCL(clStatus);
-	clStatus = clReleaseMemObject(startCl);
-	CHECK_OPENCL(clStatus);
-	clStatus = clReleaseMemObject(endCl);
-	CHECK_OPENCL(clStatus);
-	clStatus = clReleaseMemObject(outputCl);
-	CHECK_OPENCL(clStatus);
-	return 0;
+int OclCalc::OclHostFormulaMin(double *srcData,int *start,int *end,double *output,int size)
+{
+    KernelEnv kEnv;
+    const char *kernelName = "oclFormulaMin";
+    CheckKernelName(&kEnv,kernelName);
+    cl_int clStatus;
+    size_t global_work_size[1];
+    int alignSize = size + end[0]-start[0];
+
+    kEnv.mpkKernel = clCreateKernel(kEnv.mpkProgram,kernelName, &clStatus);
+    cl_int ret=0;
+    cl_mem inputCl = clCreateBuffer(kEnv.mpkContext,(cl_mem_flags) (CL_MEM_READ_WRITE),
+        alignSize * sizeof(float), NULL, &clStatus);
+    cl_mem startCl = clCreateBuffer(kEnv.mpkContext, (cl_mem_flags) (CL_MEM_READ_WRITE),
+        size * sizeof(unsigned int), NULL, &ret);
+    cl_mem endCl = clCreateBuffer(kEnv.mpkContext, (cl_mem_flags) (CL_MEM_READ_WRITE),
+        size * sizeof(unsigned int), NULL, &ret);
+    cl_mem outputCl = clCreateBuffer(kEnv.mpkContext, (cl_mem_flags) (CL_MEM_READ_WRITE),
+    size* sizeof(float), NULL, &ret);
+
+    float * hostMapSrc = (float *)clEnqueueMapBuffer(kEnv.mpkCmdQueue,inputCl,CL_TRUE,CL_MAP_WRITE,0,alignSize * sizeof(float),0,NULL,NULL,NULL);
+    int * hostMapStart = (int *)clEnqueueMapBuffer(kEnv.mpkCmdQueue,startCl,CL_TRUE,CL_MAP_WRITE,0,size * sizeof(unsigned int),0,NULL,NULL,NULL);
+    int * hostMapEnd   = (int *)clEnqueueMapBuffer(kEnv.mpkCmdQueue,endCl,CL_TRUE,CL_MAP_WRITE,0,size * sizeof(unsigned int),0,NULL,NULL,NULL);
+    for(int i=0;i<size;i++)
+    {
+        hostMapStart[i] = start[i];
+        hostMapEnd[i]    = end[i];
+    }
+    for(int i=0;i<alignSize;i++)
+        hostMapSrc[i] = (float)srcData[i];
+    clEnqueueUnmapMemObject(kEnv.mpkCmdQueue,inputCl,hostMapSrc,0,NULL,NULL);
+    clEnqueueUnmapMemObject(kEnv.mpkCmdQueue,startCl,hostMapStart,0,NULL,NULL);
+    clEnqueueUnmapMemObject(kEnv.mpkCmdQueue,endCl,hostMapEnd,0,NULL,NULL);
+
+    clStatus = clSetKernelArg(kEnv.mpkKernel, 0, sizeof(cl_mem),
+        (void *)&inputCl);
+    clStatus = clSetKernelArg(kEnv.mpkKernel, 1, sizeof(cl_mem),
+        (void *)&startCl);
+    clStatus = clSetKernelArg(kEnv.mpkKernel, 2, sizeof(cl_mem),
+        (void *)&endCl);
+    clStatus = clSetKernelArg(kEnv.mpkKernel, 3, sizeof(cl_mem),
+        (void *)&outputCl);
+    CHECK_OPENCL(clStatus);
+
+    global_work_size[0] = size;
+    clStatus = clEnqueueNDRangeKernel(kEnv.mpkCmdQueue, kEnv.mpkKernel, 1,
+        NULL, global_work_size, NULL, 0, NULL, NULL);
+    CHECK_OPENCL(clStatus);
+
+    float * outPutMap = (float *)clEnqueueMapBuffer(kEnv.mpkCmdQueue,outputCl,CL_TRUE,CL_MAP_READ,0,size*sizeof(float),0,NULL,NULL,NULL);
+    for(int i=0;i<size;i++)
+        output[i]=outPutMap[i];
+
+    clEnqueueUnmapMemObject(kEnv.mpkCmdQueue,outputCl,outPutMap,0,NULL,NULL);
+    clStatus = clFinish(kEnv.mpkCmdQueue);
+    CHECK_OPENCL(clStatus);
+    clStatus = clReleaseKernel(kEnv.mpkKernel);
+    CHECK_OPENCL(clStatus);
+    clStatus = clReleaseMemObject(inputCl);
+    CHECK_OPENCL(clStatus);
+    clStatus = clReleaseMemObject(startCl);
+    CHECK_OPENCL(clStatus);
+    clStatus = clReleaseMemObject(endCl);
+    CHECK_OPENCL(clStatus);
+    clStatus = clReleaseMemObject(outputCl);
+    CHECK_OPENCL(clStatus);
+    return 0;
 }
-int OclCalc::OclHostFormulaAverage(double *srcData,int *start,int *end,double *output,int size) {
-	KernelEnv env;
-	const char *kernelName = "oclFormulaAverage";
-	CheckKernelName(&env,kernelName);
-
-	cl_int clStatus;
-	size_t global_work_size[1];
-	int alignSize = size + end[0]-start[0];
-
-	env.kernel = clCreateKernel(env.program, kernelName, &clStatus);
-	cl_int ret=0;
-	cl_mem inputCl = clCreateBuffer(env.context,(cl_mem_flags) (CL_MEM_READ_WRITE),
-		alignSize * sizeof(float), NULL, &clStatus);
-	cl_mem startCl = clCreateBuffer(env.context, (cl_mem_flags) (CL_MEM_READ_WRITE),
-		size * sizeof(unsigned int), NULL, &ret);
-	cl_mem endCl = clCreateBuffer(env.context, (cl_mem_flags) (CL_MEM_READ_WRITE),
-		size * sizeof(unsigned int), NULL, &ret);
-	cl_mem outputCl = clCreateBuffer(env.context, (cl_mem_flags) (CL_MEM_READ_WRITE),
-	size* sizeof(float), NULL, &ret);
-
-	float * hostMapSrc = (float *)clEnqueueMapBuffer(env.commandQueue,inputCl,CL_TRUE,CL_MAP_WRITE,0,alignSize * sizeof(float),0,NULL,NULL,NULL);
-	int * hostMapStart = (int *)clEnqueueMapBuffer(env.commandQueue,startCl,CL_TRUE,CL_MAP_WRITE,0,size * sizeof(unsigned int),0,NULL,NULL,NULL);
-	int * hostMapEnd   = (int *)clEnqueueMapBuffer(env.commandQueue,endCl,CL_TRUE,CL_MAP_WRITE,0,size * sizeof(unsigned int),0,NULL,NULL,NULL);
-	for(int i=0;i<size;i++)
-	{
-		hostMapStart[i] = start[i];
-		hostMapEnd[i]	= end[i];
-	}
-	for(int i=0;i<alignSize;i++)
-		hostMapSrc[i] = (float)srcData[i];
-	clEnqueueUnmapMemObject(env.commandQueue,inputCl,hostMapSrc,0,NULL,NULL);
-	clEnqueueUnmapMemObject(env.commandQueue,startCl,hostMapStart,0,NULL,NULL);
-	clEnqueueUnmapMemObject(env.commandQueue,endCl,hostMapEnd,0,NULL,NULL);
-
-	clStatus = clSetKernelArg(env.kernel, 0, sizeof(cl_mem),
-		(void *)&inputCl);
-	clStatus = clSetKernelArg(env.kernel, 1, sizeof(cl_mem),
-		(void *)&startCl);
-	clStatus = clSetKernelArg(env.kernel, 2, sizeof(cl_mem),
-		(void *)&endCl);
-	clStatus = clSetKernelArg(env.kernel, 3, sizeof(cl_mem),
-		(void *)&outputCl);
-	CHECK_OPENCL(clStatus);
-
-	global_work_size[0] = size;
-	clStatus = clEnqueueNDRangeKernel(env.commandQueue, env.kernel, 1,
-		NULL, global_work_size, NULL, 0, NULL, NULL);
-	CHECK_OPENCL(clStatus)
-
-	float * outPutMap = (float *)clEnqueueMapBuffer(env.commandQueue,outputCl,CL_TRUE,CL_MAP_READ,0,size*sizeof(float),0,NULL,NULL,NULL);
-	for(int i=0;i<size;i++)
-		output[i]=outPutMap[i];
-
-	clEnqueueUnmapMemObject(env.commandQueue,outputCl,outPutMap,0,NULL,NULL);
-	clStatus = clFinish(env.commandQueue);
-
-	CHECK_OPENCL(clStatus);
-	clStatus = clReleaseKernel(env.kernel);
-	CHECK_OPENCL(clStatus);
-	clStatus = clReleaseMemObject(inputCl);
-	CHECK_OPENCL(clStatus);
-	clStatus = clReleaseMemObject(startCl);
-	CHECK_OPENCL(clStatus);
-	clStatus = clReleaseMemObject(endCl);
-	CHECK_OPENCL(clStatus);
-	clStatus = clReleaseMemObject(outputCl);
-	CHECK_OPENCL(clStatus);
-	return 0;
+int OclCalc::OclHostFormulaAverage(double *srcData,int *start,int *end,double *output,int size)
+{
+    KernelEnv kEnv;
+    const char *kernelName = "oclFormulaAverage";
+    CheckKernelName(&kEnv,kernelName);
+    cl_int clStatus;
+    size_t global_work_size[1];
+    int alignSize = size + end[0]-start[0];
+
+    kEnv.mpkKernel = clCreateKernel(kEnv.mpkProgram,kernelName, &clStatus);
+    cl_int ret=0;
+    cl_mem inputCl    = clCreateBuffer(kEnv.mpkContext, (cl_mem_flags) (CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR),
+                                     alignSize * sizeof(float), NULL, &clStatus);
+    cl_mem startCl    = clCreateBuffer(kEnv.mpkContext, (cl_mem_flags) (CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR),
+                                     size * sizeof(unsigned int), NULL, &ret);
+    cl_mem endCl    = clCreateBuffer(kEnv.mpkContext, (cl_mem_flags) (CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR),
+                                     size * sizeof(unsigned int), NULL, &ret);
+    cl_mem outputCl = clCreateBuffer(kEnv.mpkContext, (cl_mem_flags) (CL_MEM_READ_WRITE),
+                                     size* sizeof(float), NULL, &ret);
+
+    float * hostMapSrc = (float *)clEnqueueMapBuffer(kEnv.mpkCmdQueue,inputCl,CL_TRUE,CL_MAP_WRITE_INVALIDATE_REGION,0,alignSize * sizeof(float),0,NULL,NULL,NULL);
+    int * hostMapStart = (int *)clEnqueueMapBuffer(kEnv.mpkCmdQueue,startCl,CL_TRUE,CL_MAP_WRITE_INVALIDATE_REGION,0,size * sizeof(unsigned int),0,NULL,NULL,NULL);
+    int * hostMapEnd   = (int *)clEnqueueMapBuffer(kEnv.mpkCmdQueue,endCl,CL_TRUE,CL_MAP_WRITE_INVALIDATE_REGION,0,size * sizeof(unsigned int),0,NULL,NULL,NULL);
+#if 1
+    for(int i=0;i<size;i++)
+    {
+        hostMapStart[i] = start[i];
+        hostMapEnd[i]    = end[i];
+    }
+    for(int i=0;i<alignSize;i++)
+        hostMapSrc[i] = (float)srcData[i];
+    //memcpy(hostMapSrc,srcData,alignSize * sizeof(float));
+#endif
+    for(sal_Int32 i = 0; i < alignSize; ++i){//dbg
+                       fprintf(stderr,"In avg host,hostMapSrc[%d] is ...%f\n",i,hostMapSrc[i]);
+                   }
 
 
-}
+    clEnqueueUnmapMemObject(kEnv.mpkCmdQueue,inputCl,hostMapSrc,  0,NULL,NULL);
+    clEnqueueUnmapMemObject(kEnv.mpkCmdQueue,startCl,hostMapStart,0,NULL,NULL);
+    clEnqueueUnmapMemObject(kEnv.mpkCmdQueue,endCl,  hostMapEnd,  0,NULL,NULL);
 
 
-int OclCalc::OclHostSignedAdd(double *lData,double *rData,double *rResult,int dSize) {
 
-	KernelEnv env;
-	int status;
-	const char *kernelName = "oclSignedAdd";
-	CheckKernelName(&env,kernelName);
-
-
-	cl_int clStatus;
-	size_t global_work_size[1];
-
-	env.kernel = clCreateKernel(env.program,kernelName, &clStatus);
-	cl_mem clLiftData = clCreateBuffer(env.context,
-		(cl_mem_flags) (CL_MEM_READ_WRITE),
-		dSize * sizeof(float), NULL, &clStatus);
-	cl_mem clRightData = clCreateBuffer(env.context,
-		(cl_mem_flags) (CL_MEM_READ_WRITE),
-		dSize * sizeof(float), NULL, &clStatus);
-	cl_mem clResult = clCreateBuffer(env.context,
-		(cl_mem_flags) (CL_MEM_READ_WRITE),
-		dSize * sizeof(float), NULL, &clStatus);
-
-	float * hostMapLeftData 	= (float *)clEnqueueMapBuffer(env.commandQueue,clLiftData,CL_TRUE,CL_MAP_WRITE,0,dSize*sizeof(float),0,NULL,NULL,NULL);
-	float * hostMapRightData 	= (float *)clEnqueueMapBuffer(env.commandQueue,clRightData,CL_TRUE,CL_MAP_WRITE,0,dSize*sizeof(float),0,NULL,NULL,NULL);
-	for(int i=0;i<dSize;i++)
-	{
-		hostMapLeftData[i] 	= (float)lData[i];
-		hostMapRightData[i] = (float)rData[i];
-	}
-	clEnqueueUnmapMemObject(env.commandQueue,clLiftData,hostMapLeftData,0,NULL,NULL);
-	clEnqueueUnmapMemObject(env.commandQueue,clRightData,hostMapRightData,0,NULL,NULL);
-
-	status = clSetKernelArg(env.kernel, 0, sizeof(cl_mem),
-		(void *)&clLiftData);
-	status = clSetKernelArg(env.kernel, 1, sizeof(cl_mem),
-		(void *)&clRightData);
-	status = clSetKernelArg(env.kernel, 2, sizeof(cl_mem),
-		(void *)&clResult);
-	CHECK_OPENCL(status)

... etc. - the rest is truncated


More information about the Libreoffice-commits mailing list