[Libreoffice-commits] core.git: Branch 'feature/calc-group-interpreter-4' - 2 commits - sc/source

Fri Nov 15 16:00:16 PST 2013

sc/source/core/opencl/formulagroupcl.cxx |  429 +++++++++++++++++++++++++------
 sc/source/core/opencl/op_math.cxx        |  225 +++++++++++++---
 sc/source/core/opencl/op_math.hxx        |    4 
 sc/source/core/opencl/opbase.cxx         |   43 +++
 sc/source/core/opencl/opbase.hxx         |    8 
 5 files changed, 590 insertions(+), 119 deletions(-)

New commits:
commit 483da7cdb5082821541b1897ad81b8ddf55ff1a7
Author: Wei Wei <weiwei at multicorewareinc.com>
Date:   Fri Nov 15 17:33:19 2013 -0600

    GPU Calc: implemented parallel reduction for SUMIFS
    
    For now only works for fixed and sliding fixed-sized windows.
    
    Change-Id: I25e3f893a86d0e1723ae1e1633ffeeee93926b8d
    
    Signed-off-by: I-Jui (Ray) Sung <ray at multicorewareinc.com>

diff --git a/sc/source/core/opencl/formulagroupcl.cxx b/sc/source/core/opencl/formulagroupcl.cxx
index 3b19886..427dd9e 100644
--- a/sc/source/core/opencl/formulagroupcl.cxx
+++ b/sc/source/core/opencl/formulagroupcl.cxx
@@ -419,42 +419,51 @@ public:
         bIsStartFixed = mpDVR->IsStartFixed();
         bIsEndFixed = mpDVR->IsEndFixed();
     }
+    virtual bool NeedParallelReduction(void) const
+    {
+        return GetWindowSize()> 100 &&
+            ( (GetStartFixed() && GetEndFixed()) ||
+              (!GetStartFixed() && !GetEndFixed())  ) ;
+    }
     virtual void GenSlidingWindowFunction(std::stringstream &ss) {
-        std::string name = Base::GetName();
-        ss << "__kernel void "<<name;
-        ss << "_reduction(__global double* A, "
-            "__global double *result,int arrayLength,int windowSize){\n";
-        ss << "    double tmp, current_result = 0.0;\n";
-        ss << "    int writePos = get_group_id(1);\n";
-        ss << "    int offset = get_group_id(1);\n";
-        ss << "    int lidx = get_local_id(0);\n";
-        ss << "    __local double shm_buf[256];\n";
-        ss << "    if (arrayLength == windowSize)\n";
-        ss << "        offset = 0;\n";
-        ss << "    barrier(CLK_LOCAL_MEM_FENCE);\n";
-        ss << "    int loop = arrayLength/512 + 1;\n";
-        ss << "    for (int l=0; l<loop; l++){\n";
-        ss << "    tmp = 0.0;\n";
-        ss << "    int loopOffset = l*512;\n";
-        ss << "    if((loopOffset + lidx + offset + 256) < ( offset + windowSize))\n";
-        ss << "        tmp = A[loopOffset + lidx + offset] + "
-            "A[loopOffset + lidx + offset + 256];\n";
-        ss << "    else if ((loopOffset + lidx + offset) < ( offset + windowSize))\n";
-        ss << "        tmp = A[loopOffset + lidx + offset];\n";
-        ss << "    shm_buf[lidx] = tmp;\n";
-        ss << "    barrier(CLK_LOCAL_MEM_FENCE);\n";
-        ss << "    for (int i = 128; i >0; i/=2) {\n";
-        ss << "        if (lidx < i)\n";
-        ss << "            shm_buf[lidx] += shm_buf[lidx + i];\n";
-        ss << "        barrier(CLK_LOCAL_MEM_FENCE);\n";
-        ss << "    }\n";
-        ss << "    if (lidx == 0)\n";
-        ss << "        current_result += shm_buf[0];\n";
-        ss << "    barrier(CLK_LOCAL_MEM_FENCE);\n";
-        ss << "    }\n";
-        ss << "    if (lidx == 0)\n";
-        ss << "        result[writePos] = current_result;\n";
-        ss << "}\n";
+        if (dynamic_cast<OpSum*>(mpCodeGen.get()))
+        {
+            std::string name = Base::GetName();
+            ss << "__kernel void "<<name;
+            ss << "_reduction(__global double* A, "
+                "__global double *result,int arrayLength,int windowSize){\n";
+            ss << "    double tmp, current_result = 0.0;\n";
+            ss << "    int writePos = get_group_id(1);\n";
+            ss << "    int offset = get_group_id(1);\n";
+            ss << "    int lidx = get_local_id(0);\n";
+            ss << "    __local double shm_buf[256];\n";
+            ss << "    if (arrayLength == windowSize)\n";
+            ss << "        offset = 0;\n";
+            ss << "    barrier(CLK_LOCAL_MEM_FENCE);\n";
+            ss << "    int loop = arrayLength/512 + 1;\n";
+            ss << "    for (int l=0; l<loop; l++){\n";
+            ss << "    tmp = 0.0;\n";
+            ss << "    int loopOffset = l*512;\n";
+            ss << "    if((loopOffset + lidx + offset + 256) < ( offset + windowSize))\n";
+            ss << "        tmp = A[loopOffset + lidx + offset] + "
+                "A[loopOffset + lidx + offset + 256];\n";
+            ss << "    else if ((loopOffset + lidx + offset) < ( offset + windowSize))\n";
+            ss << "        tmp = A[loopOffset + lidx + offset];\n";
+            ss << "    shm_buf[lidx] = tmp;\n";
+            ss << "    barrier(CLK_LOCAL_MEM_FENCE);\n";
+            ss << "    for (int i = 128; i >0; i/=2) {\n";
+            ss << "        if (lidx < i)\n";
+            ss << "            shm_buf[lidx] += shm_buf[lidx + i];\n";
+            ss << "        barrier(CLK_LOCAL_MEM_FENCE);\n";
+            ss << "    }\n";
+            ss << "    if (lidx == 0)\n";
+            ss << "        current_result += shm_buf[0];\n";
+            ss << "    barrier(CLK_LOCAL_MEM_FENCE);\n";
+            ss << "    }\n";
+            ss << "    if (lidx == 0)\n";
+            ss << "        result[writePos] = current_result;\n";
+            ss << "}\n";
+        }
      }
 
 
@@ -573,11 +582,16 @@ public:
         if (CL_SUCCESS != err)
             throw OpenCLError(err);
         // reproduce the reduction function name
-        std::string kernelName = Base::GetName() + "_reduction";
+        std::string kernelName;
+        if (dynamic_cast<OpSum*>(mpCodeGen.get()))
+            kernelName = Base::GetName() + "_reduction";
+        else throw Unhandled();
+
         cl_kernel redKernel = clCreateKernel(mpProgram, kernelName.c_str(), &err);
         if (err != CL_SUCCESS)
             throw OpenCLError(err);
         // set kernel arg of reduction kernel
+        // TODO(Wei Wei): use unique name for kernel
         err = clSetKernelArg(redKernel, 0, sizeof(cl_mem),
                 (void *)&(Base::mpClmem));
         if (CL_SUCCESS != err)
@@ -621,6 +635,14 @@ public:
         }
     }
 
+    size_t GetArrayLength(void) const {return mpDVR->GetArrayLength(); }
+
+    size_t GetWindowSize(void) const {return mpDVR->GetRefRowSize(); }
+
+    size_t GetStartFixed(void) const {return bIsStartFixed; }
+
+    size_t GetEndFixed(void) const {return bIsEndFixed; }
+
 protected:
     bool bIsStartFixed, bIsEndFixed;
     const formula::DoubleVectorRefToken *mpDVR;
@@ -1001,6 +1023,75 @@ public:
         {
             i += (*it)->Marshal(k, argno + i, nVectorWidth, pProgram);
         }
+        if (OpSumIfs *OpSumCodeGen = dynamic_cast<OpSumIfs*>(mpCodeGen.get()))
+        {
+            assert(mpClmem == NULL);
+            // Obtain cl context
+            KernelEnv kEnv;
+            OpenclDevice::setKernelEnv(&kEnv);
+            cl_int err;
+            DynamicKernelSlidingArgument<DynamicKernelArgument> *slidingArgPtr =
+                dynamic_cast< DynamicKernelSlidingArgument<DynamicKernelArgument> *>
+                (mvSubArguments[0].get());
+            cl_mem mpClmem2;
+
+            if (OpSumCodeGen->NeedReductionKernel())
+            {
+                assert(slidingArgPtr);
+                size_t nInput = slidingArgPtr -> GetArrayLength();
+                size_t nCurWindowSize = slidingArgPtr -> GetWindowSize();
+                std::vector<cl_mem> vclmem;
+
+                for (SubArgumentsType::iterator it = mvSubArguments.begin(), e= mvSubArguments.end(); it!=e;
+                ++it)
+                {
+                    vclmem.push_back((*it)->GetCLBuffer());
+                }
+                mpClmem2 = clCreateBuffer(kEnv.mpkContext, CL_MEM_READ_WRITE,
+                        sizeof(double)*nVectorWidth, NULL, &err);
+                if (CL_SUCCESS != err)
+                    throw OpenCLError(err);
+
+                std::string kernelName = "SumIfs_reduction";
+                cl_kernel redKernel = clCreateKernel(pProgram, kernelName.c_str(), &err);
+                if (err != CL_SUCCESS)
+                    throw OpenCLError(err);
+
+                    // set kernel arg of reduction kernel
+                for (size_t j=0; j< vclmem.size(); j++){
+                    err = clSetKernelArg(redKernel, j, sizeof(cl_mem),
+                            (void *)&vclmem[j]);
+                    if (CL_SUCCESS != err)
+                        throw OpenCLError(err);
+                }
+                err = clSetKernelArg(redKernel, vclmem.size(), sizeof(cl_mem), (void *)&mpClmem2);
+                if (CL_SUCCESS != err)
+                    throw OpenCLError(err);
+
+                err = clSetKernelArg(redKernel, vclmem.size()+1, sizeof(cl_int), (void*)&nInput);
+                if (CL_SUCCESS != err)
+                    throw OpenCLError(err);
+
+                err = clSetKernelArg(redKernel, vclmem.size()+2, sizeof(cl_int), (void*)&nCurWindowSize);
+                if (CL_SUCCESS != err)
+                    throw OpenCLError(err);
+                // set work group size and execute
+                size_t global_work_size[] = {256, (size_t)nVectorWidth };
+                size_t local_work_size[] = {256, 1};
+                err = clEnqueueNDRangeKernel(kEnv.mpkCmdQueue, redKernel, 2, NULL,
+                        global_work_size, local_work_size, 0, NULL, NULL);
+                if (CL_SUCCESS != err)
+                    throw OpenCLError(err);
+                err = clFinish(kEnv.mpkCmdQueue);
+                if (CL_SUCCESS != err)
+                    throw OpenCLError(err);
+
+                 // Pass mpClmem2 to the "real" kernel
+                err = clSetKernelArg(k, argno, sizeof(cl_mem), (void *)&mpClmem2);
+                if (CL_SUCCESS != err)
+                    throw OpenCLError(err);
+            }
+        }
         return i;
     }
 
diff --git a/sc/source/core/opencl/op_math.cxx b/sc/source/core/opencl/op_math.cxx
index aac8661..fb38601 100644
--- a/sc/source/core/opencl/op_math.cxx
+++ b/sc/source/core/opencl/op_math.cxx
@@ -448,6 +448,142 @@ void OpSumIfs::GenSlidingWindowFunction(std::stringstream &ss,
     size_t nCurWindowSize = pCurDVR->GetArrayLength() <
     pCurDVR->GetRefRowSize() ? pCurDVR->GetArrayLength():
     pCurDVR->GetRefRowSize() ;
+
+    mNeedReductionKernel = vSubArguments[0]->NeedParallelReduction();
+    if (mNeedReductionKernel)
+    {
+        // generate reduction functions
+        ss << "__kernel void ";
+        ss << "SumIfs_reduction(  ";
+        for (unsigned i = 0; i < vSubArguments.size(); i++)
+        {
+            if (i)
+                ss << ",";
+            vSubArguments[i]->GenSlidingWindowDecl(ss);
+        }
+        ss << ", __global double *result,int arrayLength,int windowSize";
+
+        ss << ")\n{\n";
+        ss << "    double tmp =0;\n";
+        ss << "    int i ;\n";
+
+        GenTmpVariables(ss,vSubArguments);
+        ss << "    double current_result = 0.0;\n";
+        ss << "    int writePos = get_group_id(1);\n";
+        if (pCurDVR->IsStartFixed() && pCurDVR->IsEndFixed())
+            ss << "    int offset = 0;\n";
+        else if (!pCurDVR->IsStartFixed() && !pCurDVR->IsEndFixed())
+            ss << "    int offset = get_group_id(1);\n";
+        else
+            throw Unhandled();
+        // actually unreachable
+        ss << "    int lidx = get_local_id(0);\n";
+        ss << "    __local double shm_buf[256];\n";
+        ss << "    barrier(CLK_LOCAL_MEM_FENCE);\n";
+        ss << "    int loop = arrayLength/512 + 1;\n";
+        ss << "    for (int l=0; l<loop; l++){\n";
+        ss << "        tmp = 0.0;\n";
+        ss << "        int loopOffset = l*512;\n";
+
+        ss << "        int p1 = loopOffset + lidx + offset, p2 = p1 + 256;\n";
+        ss << "        if (p2 < min(offset + windowSize, arrayLength)) {\n";
+        ss << "            tmp0 = 0.0;\n";
+        int mm=0;
+        std::string p1 = "p1";
+        std::string p2 = "p2";
+        for(unsigned j=1;j<vSubArguments.size();j+=2,mm++)
+        {
+            CheckSubArgumentIsNan2(ss,vSubArguments,j,p1);
+            CheckSubArgumentIsNan2(ss,vSubArguments,j+1,p1);
+            ss << "";
+            ss <<"    if(isequal(";
+            ss <<"tmp";
+            ss <<j;
+            ss <<" , ";
+            ss << "tmp";
+            ss << j+1;
+            ss << "))";
+            ss << "{\n";
+        }
+        CheckSubArgumentIsNan2(ss,vSubArguments,0,p1);
+        ss << "    tmp += tmp0;\n";
+        for(unsigned j=1;j<vSubArguments.size();j+=2,mm--)
+        {
+            for(int n = 0;n<mm+1;n++)
+            {
+                ss << "    ";
+            }
+            ss<< "}\n\n";
+        }
+        mm=0;
+        for(unsigned j=1;j<vSubArguments.size();j+=2,mm++)
+        {
+            CheckSubArgumentIsNan2(ss,vSubArguments,j,p2);
+            CheckSubArgumentIsNan2(ss,vSubArguments,j+1,p2);
+            ss <<"    if(isequal(";
+            ss <<"tmp";
+            ss <<j;
+            ss <<" , ";
+            ss << "tmp";
+            ss << j+1;
+            ss << ")){\n";
+        }
+        CheckSubArgumentIsNan2(ss,vSubArguments,0,p2);
+        ss << "    tmp += tmp0;\n";
+        for(unsigned j=1;j< vSubArguments.size();j+=2,mm--)
+        {
+            for(int n = 0;n<mm+1;n++)
+            {
+                ss << "    ";
+            }
+            ss<< "}\n";
+        }
+        ss << "    }\n";
+
+        ss << "    else if (p1 < min(arrayLength, offset + windowSize)) {\n";
+        mm=0;
+        for(unsigned j=1;j<vSubArguments.size();j+=2,mm++)
+        {
+            CheckSubArgumentIsNan2(ss,vSubArguments,j,p1);
+            CheckSubArgumentIsNan2(ss,vSubArguments,j+1,p1);
+
+            ss <<"    if(isequal(";
+            ss <<"tmp";
+            ss <<j;
+            ss <<" , ";
+            ss << "tmp";
+            ss << j+1;
+            ss << ")){\n";
+        }
+        CheckSubArgumentIsNan2(ss,vSubArguments,0,p1);
+        ss << "    tmp += tmp0;\n";
+        for(unsigned j=1;j<vSubArguments.size();j+=2,mm--)
+        {
+            for(int n = 0;n<mm+1;n++)
+            {
+                ss << "    ";
+            }
+            ss<< "}\n\n";
+        }
+
+        ss << "    }\n";
+        ss << "    shm_buf[lidx] = tmp;\n";
+        ss << "    barrier(CLK_LOCAL_MEM_FENCE);\n";
+        ss << "    for (int i = 128; i >0; i/=2) {\n";
+        ss << "        if (lidx < i)\n";
+        ss << "            shm_buf[lidx] += shm_buf[lidx + i];\n";
+        ss << "        barrier(CLK_LOCAL_MEM_FENCE);\n";
+        ss << "    }\n";
+        ss << "    if (lidx == 0)\n";
+        ss << "        current_result += shm_buf[0];\n";
+        ss << "    barrier(CLK_LOCAL_MEM_FENCE);\n";
+        ss << "    }\n";
+
+        ss << "    if (lidx == 0)\n";
+        ss << "        result[writePos] = current_result;\n";
+        ss << "}\n";
+    }// finish generate reduction code
+    // generate functions as usual
     ss << "\ndouble " << sSymName;
     ss << "_"<< BinFuncName() <<"(";
     for (unsigned i = 0; i < vSubArguments.size(); i++)
@@ -459,48 +595,57 @@ void OpSumIfs::GenSlidingWindowFunction(std::stringstream &ss,
     ss << ")\n    {\n";
     ss <<"    int gid0=get_global_id(0);\n";
     ss << "    double tmp =0;\n";
-    ss << "    int i ;\n";
-    GenTmpVariables(ss,vSubArguments);
-    ss << "    for (i = ";
-     if (!pCurDVR->IsStartFixed() && pCurDVR->IsEndFixed()) {
-        ss << "gid0; i < "<< nCurWindowSize <<"; i++)\n";
-     } else if (pCurDVR->IsStartFixed() && !pCurDVR->IsEndFixed()) {
-        ss << "0; i < gid0+"<< nCurWindowSize <<"; i++)\n";
-     } else {
-        ss << "0; i < "<< nCurWindowSize <<"; i++)\n";
-     }
-     ss << "    {\n";
-     if(!pCurDVR->IsStartFixed() && !pCurDVR->IsEndFixed())
-     {
-        ss<< "    int doubleIndex =i+gid0;\n";
-     }else
-     {
-        ss<< "    int doubleIndex =i;\n";
-     }
-     ss<< "    int singleIndex =gid0;\n";
-     int m=0;
-     for(unsigned j=1;j<vSubArguments.size();j+=2,m++)
-     {
-        CheckSubArgumentIsNan(ss,vSubArguments,j);
-        CheckSubArgumentIsNan(ss,vSubArguments,j+1);
-        ss <<"    if(isequal(";
-        ss <<"tmp";
-        ss <<j;
-        ss <<" , ";
-        ss << "tmp";
-        ss << j+1;
-        ss << ")){\n";
-     }
-     CheckSubArgumentIsNan(ss,vSubArguments,0);
-    ss << "    tmp += tmp0;\n";
-    for(unsigned j=1;j<=vSubArguments.size();j+=2,m--)
-     {
-         for(int n = 0;n<m+1;n++)
+    if (!mNeedReductionKernel)
+    {
+        ss << "    int i ;\n";
+        GenTmpVariables(ss,vSubArguments);
+        ss << "    for (i = ";
+        if (!pCurDVR->IsStartFixed() && pCurDVR->IsEndFixed()) {
+            ss << "gid0; i < "<< nCurWindowSize <<"; i++)\n";
+        } else if (pCurDVR->IsStartFixed() && !pCurDVR->IsEndFixed()) {
+            ss << "0; i < gid0+"<< nCurWindowSize <<"; i++)\n";
+        } else {
+            ss << "0; i < "<< nCurWindowSize <<"; i++)\n";
+        }
+        ss << "    {\n";
+        if(!pCurDVR->IsStartFixed() && !pCurDVR->IsEndFixed())
         {
-            ss << "    ";
+            ss<< "    int doubleIndex =i+gid0;\n";
+        }else
+        {
+            ss<< "    int doubleIndex =i;\n";
         }
-        ss<< "}\n";
-     }
+        ss<< "    int singleIndex =gid0;\n";
+        int m=0;
+        for(unsigned j=1;j<vSubArguments.size();j+=2,m++)
+        {
+            CheckSubArgumentIsNan(ss,vSubArguments,j);
+            CheckSubArgumentIsNan(ss,vSubArguments,j+1);
+            ss <<"    if(isequal(";
+            ss <<"tmp";
+            ss <<j;
+            ss <<" , ";
+            ss << "tmp";
+            ss << j+1;
+            ss << ")){\n";
+        }
+        CheckSubArgumentIsNan(ss,vSubArguments,0);
+        ss << "    tmp += tmp0;\n";
+        for(unsigned j=1;j<=vSubArguments.size();j+=2,m--)
+        {
+            for(int n = 0;n<m+1;n++)
+            {
+                ss << "    ";
+            }
+            ss<< "}\n";
+        }
+    }
+    if (mNeedReductionKernel)
+    {
+        ss << "tmp =";
+        vSubArguments[0]->GenDeclRef(ss);
+        ss << "[gid0];\n";
+    }
     ss << "return tmp;\n";
     ss << "}";
 }
diff --git a/sc/source/core/opencl/op_math.hxx b/sc/source/core/opencl/op_math.hxx
index 01cbc82..7081b00 100644
--- a/sc/source/core/opencl/op_math.hxx
+++ b/sc/source/core/opencl/op_math.hxx
@@ -33,9 +33,13 @@ public:
 class OpSumIfs: public CheckVariables
 {
 public:
+    OpSumIfs(void): CheckVariables(), mNeedReductionKernel(false) {}
     virtual void GenSlidingWindowFunction(std::stringstream &ss,
             const std::string sSymName, SubArguments &vSubArguments);
     virtual std::string BinFuncName(void) const { return "SumIfs"; }
+    bool NeedReductionKernel(void) const { return mNeedReductionKernel; }
+protected:
+    bool mNeedReductionKernel;
 };
 class OpCosh: public Normal
 {
diff --git a/sc/source/core/opencl/opbase.cxx b/sc/source/core/opencl/opbase.cxx
index 6f3c339..fe7fc24 100644
--- a/sc/source/core/opencl/opbase.cxx
+++ b/sc/source/core/opencl/opbase.cxx
@@ -157,6 +157,49 @@ void CheckVariables::CheckSubArgumentIsNan( std::stringstream & ss,
     ss << vSubArguments[i]->GenSlidingWindowDeclRef();
     ss<<";\n";
 }
+
+void CheckVariables::CheckSubArgumentIsNan2( std::stringstream & ss,
+    SubArguments &vSubArguments,  int argumentNum, std::string p)
+{
+    int i = argumentNum;
+    if(vSubArguments[i]->GetFormulaToken()->GetType() == formula::svDouble)
+    {
+        ss <<"    tmp";
+        ss <<i;
+        ss << "=";
+        vSubArguments[i]->GenDeclRef(ss);
+        ss<<";\n";
+        return;
+    }
+
+#ifdef ISNAN
+    ss<< "    tmp";
+    ss<< i;
+    ss<< "= fsum(";
+    vSubArguments[i]->GenDeclRef(ss);
+    if(vSubArguments[i]->GetFormulaToken()->GetType() ==
+     formula::svDoubleVectorRef)
+        ss<<"["<< p.c_str()<< "]";
+    else  if(vSubArguments[i]->GetFormulaToken()->GetType() ==
+     formula::svSingleVectorRef)
+        ss<<"[get_group_id(1)]";
+    ss<<", 0);\n";
+    return;
+#endif
+    ss <<"    tmp";
+    ss <<i;
+    ss << "=";
+    vSubArguments[i]->GenDeclRef(ss);
+    if(vSubArguments[i]->GetFormulaToken()->GetType() ==
+            formula::svDoubleVectorRef)
+        ss<<"["<< p.c_str()<< "]";
+    else  if(vSubArguments[i]->GetFormulaToken()->GetType() ==
+            formula::svSingleVectorRef)
+        ss<<"[get_group_id(1)]";
+
+    ss<<";\n";
+}
+
 void CheckVariables::CheckAllSubArgumentIsNan(
     std::stringstream & ss, SubArguments & vSubArguments)
 {
diff --git a/sc/source/core/opencl/opbase.hxx b/sc/source/core/opencl/opbase.hxx
index 41e4587..6b475df 100644
--- a/sc/source/core/opencl/opbase.hxx
+++ b/sc/source/core/opencl/opbase.hxx
@@ -104,6 +104,9 @@ public:
     virtual void DumpInlineFun(std::set<std::string>& ,
         std::set<std::string>& ) const {}
     const std::string& GetName(void) const { return mSymName; }
+    cl_mem GetCLBuffer(void) const {return mpClmem; }
+    virtual bool NeedParallelReduction(void) const { return false; }
+
 protected:
     const std::string mSymName;
     FormulaTreeNodeRef mFormulaTree;
@@ -157,6 +160,9 @@ public:
             SubArguments &vSubArguments, int argumentNum);
     void CheckAllSubArgumentIsNan(std::stringstream &ss,
             SubArguments &vSubArguments);
+    // only check isNan
+    void CheckSubArgumentIsNan2(std::stringstream &ss,
+            SubArguments &vSubArguments, int argumentNum, std::string p);
 };
 
 }}
commit 4a9c141f5e6250bc4a9cde870b1649a1c3faccae
Author: Wei Wei <weiwei at multicorewareinc.com>
Date:   Fri Nov 15 16:37:10 2013 -0600

    GPU Calc: use parallel reduction to implement sum
    
    Use reduction kernel when given a large DoubleVectorRef
    
    Change-Id: Ifd4977b81be64274733909e43f0e5ef161bb455e
    
    Signed-off-by: I-Jui (Ray) Sung <ray at multicorewareinc.com>

diff --git a/sc/source/core/opencl/formulagroupcl.cxx b/sc/source/core/opencl/formulagroupcl.cxx
index 6d442fc..3b19886 100644
--- a/sc/source/core/opencl/formulagroupcl.cxx
+++ b/sc/source/core/opencl/formulagroupcl.cxx
@@ -44,7 +44,7 @@ namespace sc { namespace opencl {
 
 
 /// Map the buffer used by an argument and do necessary argument setting
-size_t DynamicKernelArgument::Marshal(cl_kernel k, int argno, int)
+size_t DynamicKernelArgument::Marshal(cl_kernel k, int argno, int, cl_program)
 {
     FormulaToken *ref = mFormulaTree->GetFormulaToken();
     assert(mpClmem == NULL);
@@ -125,7 +125,7 @@ public:
         return 1;
     }
     /// Pass the 32-bit hash of the string to the kernel
-    virtual size_t Marshal(cl_kernel k, int argno, int)
+    virtual size_t Marshal(cl_kernel k, int argno, int, cl_program)
     {
         FormulaToken *ref = mFormulaTree->GetFormulaToken();
         assert(mpClmem == NULL);
@@ -183,7 +183,7 @@ public:
         return 1;
     }
     /// Create buffer and pass the buffer to a given kernel
-    virtual size_t Marshal(cl_kernel k, int argno, int)
+    virtual size_t Marshal(cl_kernel k, int argno, int, cl_program)
     {
         double tmp = 0.0;
         // Pass the scalar result back to the rest of the formula kernel
@@ -222,7 +222,7 @@ public:
         return 1;
     }
     /// Create buffer and pass the buffer to a given kernel
-    virtual size_t Marshal(cl_kernel k, int argno, int)
+    virtual size_t Marshal(cl_kernel k, int argno, int, cl_program)
     {
         double tmp = 0.0;
         // Pass the scalar result back to the rest of the formula kernel
@@ -264,7 +264,7 @@ public:
         return 1;
     }
     /// Create buffer and pass the buffer to a given kernel
-    virtual size_t Marshal(cl_kernel k, int argno, int)
+    virtual size_t Marshal(cl_kernel k, int argno, int, cl_program)
     {
         double tmp = 0.0;
         // Pass the scalar result back to the rest of the formula kernel
@@ -292,11 +292,11 @@ public:
     {
         DynamicKernelStringArgument::GenDecl(ss);
     }
-    virtual size_t Marshal(cl_kernel, int, int);
+    virtual size_t Marshal(cl_kernel, int, int, cl_program);
 };
 
 /// Marshal a string vector reference
-size_t DynamicKernelStringArgument::Marshal(cl_kernel k, int argno, int)
+size_t DynamicKernelStringArgument::Marshal(cl_kernel k, int argno, int, cl_program)
 {
     FormulaToken *ref = mFormulaTree->GetFormulaToken();
     assert(mpClmem == NULL);
@@ -390,10 +390,10 @@ public:
         ss << ")";
         return ss.str();
     }
-    virtual size_t Marshal(cl_kernel k, int argno, int vw)
+    virtual size_t Marshal(cl_kernel k, int argno, int vw, cl_program p)
     {
-        int i = DynamicKernelArgument::Marshal(k, argno, vw);
-        i += mStringArgument.Marshal(k, argno+i, vw);
+        int i = DynamicKernelArgument::Marshal(k, argno, vw, p);
+        i += mStringArgument.Marshal(k, argno+i, vw, p);
         return i;
     }
 protected:
@@ -402,24 +402,61 @@ protected:
 
 /// Handling a Double Vector that is used as a sliding window input
 /// to either a sliding window average or sum-of-products
+class OpSum; // Forward Declaration
 template<class Base>
 class DynamicKernelSlidingArgument: public Base
 {
 public:
     DynamicKernelSlidingArgument(const std::string &s,
-        FormulaTreeNodeRef ft):
-        Base(s, ft)
+        FormulaTreeNodeRef ft, boost::shared_ptr<SlidingFunctionBase> &CodeGen):
+        Base(s, ft), mpCodeGen(CodeGen), needReductionKernel(true), mpClmem2(NULL)
     {
         FormulaToken *t = ft->GetFormulaToken();
         if (t->GetType() != formula::svDoubleVectorRef)
             throw Unhandled();
-        const formula::DoubleVectorRefToken* pDVR =
-            dynamic_cast<const formula::DoubleVectorRefToken *>(t);
-        assert(pDVR);
-        bIsStartFixed = pDVR->IsStartFixed();
-        bIsEndFixed = pDVR->IsEndFixed();
+        mpDVR = dynamic_cast<const formula::DoubleVectorRefToken *>(t);
+        assert(mpDVR);
+        bIsStartFixed = mpDVR->IsStartFixed();
+        bIsEndFixed = mpDVR->IsEndFixed();
     }
-    virtual void GenSlidingWindowFunction(std::stringstream &) {}
+    virtual void GenSlidingWindowFunction(std::stringstream &ss) {
+        std::string name = Base::GetName();
+        ss << "__kernel void "<<name;
+        ss << "_reduction(__global double* A, "
+            "__global double *result,int arrayLength,int windowSize){\n";
+        ss << "    double tmp, current_result = 0.0;\n";
+        ss << "    int writePos = get_group_id(1);\n";
+        ss << "    int offset = get_group_id(1);\n";
+        ss << "    int lidx = get_local_id(0);\n";
+        ss << "    __local double shm_buf[256];\n";
+        ss << "    if (arrayLength == windowSize)\n";
+        ss << "        offset = 0;\n";
+        ss << "    barrier(CLK_LOCAL_MEM_FENCE);\n";
+        ss << "    int loop = arrayLength/512 + 1;\n";
+        ss << "    for (int l=0; l<loop; l++){\n";
+        ss << "    tmp = 0.0;\n";
+        ss << "    int loopOffset = l*512;\n";
+        ss << "    if((loopOffset + lidx + offset + 256) < ( offset + windowSize))\n";
+        ss << "        tmp = A[loopOffset + lidx + offset] + "
+            "A[loopOffset + lidx + offset + 256];\n";
+        ss << "    else if ((loopOffset + lidx + offset) < ( offset + windowSize))\n";
+        ss << "        tmp = A[loopOffset + lidx + offset];\n";
+        ss << "    shm_buf[lidx] = tmp;\n";
+        ss << "    barrier(CLK_LOCAL_MEM_FENCE);\n";
+        ss << "    for (int i = 128; i >0; i/=2) {\n";
+        ss << "        if (lidx < i)\n";
+        ss << "            shm_buf[lidx] += shm_buf[lidx + i];\n";
+        ss << "        barrier(CLK_LOCAL_MEM_FENCE);\n";
+        ss << "    }\n";
+        ss << "    if (lidx == 0)\n";
+        ss << "        current_result += shm_buf[0];\n";
+        ss << "    barrier(CLK_LOCAL_MEM_FENCE);\n";
+        ss << "    }\n";
+        ss << "    if (lidx == 0)\n";
+        ss << "        result[writePos] = current_result;\n";
+        ss << "}\n";
+     }
+
 
     virtual std::string GenSlidingWindowDeclRef(bool=false) const
     {
@@ -430,8 +467,168 @@ public:
             ss << Base::GetName() << "[i]";
         return ss.str();
     }
+    /// Controls how the elements in the DoubleVectorRef are traversed
+    virtual size_t GenLoop(std::stringstream &ss, bool &needBody)
+    {
+        assert(mpDVR);
+        size_t nCurWindowSize = mpDVR->GetRefRowSize();
+        if (dynamic_cast<OpSum*>(mpCodeGen.get()))
+        {
+            if (!bIsStartFixed && !bIsEndFixed)
+            {
+                // set 100 as a threshold for invoking reduction kernel
+                if (nCurWindowSize > 100 )
+                {
+                    std::string temp = Base::GetName() + "[gid0]";
+                    ss << "tmp = ";
+                    ss << mpCodeGen->Gen2(temp, "tmp");
+                    ss << ";\n\t";
+                    needBody = false;
+                    needReductionKernel = false;
+                    return nCurWindowSize;
+                }
+            }
+
+            if (bIsStartFixed && bIsEndFixed)
+            {
+                // set 100 as a threshold for invoking reduction kernel
+                if (nCurWindowSize > 100 )
+                {
+                    std::string temp = Base::GetName() + "[0]";
+                    ss << "tmp = ";
+                    ss << mpCodeGen->Gen2(temp, "tmp");
+                    ss << ";\n\t";
+                    needBody = false;
+                    needReductionKernel = false;
+                    return nCurWindowSize;
+                }
+            }
+        }
+        needBody = true;
+        needReductionKernel = true;
+        ss << "for (int i = ";
+        if (!bIsStartFixed && bIsEndFixed)
+        {
+#ifdef  ISNAN
+            ss << "gid0; i < " << mpDVR->GetArrayLength();
+            ss << " && i < " << nCurWindowSize  << "; i++){\n\t\t";
+#else
+            ss << "gid0; i < "<< nCurWindowSize << "; i++)\n\t\t";
+#endif
+        }
+        else if (bIsStartFixed && !bIsEndFixed)
+        {
+#ifdef  ISNAN
+            ss << "0; i < " << mpDVR->GetArrayLength();
+            ss << " && i < gid0+"<< nCurWindowSize << "; i++){\n\t\t";
+#else
+            ss << "0; i < gid0+"<< nCurWindowSize << "; i++)\n\t\t";
+#endif
+        }
+        else if (!bIsStartFixed && !bIsEndFixed)
+        {
+#ifdef  ISNAN
+            ss << "0; i + gid0 < " << mpDVR->GetArrayLength();
+            ss << " &&  i < "<< nCurWindowSize << "; i++){\n\t\t";
+#else
+            ss << "0; i < "<< nCurWindowSize << "; i++)\n\t\t";
+#endif
+        }
+        else
+        {
+#ifdef  ISNAN
+            ss << "0; i < "<< nCurWindowSize << "; i++){\n\t\t";
+#else
+            ss << "0; i < "<< nCurWindowSize << "; i++)\n\t\t";
+#endif
+        }
+
+        return nCurWindowSize;
+    }
+
+    virtual size_t Marshal(cl_kernel k, int argno, int w, cl_program mpProgram)
+    {
+        if (needReductionKernel)
+            return Base::Marshal(k, argno, w, mpProgram);
+
+        assert(Base::mpClmem == NULL);
+        // Obtain cl context
+        KernelEnv kEnv;
+        OpenclDevice::setKernelEnv(&kEnv);
+        cl_int err;
+        size_t nInput = mpDVR->GetArrayLength();
+        size_t nCurWindowSize = mpDVR->GetRefRowSize();
+        // create clmem buffer
+        if (mpDVR->GetArrays()[0].mpNumericArray == NULL)
+            throw Unhandled();
+        double *pHostBuffer = const_cast<double*>(
+                mpDVR->GetArrays()[0].mpNumericArray);
+        size_t szHostBuffer = nInput * sizeof(double);
+        Base::mpClmem = clCreateBuffer(kEnv.mpkContext,
+                (cl_mem_flags) CL_MEM_READ_ONLY|CL_MEM_USE_HOST_PTR,
+                szHostBuffer,
+                pHostBuffer, &err);
+        mpClmem2 = clCreateBuffer(kEnv.mpkContext, CL_MEM_WRITE_ONLY,
+                sizeof(double)*w, NULL, NULL);
+        if (CL_SUCCESS != err)
+            throw OpenCLError(err);
+        // reproduce the reduction function name
+        std::string kernelName = Base::GetName() + "_reduction";
+        cl_kernel redKernel = clCreateKernel(mpProgram, kernelName.c_str(), &err);
+        if (err != CL_SUCCESS)
+            throw OpenCLError(err);
+        // set kernel arg of reduction kernel
+        err = clSetKernelArg(redKernel, 0, sizeof(cl_mem),
+                (void *)&(Base::mpClmem));
+        if (CL_SUCCESS != err)
+            throw OpenCLError(err);
+
+        err = clSetKernelArg(redKernel, 1, sizeof(cl_mem), (void *)&mpClmem2);
+        if (CL_SUCCESS != err)
+            throw OpenCLError(err);
+
+        err = clSetKernelArg(redKernel, 2, sizeof(cl_int), (void*)&nInput);
+        if (CL_SUCCESS != err)
+            throw OpenCLError(err);
+
+        err = clSetKernelArg(redKernel, 3, sizeof(cl_int), (void*)&nCurWindowSize);
+        if (CL_SUCCESS != err)
+            throw OpenCLError(err);
+
+        // set work group size and execute
+        size_t global_work_size[] = {256, (size_t)w };
+        size_t local_work_size[] = {256, 1};
+        err = clEnqueueNDRangeKernel(kEnv.mpkCmdQueue, redKernel, 2, NULL,
+                global_work_size, local_work_size, 0, NULL, NULL);
+        if (CL_SUCCESS != err)
+            throw OpenCLError(err);
+        err = clFinish(kEnv.mpkCmdQueue);
+        if (CL_SUCCESS != err)
+            throw OpenCLError(err);
+
+        // set kernel arg
+        err = clSetKernelArg(k, argno, sizeof(cl_mem), (void*)&(mpClmem2));
+        if (CL_SUCCESS != err)
+            throw OpenCLError(err);
+        return 1;
+    }
+    ~DynamicKernelSlidingArgument()
+    {
+        if (mpClmem2)
+        {
+            clReleaseMemObject(mpClmem2);
+            mpClmem2 = NULL;
+        }
+    }
+
 protected:
     bool bIsStartFixed, bIsEndFixed;
+    const formula::DoubleVectorRefToken *mpDVR;
+    // from parent nodes
+    boost::shared_ptr<SlidingFunctionBase> mpCodeGen;
+    // controls whether to invoke the reduction kernel during marshaling or not
+    bool needReductionKernel;
+    cl_mem mpClmem2;
 };
 
 /// Abstract class for code generation
@@ -439,6 +636,9 @@ protected:
 class Reduction: public SlidingFunctionBase
 {
 public:
+    typedef DynamicKernelSlidingArgument<DynamicKernelArgument> NumericRange;
+    typedef DynamicKernelSlidingArgument<DynamicKernelStringArgument> StringRange;
+
     virtual void GenSlidingWindowFunction(std::stringstream &ss,
             const std::string sSymName, SubArguments &vSubArguments)
     {
@@ -459,65 +659,48 @@ public:
         size_t nItems = 0;
         while (i--)
         {
-            FormulaToken *pCur = vSubArguments[i]->GetFormulaToken();
-            assert(pCur);
-            if (pCur->GetType() == formula::svDoubleVectorRef)
+            if (NumericRange *NR = dynamic_cast<NumericRange *> (vSubArguments[i].get()))
             {
-                const formula::DoubleVectorRefToken* pDVR =
-                dynamic_cast<const formula::DoubleVectorRefToken *>(pCur);
-                size_t nCurWindowSize = pDVR->GetRefRowSize();
-                ss << "for (int i = ";
-                if (!pDVR->IsStartFixed() && pDVR->IsEndFixed()) {
-#ifdef  ISNAN
-                    ss << "gid0; i < " << pDVR->GetArrayLength();
-                    ss << " && i < " << nCurWindowSize  << "; i++){\n\t\t";
-#else
-                    ss << "gid0; i < "<< nCurWindowSize << "; i++)\n\t\t";
-#endif
-                } else if (pDVR->IsStartFixed() && !pDVR->IsEndFixed()) {
-#ifdef  ISNAN
-                    ss << "0; i < " << pDVR->GetArrayLength();
-                    ss << " && i < gid0+"<< nCurWindowSize << "; i++){\n\t\t";
-#else
-                    ss << "0; i < gid0+"<< nCurWindowSize << "; i++)\n\t\t";
-#endif
-                } else if (!pDVR->IsStartFixed() && !pDVR->IsEndFixed()){
-#ifdef  ISNAN
-                    ss << "0; i + gid0 < " << pDVR->GetArrayLength();
-                    ss << " &&  i < "<< nCurWindowSize << "; i++){\n\t\t";
-#else
-                    ss << "0; i < "<< nCurWindowSize << "; i++)\n\t\t";
-#endif
-                }
-                else
-                {
-                    ss << "0; i < "<< pDVR->GetArrayLength() << "; i++){\n\t\t";
-                }
-                nItems += nCurWindowSize;
+                bool needBody;
+                nItems += NR->GenLoop(ss, needBody);
+                if (needBody == false) continue;
             }
-            else if (pCur->GetType() == formula::svSingleVectorRef)
+            else if (StringRange *SR = dynamic_cast<StringRange *> (vSubArguments[i].get()))
             {
+                bool needBody;
+                nItems += SR->GenLoop(ss, needBody);   //did not handle yet
+                if (needBody == false) continue;
+            }
+            else
+            {
+                FormulaToken *pCur = vSubArguments[i]->GetFormulaToken();
+                assert(pCur);
+                assert(pCur->GetType() != formula::svDoubleVectorRef);
+
+                if (pCur->GetType() == formula::svSingleVectorRef)
+                {
 #ifdef  ISNAN
                     const formula::SingleVectorRefToken* pSVR =
-                    dynamic_cast< const formula::SingleVectorRefToken* >(pCur);
-                ss << "if (gid0 < " << pSVR->GetArrayLength() << "){\n\t\t";
+                        dynamic_cast< const formula::SingleVectorRefToken* >(pCur);
+                    ss << "if (gid0 < " << pSVR->GetArrayLength() << "){\n\t\t";
 #else
-                nItems += 1;
+                    nItems += 1;
 #endif
-            }
-            else if (pCur->GetType() == formula::svDouble)
-            {
+                }
+                else if (pCur->GetType() == formula::svDouble)
+                {
 #ifdef  ISNAN
-                ss << "{\n\t\t";
+                    ss << "{\n\t\t";
 #endif
-                nItems += 1;
-            }
-            else
-            {
+                    nItems += 1;
+                }
+                else
+                {
 #ifdef  ISNAN
-                ss << "nCount += 1;\n\t\t";
+                    ss << "nCount += 1;\n\t\t";
 #endif
-                nItems += 1;
+                    nItems += 1;
+                }
             }
 #ifdef  ISNAN
             if(ocPush==vSubArguments[i]->GetFormulaToken()->GetOpCode())
@@ -810,13 +993,13 @@ public:
         const std::string &s, const FormulaTreeNodeRef& ft, SlidingFunctionBase* pCodeGen);
 
     /// Create buffer and pass the buffer to a given kernel
-    virtual size_t Marshal(cl_kernel k, int argno, int nVectorWidth)
+    virtual size_t Marshal(cl_kernel k, int argno, int nVectorWidth, cl_program pProgram)
     {
         unsigned i = 0;
         for (SubArgumentsType::iterator it = mvSubArguments.begin(), e= mvSubArguments.end(); it!=e;
                 ++it)
         {
-            i += (*it)->Marshal(k, argno + i, nVectorWidth);
+            i += (*it)->Marshal(k, argno + i, nVectorWidth, pProgram);
         }
         return i;
     }
@@ -910,7 +1093,7 @@ public:
     }
 private:
     SubArgumentsType mvSubArguments;
-    boost::scoped_ptr<SlidingFunctionBase> mpCodeGen;
+    boost::shared_ptr<SlidingFunctionBase> mpCodeGen;
 };
 
 boost::shared_ptr<DynamicKernelArgument> SoPHelper(
@@ -944,12 +1127,12 @@ DynamicKernelSoPArguments::DynamicKernelSoPArguments(
                     if (pDVR->GetArrays()[0].mpNumericArray)
                         mvSubArguments.push_back(
                                 SubArgument(new DynamicKernelSlidingArgument
-                                    <DynamicKernelArgument>(ts, ft->Children[i])));
+                                    <DynamicKernelArgument>(ts, ft->Children[i], mpCodeGen)));
                     else
                         mvSubArguments.push_back(
                                 SubArgument(new DynamicKernelSlidingArgument
                                     <DynamicKernelStringArgument>(
-                                        ts, ft->Children[i])));
+                                        ts, ft->Children[i], mpCodeGen)));
                 } else if (pChild->GetType() == formula::svSingleVectorRef) {
                     const formula::SingleVectorRefToken* pSVR =
                         dynamic_cast< const formula::SingleVectorRefToken* >(pChild);
@@ -1713,19 +1896,19 @@ public:
     }
     /// Memory mapping from host to device and pass buffers to the given kernel as
     /// arguments
-    void Marshal(cl_kernel, int);
+    void Marshal(cl_kernel, int, cl_program);
 private:
     unsigned int mCurId;
     ArgumentMap mSymbols;
     ArgumentList mParams;
 };
 
-void SymbolTable::Marshal(cl_kernel k, int nVectorWidth)
+void SymbolTable::Marshal(cl_kernel k, int nVectorWidth, cl_program pProgram)
 {
     int i = 1; //The first argument is reserved for results
     for(ArgumentList::iterator it = mParams.begin(), e= mParams.end(); it!=e;
             ++it) {
-        i+=(*it)->Marshal(k, i, nVectorWidth);
+        i+=(*it)->Marshal(k, i, nVectorWidth, pProgram);
     }
 }
 
@@ -1735,7 +1918,7 @@ public:
     DynamicKernel(FormulaTreeNodeRef r):mpRoot(r),
         mpProgram(NULL), mpKernel(NULL), mpResClmem(NULL) {}
     /// Code generation in OpenCL
-    std::string CodeGen() {
+    void CodeGen() {
         // Travese the tree of expression and declare symbols used
         const DynamicKernelArgument *DK= mSyms.DeclRefArg<
             DynamicKernelSoPArguments>(mpRoot, new OpNop);
@@ -1771,7 +1954,6 @@ public:
 #if 1
         std::cerr<< "Program to be compiled = \n" << mFullProgramSrc << "\n";
 #endif
-        return decl.str();
     }
     /// Produce kernel hash
     std::string GetMD5(void)
@@ -1817,7 +1999,7 @@ public:
         if (CL_SUCCESS != err)
             throw OpenCLError(err);
         // The rest of buffers
-        mSyms.Marshal(mpKernel, nr);
+        mSyms.Marshal(mpKernel, nr, mpProgram);
         size_t global_work_size[] = {nr};
         err = clEnqueueNDRangeKernel(kEnv.mpkCmdQueue, mpKernel, 1, NULL,
             global_work_size, NULL, 0, NULL, NULL);
@@ -1993,7 +2175,7 @@ bool FormulaGroupInterpreterOpenCL::interpret( ScDocument& rDoc,
     mpKernel = new DynamicKernel(Root);
 
     try {
-        std::string kSrc = mpKernel->CodeGen();
+        mpKernel->CodeGen();
         // Obtain cl context
         KernelEnv kEnv;
         OpenclDevice::setKernelEnv(&kEnv);
diff --git a/sc/source/core/opencl/opbase.hxx b/sc/source/core/opencl/opbase.hxx
index 4898962..41e4587 100644
--- a/sc/source/core/opencl/opbase.hxx
+++ b/sc/source/core/opencl/opbase.hxx
@@ -92,7 +92,7 @@ public:
     virtual void GenDeclRef(std::stringstream &ss) const;
 
     /// Create buffer and pass the buffer to a given kernel
-    virtual size_t Marshal(cl_kernel, int, int);
+    virtual size_t Marshal(cl_kernel, int, int, cl_program);
 
     virtual ~DynamicKernelArgument();