[Libreoffice-commits] core.git: 2 commits - sc/source
Wei Wei
weiwei at multicorewareinc.com
Fri Nov 15 16:02:40 PST 2013
sc/source/core/opencl/formulagroupcl.cxx | 424 +++++++++++++++++++++++++------
sc/source/core/opencl/op_math.cxx | 225 +++++++++++++---
sc/source/core/opencl/op_math.hxx | 4
sc/source/core/opencl/opbase.cxx | 43 +++
sc/source/core/opencl/opbase.hxx | 8
5 files changed, 588 insertions(+), 116 deletions(-)
New commits:
commit 1e3bc2925c0ec1b03d6ae7cf3f281b0df3ec88d3
Author: Wei Wei <weiwei at multicorewareinc.com>
Date: Fri Nov 15 17:33:19 2013 -0600
GPU Calc: implemented parallel reduction for SUMIFS
For now only works for fixed and sliding fixed-sized windows.
Change-Id: I25e3f893a86d0e1723ae1e1633ffeeee93926b8d
Signed-off-by: I-Jui (Ray) Sung <ray at multicorewareinc.com>
diff --git a/sc/source/core/opencl/formulagroupcl.cxx b/sc/source/core/opencl/formulagroupcl.cxx
index e4b6bfb..79c33ae 100644
--- a/sc/source/core/opencl/formulagroupcl.cxx
+++ b/sc/source/core/opencl/formulagroupcl.cxx
@@ -419,42 +419,51 @@ public:
bIsStartFixed = mpDVR->IsStartFixed();
bIsEndFixed = mpDVR->IsEndFixed();
}
+ virtual bool NeedParallelReduction(void) const
+ {
+ return GetWindowSize()> 100 &&
+ ( (GetStartFixed() && GetEndFixed()) ||
+ (!GetStartFixed() && !GetEndFixed()) ) ;
+ }
virtual void GenSlidingWindowFunction(std::stringstream &ss) {
- std::string name = Base::GetName();
- ss << "__kernel void "<<name;
- ss << "_reduction(__global double* A, "
- "__global double *result,int arrayLength,int windowSize){\n";
- ss << " double tmp, current_result = 0.0;\n";
- ss << " int writePos = get_group_id(1);\n";
- ss << " int offset = get_group_id(1);\n";
- ss << " int lidx = get_local_id(0);\n";
- ss << " __local double shm_buf[256];\n";
- ss << " if (arrayLength == windowSize)\n";
- ss << " offset = 0;\n";
- ss << " barrier(CLK_LOCAL_MEM_FENCE);\n";
- ss << " int loop = arrayLength/512 + 1;\n";
- ss << " for (int l=0; l<loop; l++){\n";
- ss << " tmp = 0.0;\n";
- ss << " int loopOffset = l*512;\n";
- ss << " if((loopOffset + lidx + offset + 256) < ( offset + windowSize))\n";
- ss << " tmp = A[loopOffset + lidx + offset] + "
- "A[loopOffset + lidx + offset + 256];\n";
- ss << " else if ((loopOffset + lidx + offset) < ( offset + windowSize))\n";
- ss << " tmp = A[loopOffset + lidx + offset];\n";
- ss << " shm_buf[lidx] = tmp;\n";
- ss << " barrier(CLK_LOCAL_MEM_FENCE);\n";
- ss << " for (int i = 128; i >0; i/=2) {\n";
- ss << " if (lidx < i)\n";
- ss << " shm_buf[lidx] += shm_buf[lidx + i];\n";
- ss << " barrier(CLK_LOCAL_MEM_FENCE);\n";
- ss << " }\n";
- ss << " if (lidx == 0)\n";
- ss << " current_result += shm_buf[0];\n";
- ss << " barrier(CLK_LOCAL_MEM_FENCE);\n";
- ss << " }\n";
- ss << " if (lidx == 0)\n";
- ss << " result[writePos] = current_result;\n";
- ss << "}\n";
+ if (dynamic_cast<OpSum*>(mpCodeGen.get()))
+ {
+ std::string name = Base::GetName();
+ ss << "__kernel void "<<name;
+ ss << "_reduction(__global double* A, "
+ "__global double *result,int arrayLength,int windowSize){\n";
+ ss << " double tmp, current_result = 0.0;\n";
+ ss << " int writePos = get_group_id(1);\n";
+ ss << " int offset = get_group_id(1);\n";
+ ss << " int lidx = get_local_id(0);\n";
+ ss << " __local double shm_buf[256];\n";
+ ss << " if (arrayLength == windowSize)\n";
+ ss << " offset = 0;\n";
+ ss << " barrier(CLK_LOCAL_MEM_FENCE);\n";
+ ss << " int loop = arrayLength/512 + 1;\n";
+ ss << " for (int l=0; l<loop; l++){\n";
+ ss << " tmp = 0.0;\n";
+ ss << " int loopOffset = l*512;\n";
+ ss << " if((loopOffset + lidx + offset + 256) < ( offset + windowSize))\n";
+ ss << " tmp = A[loopOffset + lidx + offset] + "
+ "A[loopOffset + lidx + offset + 256];\n";
+ ss << " else if ((loopOffset + lidx + offset) < ( offset + windowSize))\n";
+ ss << " tmp = A[loopOffset + lidx + offset];\n";
+ ss << " shm_buf[lidx] = tmp;\n";
+ ss << " barrier(CLK_LOCAL_MEM_FENCE);\n";
+ ss << " for (int i = 128; i >0; i/=2) {\n";
+ ss << " if (lidx < i)\n";
+ ss << " shm_buf[lidx] += shm_buf[lidx + i];\n";
+ ss << " barrier(CLK_LOCAL_MEM_FENCE);\n";
+ ss << " }\n";
+ ss << " if (lidx == 0)\n";
+ ss << " current_result += shm_buf[0];\n";
+ ss << " barrier(CLK_LOCAL_MEM_FENCE);\n";
+ ss << " }\n";
+ ss << " if (lidx == 0)\n";
+ ss << " result[writePos] = current_result;\n";
+ ss << "}\n";
+ }
}
@@ -573,11 +582,16 @@ public:
if (CL_SUCCESS != err)
throw OpenCLError(err);
// reproduce the reduction function name
- std::string kernelName = Base::GetName() + "_reduction";
+ std::string kernelName;
+ if (dynamic_cast<OpSum*>(mpCodeGen.get()))
+ kernelName = Base::GetName() + "_reduction";
+ else throw Unhandled();
+
cl_kernel redKernel = clCreateKernel(mpProgram, kernelName.c_str(), &err);
if (err != CL_SUCCESS)
throw OpenCLError(err);
// set kernel arg of reduction kernel
+ // TODO(Wei Wei): use unique name for kernel
err = clSetKernelArg(redKernel, 0, sizeof(cl_mem),
(void *)&(Base::mpClmem));
if (CL_SUCCESS != err)
@@ -621,6 +635,14 @@ public:
}
}
+ size_t GetArrayLength(void) const {return mpDVR->GetArrayLength(); }
+
+ size_t GetWindowSize(void) const {return mpDVR->GetRefRowSize(); }
+
+ size_t GetStartFixed(void) const {return bIsStartFixed; }
+
+ size_t GetEndFixed(void) const {return bIsEndFixed; }
+
protected:
bool bIsStartFixed, bIsEndFixed;
const formula::DoubleVectorRefToken *mpDVR;
@@ -1001,6 +1023,75 @@ public:
{
i += (*it)->Marshal(k, argno + i, nVectorWidth, pProgram);
}
+ if (OpSumIfs *OpSumCodeGen = dynamic_cast<OpSumIfs*>(mpCodeGen.get()))
+ {
+ assert(mpClmem == NULL);
+ // Obtain cl context
+ KernelEnv kEnv;
+ OpenclDevice::setKernelEnv(&kEnv);
+ cl_int err;
+ DynamicKernelSlidingArgument<DynamicKernelArgument> *slidingArgPtr =
+ dynamic_cast< DynamicKernelSlidingArgument<DynamicKernelArgument> *>
+ (mvSubArguments[0].get());
+ cl_mem mpClmem2;
+
+ if (OpSumCodeGen->NeedReductionKernel())
+ {
+ assert(slidingArgPtr);
+ size_t nInput = slidingArgPtr -> GetArrayLength();
+ size_t nCurWindowSize = slidingArgPtr -> GetWindowSize();
+ std::vector<cl_mem> vclmem;
+
+ for (SubArgumentsType::iterator it = mvSubArguments.begin(), e= mvSubArguments.end(); it!=e;
+ ++it)
+ {
+ vclmem.push_back((*it)->GetCLBuffer());
+ }
+ mpClmem2 = clCreateBuffer(kEnv.mpkContext, CL_MEM_READ_WRITE,
+ sizeof(double)*nVectorWidth, NULL, &err);
+ if (CL_SUCCESS != err)
+ throw OpenCLError(err);
+
+ std::string kernelName = "SumIfs_reduction";
+ cl_kernel redKernel = clCreateKernel(pProgram, kernelName.c_str(), &err);
+ if (err != CL_SUCCESS)
+ throw OpenCLError(err);
+
+ // set kernel arg of reduction kernel
+ for (size_t j=0; j< vclmem.size(); j++){
+ err = clSetKernelArg(redKernel, j, sizeof(cl_mem),
+ (void *)&vclmem[j]);
+ if (CL_SUCCESS != err)
+ throw OpenCLError(err);
+ }
+ err = clSetKernelArg(redKernel, vclmem.size(), sizeof(cl_mem), (void *)&mpClmem2);
+ if (CL_SUCCESS != err)
+ throw OpenCLError(err);
+
+ err = clSetKernelArg(redKernel, vclmem.size()+1, sizeof(cl_int), (void*)&nInput);
+ if (CL_SUCCESS != err)
+ throw OpenCLError(err);
+
+ err = clSetKernelArg(redKernel, vclmem.size()+2, sizeof(cl_int), (void*)&nCurWindowSize);
+ if (CL_SUCCESS != err)
+ throw OpenCLError(err);
+ // set work group size and execute
+ size_t global_work_size[] = {256, (size_t)nVectorWidth };
+ size_t local_work_size[] = {256, 1};
+ err = clEnqueueNDRangeKernel(kEnv.mpkCmdQueue, redKernel, 2, NULL,
+ global_work_size, local_work_size, 0, NULL, NULL);
+ if (CL_SUCCESS != err)
+ throw OpenCLError(err);
+ err = clFinish(kEnv.mpkCmdQueue);
+ if (CL_SUCCESS != err)
+ throw OpenCLError(err);
+
+ // Pass mpClmem2 to the "real" kernel
+ err = clSetKernelArg(k, argno, sizeof(cl_mem), (void *)&mpClmem2);
+ if (CL_SUCCESS != err)
+ throw OpenCLError(err);
+ }
+ }
return i;
}
diff --git a/sc/source/core/opencl/op_math.cxx b/sc/source/core/opencl/op_math.cxx
index 5e5c749..30eb759 100644
--- a/sc/source/core/opencl/op_math.cxx
+++ b/sc/source/core/opencl/op_math.cxx
@@ -448,6 +448,142 @@ void OpSumIfs::GenSlidingWindowFunction(std::stringstream &ss,
size_t nCurWindowSize = pCurDVR->GetArrayLength() <
pCurDVR->GetRefRowSize() ? pCurDVR->GetArrayLength():
pCurDVR->GetRefRowSize() ;
+
+ mNeedReductionKernel = vSubArguments[0]->NeedParallelReduction();
+ if (mNeedReductionKernel)
+ {
+ // generate reduction functions
+ ss << "__kernel void ";
+ ss << "SumIfs_reduction( ";
+ for (unsigned i = 0; i < vSubArguments.size(); i++)
+ {
+ if (i)
+ ss << ",";
+ vSubArguments[i]->GenSlidingWindowDecl(ss);
+ }
+ ss << ", __global double *result,int arrayLength,int windowSize";
+
+ ss << ")\n{\n";
+ ss << " double tmp =0;\n";
+ ss << " int i ;\n";
+
+ GenTmpVariables(ss,vSubArguments);
+ ss << " double current_result = 0.0;\n";
+ ss << " int writePos = get_group_id(1);\n";
+ if (pCurDVR->IsStartFixed() && pCurDVR->IsEndFixed())
+ ss << " int offset = 0;\n";
+ else if (!pCurDVR->IsStartFixed() && !pCurDVR->IsEndFixed())
+ ss << " int offset = get_group_id(1);\n";
+ else
+ throw Unhandled();
+ // actually unreachable
+ ss << " int lidx = get_local_id(0);\n";
+ ss << " __local double shm_buf[256];\n";
+ ss << " barrier(CLK_LOCAL_MEM_FENCE);\n";
+ ss << " int loop = arrayLength/512 + 1;\n";
+ ss << " for (int l=0; l<loop; l++){\n";
+ ss << " tmp = 0.0;\n";
+ ss << " int loopOffset = l*512;\n";
+
+ ss << " int p1 = loopOffset + lidx + offset, p2 = p1 + 256;\n";
+ ss << " if (p2 < min(offset + windowSize, arrayLength)) {\n";
+ ss << " tmp0 = 0.0;\n";
+ int mm=0;
+ std::string p1 = "p1";
+ std::string p2 = "p2";
+ for(unsigned j=1;j<vSubArguments.size();j+=2,mm++)
+ {
+ CheckSubArgumentIsNan2(ss,vSubArguments,j,p1);
+ CheckSubArgumentIsNan2(ss,vSubArguments,j+1,p1);
+ ss << "";
+ ss <<" if(isequal(";
+ ss <<"tmp";
+ ss <<j;
+ ss <<" , ";
+ ss << "tmp";
+ ss << j+1;
+ ss << "))";
+ ss << "{\n";
+ }
+ CheckSubArgumentIsNan2(ss,vSubArguments,0,p1);
+ ss << " tmp += tmp0;\n";
+ for(unsigned j=1;j<vSubArguments.size();j+=2,mm--)
+ {
+ for(int n = 0;n<mm+1;n++)
+ {
+ ss << " ";
+ }
+ ss<< "}\n\n";
+ }
+ mm=0;
+ for(unsigned j=1;j<vSubArguments.size();j+=2,mm++)
+ {
+ CheckSubArgumentIsNan2(ss,vSubArguments,j,p2);
+ CheckSubArgumentIsNan2(ss,vSubArguments,j+1,p2);
+ ss <<" if(isequal(";
+ ss <<"tmp";
+ ss <<j;
+ ss <<" , ";
+ ss << "tmp";
+ ss << j+1;
+ ss << ")){\n";
+ }
+ CheckSubArgumentIsNan2(ss,vSubArguments,0,p2);
+ ss << " tmp += tmp0;\n";
+ for(unsigned j=1;j< vSubArguments.size();j+=2,mm--)
+ {
+ for(int n = 0;n<mm+1;n++)
+ {
+ ss << " ";
+ }
+ ss<< "}\n";
+ }
+ ss << " }\n";
+
+ ss << " else if (p1 < min(arrayLength, offset + windowSize)) {\n";
+ mm=0;
+ for(unsigned j=1;j<vSubArguments.size();j+=2,mm++)
+ {
+ CheckSubArgumentIsNan2(ss,vSubArguments,j,p1);
+ CheckSubArgumentIsNan2(ss,vSubArguments,j+1,p1);
+
+ ss <<" if(isequal(";
+ ss <<"tmp";
+ ss <<j;
+ ss <<" , ";
+ ss << "tmp";
+ ss << j+1;
+ ss << ")){\n";
+ }
+ CheckSubArgumentIsNan2(ss,vSubArguments,0,p1);
+ ss << " tmp += tmp0;\n";
+ for(unsigned j=1;j<vSubArguments.size();j+=2,mm--)
+ {
+ for(int n = 0;n<mm+1;n++)
+ {
+ ss << " ";
+ }
+ ss<< "}\n\n";
+ }
+
+ ss << " }\n";
+ ss << " shm_buf[lidx] = tmp;\n";
+ ss << " barrier(CLK_LOCAL_MEM_FENCE);\n";
+ ss << " for (int i = 128; i >0; i/=2) {\n";
+ ss << " if (lidx < i)\n";
+ ss << " shm_buf[lidx] += shm_buf[lidx + i];\n";
+ ss << " barrier(CLK_LOCAL_MEM_FENCE);\n";
+ ss << " }\n";
+ ss << " if (lidx == 0)\n";
+ ss << " current_result += shm_buf[0];\n";
+ ss << " barrier(CLK_LOCAL_MEM_FENCE);\n";
+ ss << " }\n";
+
+ ss << " if (lidx == 0)\n";
+ ss << " result[writePos] = current_result;\n";
+ ss << "}\n";
+ }// finish generate reduction code
+ // generate functions as usual
ss << "\ndouble " << sSymName;
ss << "_"<< BinFuncName() <<"(";
for (unsigned i = 0; i < vSubArguments.size(); i++)
@@ -459,48 +595,57 @@ void OpSumIfs::GenSlidingWindowFunction(std::stringstream &ss,
ss << ")\n {\n";
ss <<" int gid0=get_global_id(0);\n";
ss << " double tmp =0;\n";
- ss << " int i ;\n";
- GenTmpVariables(ss,vSubArguments);
- ss << " for (i = ";
- if (!pCurDVR->IsStartFixed() && pCurDVR->IsEndFixed()) {
- ss << "gid0; i < "<< nCurWindowSize <<"; i++)\n";
- } else if (pCurDVR->IsStartFixed() && !pCurDVR->IsEndFixed()) {
- ss << "0; i < gid0+"<< nCurWindowSize <<"; i++)\n";
- } else {
- ss << "0; i < "<< nCurWindowSize <<"; i++)\n";
- }
- ss << " {\n";
- if(!pCurDVR->IsStartFixed() && !pCurDVR->IsEndFixed())
- {
- ss<< " int doubleIndex =i+gid0;\n";
- }else
- {
- ss<< " int doubleIndex =i;\n";
- }
- ss<< " int singleIndex =gid0;\n";
- int m=0;
- for(unsigned j=1;j<vSubArguments.size();j+=2,m++)
- {
- CheckSubArgumentIsNan(ss,vSubArguments,j);
- CheckSubArgumentIsNan(ss,vSubArguments,j+1);
- ss <<" if(isequal(";
- ss <<"tmp";
- ss <<j;
- ss <<" , ";
- ss << "tmp";
- ss << j+1;
- ss << ")){\n";
- }
- CheckSubArgumentIsNan(ss,vSubArguments,0);
- ss << " tmp += tmp0;\n";
- for(unsigned j=1;j<=vSubArguments.size();j+=2,m--)
- {
- for(int n = 0;n<m+1;n++)
+ if (!mNeedReductionKernel)
+ {
+ ss << " int i ;\n";
+ GenTmpVariables(ss,vSubArguments);
+ ss << " for (i = ";
+ if (!pCurDVR->IsStartFixed() && pCurDVR->IsEndFixed()) {
+ ss << "gid0; i < "<< nCurWindowSize <<"; i++)\n";
+ } else if (pCurDVR->IsStartFixed() && !pCurDVR->IsEndFixed()) {
+ ss << "0; i < gid0+"<< nCurWindowSize <<"; i++)\n";
+ } else {
+ ss << "0; i < "<< nCurWindowSize <<"; i++)\n";
+ }
+ ss << " {\n";
+ if(!pCurDVR->IsStartFixed() && !pCurDVR->IsEndFixed())
{
- ss << " ";
+ ss<< " int doubleIndex =i+gid0;\n";
+ }else
+ {
+ ss<< " int doubleIndex =i;\n";
}
- ss<< "}\n";
- }
+ ss<< " int singleIndex =gid0;\n";
+ int m=0;
+ for(unsigned j=1;j<vSubArguments.size();j+=2,m++)
+ {
+ CheckSubArgumentIsNan(ss,vSubArguments,j);
+ CheckSubArgumentIsNan(ss,vSubArguments,j+1);
+ ss <<" if(isequal(";
+ ss <<"tmp";
+ ss <<j;
+ ss <<" , ";
+ ss << "tmp";
+ ss << j+1;
+ ss << ")){\n";
+ }
+ CheckSubArgumentIsNan(ss,vSubArguments,0);
+ ss << " tmp += tmp0;\n";
+ for(unsigned j=1;j<=vSubArguments.size();j+=2,m--)
+ {
+ for(int n = 0;n<m+1;n++)
+ {
+ ss << " ";
+ }
+ ss<< "}\n";
+ }
+ }
+ if (mNeedReductionKernel)
+ {
+ ss << "tmp =";
+ vSubArguments[0]->GenDeclRef(ss);
+ ss << "[gid0];\n";
+ }
ss << "return tmp;\n";
ss << "}";
}
diff --git a/sc/source/core/opencl/op_math.hxx b/sc/source/core/opencl/op_math.hxx
index 01cbc82..7081b00 100644
--- a/sc/source/core/opencl/op_math.hxx
+++ b/sc/source/core/opencl/op_math.hxx
@@ -33,9 +33,13 @@ public:
class OpSumIfs: public CheckVariables
{
public:
+ OpSumIfs(void): CheckVariables(), mNeedReductionKernel(false) {}
virtual void GenSlidingWindowFunction(std::stringstream &ss,
const std::string sSymName, SubArguments &vSubArguments);
virtual std::string BinFuncName(void) const { return "SumIfs"; }
+ bool NeedReductionKernel(void) const { return mNeedReductionKernel; }
+protected:
+ bool mNeedReductionKernel;
};
class OpCosh: public Normal
{
diff --git a/sc/source/core/opencl/opbase.cxx b/sc/source/core/opencl/opbase.cxx
index 699833c..07425df 100644
--- a/sc/source/core/opencl/opbase.cxx
+++ b/sc/source/core/opencl/opbase.cxx
@@ -156,6 +156,49 @@ void CheckVariables::CheckSubArgumentIsNan( std::stringstream & ss,
ss << vSubArguments[i]->GenSlidingWindowDeclRef();
ss<<";\n";
}
+
+void CheckVariables::CheckSubArgumentIsNan2( std::stringstream & ss,
+ SubArguments &vSubArguments, int argumentNum, std::string p)
+{
+ int i = argumentNum;
+ if(vSubArguments[i]->GetFormulaToken()->GetType() == formula::svDouble)
+ {
+ ss <<" tmp";
+ ss <<i;
+ ss << "=";
+ vSubArguments[i]->GenDeclRef(ss);
+ ss<<";\n";
+ return;
+ }
+
+#ifdef ISNAN
+ ss<< " tmp";
+ ss<< i;
+ ss<< "= fsum(";
+ vSubArguments[i]->GenDeclRef(ss);
+ if(vSubArguments[i]->GetFormulaToken()->GetType() ==
+ formula::svDoubleVectorRef)
+ ss<<"["<< p.c_str()<< "]";
+ else if(vSubArguments[i]->GetFormulaToken()->GetType() ==
+ formula::svSingleVectorRef)
+ ss<<"[get_group_id(1)]";
+ ss<<", 0);\n";
+ return;
+#endif
+ ss <<" tmp";
+ ss <<i;
+ ss << "=";
+ vSubArguments[i]->GenDeclRef(ss);
+ if(vSubArguments[i]->GetFormulaToken()->GetType() ==
+ formula::svDoubleVectorRef)
+ ss<<"["<< p.c_str()<< "]";
+ else if(vSubArguments[i]->GetFormulaToken()->GetType() ==
+ formula::svSingleVectorRef)
+ ss<<"[get_group_id(1)]";
+
+ ss<<";\n";
+}
+
void CheckVariables::CheckAllSubArgumentIsNan(
std::stringstream & ss, SubArguments & vSubArguments)
{
diff --git a/sc/source/core/opencl/opbase.hxx b/sc/source/core/opencl/opbase.hxx
index 41e4587..6b475df 100644
--- a/sc/source/core/opencl/opbase.hxx
+++ b/sc/source/core/opencl/opbase.hxx
@@ -104,6 +104,9 @@ public:
virtual void DumpInlineFun(std::set<std::string>& ,
std::set<std::string>& ) const {}
const std::string& GetName(void) const { return mSymName; }
+ cl_mem GetCLBuffer(void) const {return mpClmem; }
+ virtual bool NeedParallelReduction(void) const { return false; }
+
protected:
const std::string mSymName;
FormulaTreeNodeRef mFormulaTree;
@@ -157,6 +160,9 @@ public:
SubArguments &vSubArguments, int argumentNum);
void CheckAllSubArgumentIsNan(std::stringstream &ss,
SubArguments &vSubArguments);
+ // only check isNan
+ void CheckSubArgumentIsNan2(std::stringstream &ss,
+ SubArguments &vSubArguments, int argumentNum, std::string p);
};
}}
commit 501bc66c780ab8fde801eeedc1f7c89762050713
Author: I-Jui (Ray) Sung <ray at multicorewareinc.com>
Date: Fri Nov 15 17:54:08 2013 -0600
GPU Calc: use parallel reduction to implement sum
Use reduction kernel when given a large DoubleVectorRef
Change-Id: Ifd4977b81be64274733909e43f0e5ef161bb455e
Signed-off-by: I-Jui (Ray) Sung <ray at multicorewareinc.com>
diff --git a/sc/source/core/opencl/formulagroupcl.cxx b/sc/source/core/opencl/formulagroupcl.cxx
index 3548480..e4b6bfb 100644
--- a/sc/source/core/opencl/formulagroupcl.cxx
+++ b/sc/source/core/opencl/formulagroupcl.cxx
@@ -44,7 +44,7 @@ namespace sc { namespace opencl {
/// Map the buffer used by an argument and do necessary argument setting
-size_t DynamicKernelArgument::Marshal(cl_kernel k, int argno, int)
+size_t DynamicKernelArgument::Marshal(cl_kernel k, int argno, int, cl_program)
{
FormulaToken *ref = mFormulaTree->GetFormulaToken();
assert(mpClmem == NULL);
@@ -125,7 +125,7 @@ public:
return 1;
}
/// Pass the 32-bit hash of the string to the kernel
- virtual size_t Marshal(cl_kernel k, int argno, int)
+ virtual size_t Marshal(cl_kernel k, int argno, int, cl_program)
{
FormulaToken *ref = mFormulaTree->GetFormulaToken();
assert(mpClmem == NULL);
@@ -183,7 +183,7 @@ public:
return 1;
}
/// Create buffer and pass the buffer to a given kernel
- virtual size_t Marshal(cl_kernel k, int argno, int)
+ virtual size_t Marshal(cl_kernel k, int argno, int, cl_program)
{
double tmp = 0.0;
// Pass the scalar result back to the rest of the formula kernel
@@ -222,7 +222,7 @@ public:
return 1;
}
/// Create buffer and pass the buffer to a given kernel
- virtual size_t Marshal(cl_kernel k, int argno, int)
+ virtual size_t Marshal(cl_kernel k, int argno, int, cl_program)
{
double tmp = 0.0;
// Pass the scalar result back to the rest of the formula kernel
@@ -264,7 +264,7 @@ public:
return 1;
}
/// Create buffer and pass the buffer to a given kernel
- virtual size_t Marshal(cl_kernel k, int argno, int)
+ virtual size_t Marshal(cl_kernel k, int argno, int, cl_program)
{
double tmp = 0.0;
// Pass the scalar result back to the rest of the formula kernel
@@ -292,11 +292,11 @@ public:
{
DynamicKernelStringArgument::GenDecl(ss);
}
- virtual size_t Marshal(cl_kernel, int, int);
+ virtual size_t Marshal(cl_kernel, int, int, cl_program);
};
/// Marshal a string vector reference
-size_t DynamicKernelStringArgument::Marshal(cl_kernel k, int argno, int)
+size_t DynamicKernelStringArgument::Marshal(cl_kernel k, int argno, int, cl_program)
{
FormulaToken *ref = mFormulaTree->GetFormulaToken();
assert(mpClmem == NULL);
@@ -390,10 +390,10 @@ public:
ss << ")";
return ss.str();
}
- virtual size_t Marshal(cl_kernel k, int argno, int vw)
+ virtual size_t Marshal(cl_kernel k, int argno, int vw, cl_program p)
{
- int i = DynamicKernelArgument::Marshal(k, argno, vw);
- i += mStringArgument.Marshal(k, argno+i, vw);
+ int i = DynamicKernelArgument::Marshal(k, argno, vw, p);
+ i += mStringArgument.Marshal(k, argno+i, vw, p);
return i;
}
protected:
@@ -402,24 +402,61 @@ protected:
/// Handling a Double Vector that is used as a sliding window input
/// to either a sliding window average or sum-of-products
+class OpSum; // Forward Declaration
template<class Base>
class DynamicKernelSlidingArgument: public Base
{
public:
DynamicKernelSlidingArgument(const std::string &s,
- FormulaTreeNodeRef ft):
- Base(s, ft)
+ FormulaTreeNodeRef ft, boost::shared_ptr<SlidingFunctionBase> &CodeGen):
+ Base(s, ft), mpCodeGen(CodeGen), needReductionKernel(true), mpClmem2(NULL)
{
FormulaToken *t = ft->GetFormulaToken();
if (t->GetType() != formula::svDoubleVectorRef)
throw Unhandled();
- const formula::DoubleVectorRefToken* pDVR =
- dynamic_cast<const formula::DoubleVectorRefToken *>(t);
- assert(pDVR);
- bIsStartFixed = pDVR->IsStartFixed();
- bIsEndFixed = pDVR->IsEndFixed();
+ mpDVR = dynamic_cast<const formula::DoubleVectorRefToken *>(t);
+ assert(mpDVR);
+ bIsStartFixed = mpDVR->IsStartFixed();
+ bIsEndFixed = mpDVR->IsEndFixed();
}
- virtual void GenSlidingWindowFunction(std::stringstream &) {}
+ virtual void GenSlidingWindowFunction(std::stringstream &ss) {
+ std::string name = Base::GetName();
+ ss << "__kernel void "<<name;
+ ss << "_reduction(__global double* A, "
+ "__global double *result,int arrayLength,int windowSize){\n";
+ ss << " double tmp, current_result = 0.0;\n";
+ ss << " int writePos = get_group_id(1);\n";
+ ss << " int offset = get_group_id(1);\n";
+ ss << " int lidx = get_local_id(0);\n";
+ ss << " __local double shm_buf[256];\n";
+ ss << " if (arrayLength == windowSize)\n";
+ ss << " offset = 0;\n";
+ ss << " barrier(CLK_LOCAL_MEM_FENCE);\n";
+ ss << " int loop = arrayLength/512 + 1;\n";
+ ss << " for (int l=0; l<loop; l++){\n";
+ ss << " tmp = 0.0;\n";
+ ss << " int loopOffset = l*512;\n";
+ ss << " if((loopOffset + lidx + offset + 256) < ( offset + windowSize))\n";
+ ss << " tmp = A[loopOffset + lidx + offset] + "
+ "A[loopOffset + lidx + offset + 256];\n";
+ ss << " else if ((loopOffset + lidx + offset) < ( offset + windowSize))\n";
+ ss << " tmp = A[loopOffset + lidx + offset];\n";
+ ss << " shm_buf[lidx] = tmp;\n";
+ ss << " barrier(CLK_LOCAL_MEM_FENCE);\n";
+ ss << " for (int i = 128; i >0; i/=2) {\n";
+ ss << " if (lidx < i)\n";
+ ss << " shm_buf[lidx] += shm_buf[lidx + i];\n";
+ ss << " barrier(CLK_LOCAL_MEM_FENCE);\n";
+ ss << " }\n";
+ ss << " if (lidx == 0)\n";
+ ss << " current_result += shm_buf[0];\n";
+ ss << " barrier(CLK_LOCAL_MEM_FENCE);\n";
+ ss << " }\n";
+ ss << " if (lidx == 0)\n";
+ ss << " result[writePos] = current_result;\n";
+ ss << "}\n";
+ }
+
virtual std::string GenSlidingWindowDeclRef(bool=false) const
{
@@ -430,8 +467,168 @@ public:
ss << Base::GetName() << "[i]";
return ss.str();
}
+ /// Controls how the elements in the DoubleVectorRef are traversed
+ virtual size_t GenLoop(std::stringstream &ss, bool &needBody)
+ {
+ assert(mpDVR);
+ size_t nCurWindowSize = mpDVR->GetRefRowSize();
+ if (dynamic_cast<OpSum*>(mpCodeGen.get()))
+ {
+ if (!bIsStartFixed && !bIsEndFixed)
+ {
+ // set 100 as a threshold for invoking reduction kernel
+ if (nCurWindowSize > 100 )
+ {
+ std::string temp = Base::GetName() + "[gid0]";
+ ss << "tmp = ";
+ ss << mpCodeGen->Gen2(temp, "tmp");
+ ss << ";\n\t";
+ needBody = false;
+ needReductionKernel = false;
+ return nCurWindowSize;
+ }
+ }
+
+ if (bIsStartFixed && bIsEndFixed)
+ {
+ // set 100 as a threshold for invoking reduction kernel
+ if (nCurWindowSize > 100 )
+ {
+ std::string temp = Base::GetName() + "[0]";
+ ss << "tmp = ";
+ ss << mpCodeGen->Gen2(temp, "tmp");
+ ss << ";\n\t";
+ needBody = false;
+ needReductionKernel = false;
+ return nCurWindowSize;
+ }
+ }
+ }
+ needBody = true;
+ needReductionKernel = true;
+ ss << "for (int i = ";
+ if (!bIsStartFixed && bIsEndFixed)
+ {
+#ifdef ISNAN
+ ss << "gid0; i < " << mpDVR->GetArrayLength();
+ ss << " && i < " << nCurWindowSize << "; i++){\n\t\t";
+#else
+ ss << "gid0; i < "<< nCurWindowSize << "; i++)\n\t\t";
+#endif
+ }
+ else if (bIsStartFixed && !bIsEndFixed)
+ {
+#ifdef ISNAN
+ ss << "0; i < " << mpDVR->GetArrayLength();
+ ss << " && i < gid0+"<< nCurWindowSize << "; i++){\n\t\t";
+#else
+ ss << "0; i < gid0+"<< nCurWindowSize << "; i++)\n\t\t";
+#endif
+ }
+ else if (!bIsStartFixed && !bIsEndFixed)
+ {
+#ifdef ISNAN
+ ss << "0; i + gid0 < " << mpDVR->GetArrayLength();
+ ss << " && i < "<< nCurWindowSize << "; i++){\n\t\t";
+#else
+ ss << "0; i < "<< nCurWindowSize << "; i++)\n\t\t";
+#endif
+ }
+ else
+ {
+#ifdef ISNAN
+ ss << "0; i < "<< nCurWindowSize << "; i++){\n\t\t";
+#else
+ ss << "0; i < "<< nCurWindowSize << "; i++)\n\t\t";
+#endif
+ }
+
+ return nCurWindowSize;
+ }
+
+ virtual size_t Marshal(cl_kernel k, int argno, int w, cl_program mpProgram)
+ {
+ if (needReductionKernel)
+ return Base::Marshal(k, argno, w, mpProgram);
+
+ assert(Base::mpClmem == NULL);
+ // Obtain cl context
+ KernelEnv kEnv;
+ OpenclDevice::setKernelEnv(&kEnv);
+ cl_int err;
+ size_t nInput = mpDVR->GetArrayLength();
+ size_t nCurWindowSize = mpDVR->GetRefRowSize();
+ // create clmem buffer
+ if (mpDVR->GetArrays()[0].mpNumericArray == NULL)
+ throw Unhandled();
+ double *pHostBuffer = const_cast<double*>(
+ mpDVR->GetArrays()[0].mpNumericArray);
+ size_t szHostBuffer = nInput * sizeof(double);
+ Base::mpClmem = clCreateBuffer(kEnv.mpkContext,
+ (cl_mem_flags) CL_MEM_READ_ONLY|CL_MEM_USE_HOST_PTR,
+ szHostBuffer,
+ pHostBuffer, &err);
+ mpClmem2 = clCreateBuffer(kEnv.mpkContext, CL_MEM_WRITE_ONLY,
+ sizeof(double)*w, NULL, NULL);
+ if (CL_SUCCESS != err)
+ throw OpenCLError(err);
+ // reproduce the reduction function name
+ std::string kernelName = Base::GetName() + "_reduction";
+ cl_kernel redKernel = clCreateKernel(mpProgram, kernelName.c_str(), &err);
+ if (err != CL_SUCCESS)
+ throw OpenCLError(err);
+ // set kernel arg of reduction kernel
+ err = clSetKernelArg(redKernel, 0, sizeof(cl_mem),
+ (void *)&(Base::mpClmem));
+ if (CL_SUCCESS != err)
+ throw OpenCLError(err);
+
+ err = clSetKernelArg(redKernel, 1, sizeof(cl_mem), (void *)&mpClmem2);
+ if (CL_SUCCESS != err)
+ throw OpenCLError(err);
+
+ err = clSetKernelArg(redKernel, 2, sizeof(cl_int), (void*)&nInput);
+ if (CL_SUCCESS != err)
+ throw OpenCLError(err);
+
+ err = clSetKernelArg(redKernel, 3, sizeof(cl_int), (void*)&nCurWindowSize);
+ if (CL_SUCCESS != err)
+ throw OpenCLError(err);
+
+ // set work group size and execute
+ size_t global_work_size[] = {256, (size_t)w };
+ size_t local_work_size[] = {256, 1};
+ err = clEnqueueNDRangeKernel(kEnv.mpkCmdQueue, redKernel, 2, NULL,
+ global_work_size, local_work_size, 0, NULL, NULL);
+ if (CL_SUCCESS != err)
+ throw OpenCLError(err);
+ err = clFinish(kEnv.mpkCmdQueue);
+ if (CL_SUCCESS != err)
+ throw OpenCLError(err);
+
+ // set kernel arg
+ err = clSetKernelArg(k, argno, sizeof(cl_mem), (void*)&(mpClmem2));
+ if (CL_SUCCESS != err)
+ throw OpenCLError(err);
+ return 1;
+ }
+ ~DynamicKernelSlidingArgument()
+ {
+ if (mpClmem2)
+ {
+ clReleaseMemObject(mpClmem2);
+ mpClmem2 = NULL;
+ }
+ }
+
protected:
bool bIsStartFixed, bIsEndFixed;
+ const formula::DoubleVectorRefToken *mpDVR;
+ // from parent nodes
+ boost::shared_ptr<SlidingFunctionBase> mpCodeGen;
+ // controls whether to invoke the reduction kernel during marshaling or not
+ bool needReductionKernel;
+ cl_mem mpClmem2;
};
/// Abstract class for code generation
@@ -439,6 +636,9 @@ protected:
class Reduction: public SlidingFunctionBase
{
public:
+ typedef DynamicKernelSlidingArgument<DynamicKernelArgument> NumericRange;
+ typedef DynamicKernelSlidingArgument<DynamicKernelStringArgument> StringRange;
+
virtual void GenSlidingWindowFunction(std::stringstream &ss,
const std::string sSymName, SubArguments &vSubArguments)
{
@@ -459,65 +659,48 @@ public:
size_t nItems = 0;
while (i--)
{
- FormulaToken *pCur = vSubArguments[i]->GetFormulaToken();
- assert(pCur);
- if (pCur->GetType() == formula::svDoubleVectorRef)
+ if (NumericRange *NR = dynamic_cast<NumericRange *> (vSubArguments[i].get()))
{
- const formula::DoubleVectorRefToken* pDVR =
- dynamic_cast<const formula::DoubleVectorRefToken *>(pCur);
- size_t nCurWindowSize = pDVR->GetRefRowSize();
- ss << "for (int i = ";
- if (!pDVR->IsStartFixed() && pDVR->IsEndFixed()) {
-#ifdef ISNAN
- ss << "gid0; i < " << pDVR->GetArrayLength();
- ss << " && i < " << nCurWindowSize << "; i++){\n\t\t";
-#else
- ss << "gid0; i < "<< nCurWindowSize << "; i++)\n\t\t";
-#endif
- } else if (pDVR->IsStartFixed() && !pDVR->IsEndFixed()) {
-#ifdef ISNAN
- ss << "0; i < " << pDVR->GetArrayLength();
- ss << " && i < gid0+"<< nCurWindowSize << "; i++){\n\t\t";
-#else
- ss << "0; i < gid0+"<< nCurWindowSize << "; i++)\n\t\t";
-#endif
- } else if (!pDVR->IsStartFixed() && !pDVR->IsEndFixed()){
-#ifdef ISNAN
- ss << "0; i + gid0 < " << pDVR->GetArrayLength();
- ss << " && i < "<< nCurWindowSize << "; i++){\n\t\t";
-#else
- ss << "0; i < "<< nCurWindowSize << "; i++)\n\t\t";
-#endif
- }
- else
- {
- ss << "0; i < "<< pDVR->GetArrayLength() << "; i++){\n\t\t";
- }
- nItems += nCurWindowSize;
+ bool needBody;
+ nItems += NR->GenLoop(ss, needBody);
+ if (needBody == false) continue;
}
- else if (pCur->GetType() == formula::svSingleVectorRef)
+ else if (StringRange *SR = dynamic_cast<StringRange *> (vSubArguments[i].get()))
{
+ bool needBody;
+ nItems += SR->GenLoop(ss, needBody); //did not handle yet
+ if (needBody == false) continue;
+ }
+ else
+ {
+ FormulaToken *pCur = vSubArguments[i]->GetFormulaToken();
+ assert(pCur);
+ assert(pCur->GetType() != formula::svDoubleVectorRef);
+
+ if (pCur->GetType() == formula::svSingleVectorRef)
+ {
#ifdef ISNAN
const formula::SingleVectorRefToken* pSVR =
- dynamic_cast< const formula::SingleVectorRefToken* >(pCur);
- ss << "if (gid0 < " << pSVR->GetArrayLength() << "){\n\t\t";
+ dynamic_cast< const formula::SingleVectorRefToken* >(pCur);
+ ss << "if (gid0 < " << pSVR->GetArrayLength() << "){\n\t\t";
#else
- nItems += 1;
+ nItems += 1;
#endif
- }
- else if (pCur->GetType() == formula::svDouble)
- {
+ }
+ else if (pCur->GetType() == formula::svDouble)
+ {
#ifdef ISNAN
- ss << "{\n\t\t";
+ ss << "{\n\t\t";
#endif
- nItems += 1;
- }
- else
- {
+ nItems += 1;
+ }
+ else
+ {
#ifdef ISNAN
- ss << "nCount += 1;\n\t";
+ ss << "nCount += 1;\n\t\t";
#endif
- nItems += 1;
+ nItems += 1;
+ }
}
#ifdef ISNAN
if(ocPush==vSubArguments[i]->GetFormulaToken()->GetOpCode())
@@ -810,13 +993,13 @@ public:
const std::string &s, const FormulaTreeNodeRef& ft, SlidingFunctionBase* pCodeGen);
/// Create buffer and pass the buffer to a given kernel
- virtual size_t Marshal(cl_kernel k, int argno, int nVectorWidth)
+ virtual size_t Marshal(cl_kernel k, int argno, int nVectorWidth, cl_program pProgram)
{
unsigned i = 0;
for (SubArgumentsType::iterator it = mvSubArguments.begin(), e= mvSubArguments.end(); it!=e;
++it)
{
- i += (*it)->Marshal(k, argno + i, nVectorWidth);
+ i += (*it)->Marshal(k, argno + i, nVectorWidth, pProgram);
}
return i;
}
@@ -910,7 +1093,7 @@ public:
}
private:
SubArgumentsType mvSubArguments;
- boost::scoped_ptr<SlidingFunctionBase> mpCodeGen;
+ boost::shared_ptr<SlidingFunctionBase> mpCodeGen;
};
boost::shared_ptr<DynamicKernelArgument> SoPHelper(
@@ -944,12 +1127,12 @@ DynamicKernelSoPArguments::DynamicKernelSoPArguments(
if (pDVR->GetArrays()[0].mpNumericArray)
mvSubArguments.push_back(
SubArgument(new DynamicKernelSlidingArgument
- <DynamicKernelArgument>(ts, ft->Children[i])));
+ <DynamicKernelArgument>(ts, ft->Children[i], mpCodeGen)));
else
mvSubArguments.push_back(
SubArgument(new DynamicKernelSlidingArgument
<DynamicKernelStringArgument>(
- ts, ft->Children[i])));
+ ts, ft->Children[i], mpCodeGen)));
} else if (pChild->GetType() == formula::svSingleVectorRef) {
const formula::SingleVectorRefToken* pSVR =
dynamic_cast< const formula::SingleVectorRefToken* >(pChild);
@@ -1713,19 +1896,19 @@ public:
}
/// Memory mapping from host to device and pass buffers to the given kernel as
/// arguments
- void Marshal(cl_kernel, int);
+ void Marshal(cl_kernel, int, cl_program);
private:
unsigned int mCurId;
ArgumentMap mSymbols;
ArgumentList mParams;
};
-void SymbolTable::Marshal(cl_kernel k, int nVectorWidth)
+void SymbolTable::Marshal(cl_kernel k, int nVectorWidth, cl_program pProgram)
{
int i = 1; //The first argument is reserved for results
for(ArgumentList::iterator it = mParams.begin(), e= mParams.end(); it!=e;
++it) {
- i+=(*it)->Marshal(k, i, nVectorWidth);
+ i+=(*it)->Marshal(k, i, nVectorWidth, pProgram);
}
}
@@ -1816,7 +1999,7 @@ public:
if (CL_SUCCESS != err)
throw OpenCLError(err);
// The rest of buffers
- mSyms.Marshal(mpKernel, nr);
+ mSyms.Marshal(mpKernel, nr, mpProgram);
size_t global_work_size[] = {nr};
err = clEnqueueNDRangeKernel(kEnv.mpkCmdQueue, mpKernel, 1, NULL,
global_work_size, NULL, 0, NULL, NULL);
diff --git a/sc/source/core/opencl/opbase.hxx b/sc/source/core/opencl/opbase.hxx
index 4898962..41e4587 100644
--- a/sc/source/core/opencl/opbase.hxx
+++ b/sc/source/core/opencl/opbase.hxx
@@ -92,7 +92,7 @@ public:
virtual void GenDeclRef(std::stringstream &ss) const;
/// Create buffer and pass the buffer to a given kernel
- virtual size_t Marshal(cl_kernel, int, int);
+ virtual size_t Marshal(cl_kernel, int, int, cl_program);
virtual ~DynamicKernelArgument();
More information about the Libreoffice-commits
mailing list