[Libreoffice-commits] core.git: sc/source
Haidong Lian
haidong at multicorewareinc.com
Fri Aug 30 12:57:34 PDT 2013
sc/source/core/opencl/formulagroupcl.cxx | 1151 ++++++++++++++++++--------
sc/source/core/opencl/oclkernels.hxx | 239 ++++-
sc/source/core/opencl/openclwrapper.cxx | 1361 +++++++++++++++++++++++--------
sc/source/core/opencl/openclwrapper.hxx | 182 ++--
4 files changed, 2189 insertions(+), 744 deletions(-)
New commits:
commit e791fbfc0435f4a9522288154132df2760ef14a2
Author: Haidong Lian <haidong at multicorewareinc.com>
Date: Fri Aug 30 15:35:17 2013 -0400
Patch for milestone1-0829-v4.
1. Add the parser based on RPN;
2. For test sample1 named "ground-water-daily.xls", using the compound formula
to do calculation;
Add the compound kernels:
Formulae include "AVERAGE,MAX and MIN".Compound formulae include "AVERAGE
-(+,*,/)","MAX -(+,*,/)" and "MIN -(+,*,/)";
3. For formulae which do not work in GPU, they'll work in CPU;
4. For compound operators(-,+,*,/), they'll be calculated one by one in GPU as
the sequence of RPN;
5. Add the start and end position to fit for the sliding window;
6. Modify kernels by using vector for AMD GPU.
Conflicts:
sc/source/core/opencl/formulagroupcl.cxx
sc/source/core/opencl/openclwrapper.cxx
Change-Id: I6157008575ce89ddd3e7bf552a87812474af4125
diff --git a/sc/source/core/opencl/formulagroupcl.cxx b/sc/source/core/opencl/formulagroupcl.cxx
index 274af4e..a835c46 100644
--- a/sc/source/core/opencl/formulagroupcl.cxx
+++ b/sc/source/core/opencl/formulagroupcl.cxx
@@ -19,6 +19,10 @@
#include "openclwrapper.hxx"
+#define SRCDATASIZE 100
+#define SINGLEARRAYLEN 100
+#define DOUBLEARRAYLEN 100
+#define SVDOUBLELEN 100
namespace sc {
// A single public entry point for a factory function:
@@ -38,30 +42,632 @@ double getTimeDiff(const TimeValue& t1, const TimeValue& t2)
}//dbg-t
TimeValue aTimeBefore, aTimeAfter;
///////////////////////////////////////
+class SourceData
+{
+ const double *mdpSrcPtr;
+ unsigned int mnDataSize;
+ const char *mcpFormulaName;
+ unsigned int mnCol;
+ int eOp;
+public:
+ SourceData( const double *dpData, unsigned int nSize, uint nCol = 1,const char *cpFormulaName = NULL):mdpSrcPtr(dpData),mnDataSize(nSize),mcpFormulaName(cpFormulaName),mnCol(nCol)
+ {
+ }
+ SourceData():mdpSrcPtr(NULL),mnDataSize(0)
+ {
+ }
+ void setSrcPtr( const double *dpTmpDataPtr)
+ {
+ mdpSrcPtr = dpTmpDataPtr;
+ }
+ void setSrcSize( int nSize )
+ {
+ mnDataSize = nSize;
+ }
+ const double * getDouleData()
+ {
+ return mdpSrcPtr;
+ }
+ unsigned int getDataSize()
+ {
+ return mnDataSize;
+ }
+ void print()
+ {
+ for( uint i=0; i<mnDataSize; i++ )
+ printf( " The SourceData is %f and data size is %d\n",mdpSrcPtr[i],mnDataSize );
+ }
+ void printFormula()
+ {
+ printf("--------The formulaname is %s and the eOp is %d---------\n",mcpFormulaName,eOp);
+ }
+ void setFormulaName(const char *cpFormulaName)
+ {
+ this->mcpFormulaName = cpFormulaName;
+ }
+ const char *getFormulaName()
+ {
+ return mcpFormulaName;
+ }
+ void seteOp(int op)
+ {
+ this->eOp = op;
+ }
+ int geteOp()
+ {
+ return eOp;
+ }
+ int getColNum()
+ {
+ return mnCol;
+ }
+
+};
class FormulaGroupInterpreterOpenCL : public FormulaGroupInterpreterSoftware
{
+ SourceData *mSrcDataStack[SRCDATASIZE];
+ unsigned int mnStackPointer,mnDoublePtrCount;
+ uint * mnpOclStartPos;
+ uint * mnpOclEndPos;
+ SingleVectorFormula *mSingleArray[SINGLEARRAYLEN];
+ DoubleVectorFormula *mDoubleArray[DOUBLEARRAYLEN];
+ double mdpSvdouble[SVDOUBLELEN];
+ double *mdpSrcDoublePtr[SVDOUBLELEN];
+ uint mnSingleCount;
+ uint mnDoubleCount;
+ uint mnSvDoubleCount;
+ uint mnOperatorGroup[100];
+ uint mnOperatorCount;
+ char mcHostName[100];
+ uint mnPositonLen;
+ size_t mnRowSize;
public:
FormulaGroupInterpreterOpenCL() :
FormulaGroupInterpreterSoftware()
{
- OclCalc::InitEnv();
+ mnStackPointer = 0;
+ mnpOclEndPos = NULL;
+ mnpOclStartPos = NULL;
+ mnSingleCount = 0;
+ mnDoubleCount = 0;
+ mnSvDoubleCount = 0;
+ mnOperatorCount = 0;
+ mnPositonLen = 0;
+ mnDoublePtrCount = 0;
+ OclCalc::initEnv();
}
virtual ~FormulaGroupInterpreterOpenCL()
{
- OclCalc::ReleaseOpenclRunEnv();
+ OclCalc::releaseOpenclRunEnv();
}
- virtual ScMatrixRef inverseMatrix(const ScMatrix& rMat);
- virtual bool interpret(ScDocument& rDoc, const ScAddress& rTopPos,
- const ScFormulaCellGroupRef& xGroup, ScTokenArray& rCode);
+ virtual ScMatrixRef inverseMatrix( const ScMatrix& rMat );
+ virtual bool interpret( ScDocument& rDoc, const ScAddress& rTopPos,
+ const ScFormulaCellGroupRef& xGroup, ScTokenArray& rCode );
+ void collectDoublePointers( double *temp )
+ {
+ if( mnDoublePtrCount < SRCDATASIZE )
+ {
+ mdpSrcDoublePtr[mnDoublePtrCount++] = temp;
+ }
+ else
+ {
+ printf( "The mdpSrcDoublePtr is full now.\n" );
+ double *dtmp = NULL;
+ if ( (dtmp = mdpSrcDoublePtr[--mnDoublePtrCount]) != NULL )
+ {
+ free( dtmp );
+ dtmp = NULL;
+ }
+ }
+ }
+
+ void freeDoublePointers()
+ {
+ while( mnDoublePtrCount > 0 )
+ {
+ double *dtmp = NULL;
+ if ( (dtmp = mdpSrcDoublePtr[--mnDoublePtrCount]) != NULL )
+ {
+ free( dtmp );
+ dtmp = NULL;
+ }
+ }
+ }
+
+
+ void srdDataPush( SourceData *temp )
+ {
+ if( mnStackPointer < SRCDATASIZE )
+ {
+ mSrcDataStack[mnStackPointer++] = temp;
+ }
+ else
+ printf( "The stack is full now.\n" );
+ }
+ SourceData *srdDataPop( void )
+ {
+ if( mnStackPointer <= 0 )
+ {
+ printf( "The stack was empty\n" );
+ return NULL;
+ }
+ return mSrcDataStack[--mnStackPointer];
+ }
+ unsigned int getDataSize()
+ {
+ return mnStackPointer;
+ }
+ void printStackInfo()
+ {
+ printf( "/********The stack size is %d*********\\\n",mnStackPointer );
+ for ( int i = mnStackPointer - 1; i >= 0; i-- )
+ mSrcDataStack[i]->print();
+ }
+ bool getPosition(const ScTokenArray& rCode,const ScFormulaCellGroupRef& xGroup,uint nRowSize,uint *&npOclStartPos,uint *&npOclEndPos,uint *nPositonLen);
+ bool chooseFunction(OclCalc &ocl_calc,double *&dpResult);
+ bool isStockHistory();
+ bool isGroundWater();
+};
+bool FormulaGroupInterpreterOpenCL::getPosition(const ScTokenArray& rCode,const ScFormulaCellGroupRef& xGroup,uint nRowSize,uint *&npOclStartPos,uint *&npOclEndPos,uint *nPositonLen)
+{
+ uint nColPosition = 0;
+ ScTokenArray * rCodePos = rCode.Clone();
+ static int nCountPosSize = nRowSize;
+ bool isAllocFormulaOclBuf = true;
+ for ( const formula::FormulaToken* p = rCodePos->First(); p; p = rCodePos->Next() )
+ {
+ switch ( p->GetType() )
+ {
+ case formula::svDoubleVectorRef:
+ {
+ nColPosition++;
+ break;
+ }
+ }
+ }
+ int nPositionSize = nColPosition * nRowSize;
+ npOclStartPos = (unsigned int*) malloc( nPositionSize * sizeof(unsigned int) );
+ npOclEndPos = (unsigned int*) malloc( nPositionSize * sizeof(unsigned int) );
+ if ( nCountPosSize < nPositionSize )
+ {
+ nCountPosSize = nPositionSize;
+ isAllocFormulaOclBuf = false;
+ }
+ for ( sal_Int32 i = 0; i < xGroup->mnLength; ++i )
+ {
+ ScTokenArray * rCodeTemp = rCode.Clone();
+ int j = 0;
+ for ( const formula::FormulaToken* p = rCodeTemp->First(); p; p = rCodeTemp->Next() )
+ {
+ switch (p->GetType())
+ {
+ case formula::svDoubleVectorRef:
+ {
+ const formula::DoubleVectorRefToken* p2 = static_cast<const formula::DoubleVectorRefToken*>(p);
+ size_t nRowStart = p2->IsStartFixed() ? 0 : i;
+ size_t nRowEnd = p2->GetRefRowSize() - 1;
+ if (!p2->IsEndFixed())
+ nRowEnd += i;
+ npOclStartPos[j*nRowSize+i] = nRowStart;//record the start position
+ npOclEndPos[j*nRowSize+i] = nRowEnd;//record the end position
+ j++;
+ }
+ }
+ }
+ }
+ *nPositonLen = nPositionSize;
+ //Now the pos array is 0 1 2 3 4 5 0 1 2 3 4 5;
+ return isAllocFormulaOclBuf;
+}
+
+bool FormulaGroupInterpreterOpenCL::isStockHistory()
+{
+ bool isHistory = false;
+ if( (mnOperatorGroup[0]== 224) && (mnOperatorGroup[1]== 227) && (mnOperatorGroup[2]== 41) && (mnOperatorGroup[3]== 43) && (mnOperatorGroup[4]== 41) )
+ {
+ strcpy(mcHostName,"OclOperationColumnN");
+ isHistory = true;
+ }
+ else if( (mnOperatorGroup[0] == 226) && (mnOperatorGroup[1] == 42) )
+ {
+ strcpy(mcHostName,"OclOperationColumnH");
+ isHistory = true;
+ }
+ else if((mnOperatorGroup[0] == 213) && (mnOperatorGroup[1] == 43) && (mnOperatorGroup[2] == 42) )
+ {
+ strcpy(mcHostName,"OclOperationColumnJ");
+ isHistory = true;
+ }
+ return isHistory;
+}
+
+bool FormulaGroupInterpreterOpenCL::isGroundWater()
+{
+ bool GroundWater=false;
+
+ if((mnOperatorGroup[0] == ocAverage && 1 == mnSingleCount )||(mnOperatorGroup[0] == ocMax && 1 == mnSingleCount )||
+ (mnOperatorGroup[0] == ocMin && 1 == mnSingleCount )||(mnOperatorGroup[0] == ocSub && mnSvDoubleCount==1))
+ {
+ GroundWater = true;
+ }
+ return GroundWater;
+}
+
+bool FormulaGroupInterpreterOpenCL::chooseFunction( OclCalc &ocl_calc, double *&dpResult )
+{
+ const double * dpOclSrcData = NULL;
+ unsigned int nSrcDataSize = 0;
+ const double *dpLeftData = NULL;
+ const double *dpRightData = NULL;
+ if((mnOperatorGroup[0] == ocAverage && 1 == mnSingleCount )||(mnOperatorGroup[0] == ocMax && 1 == mnSingleCount )||
+ (mnOperatorGroup[0] == ocMin && 1 == mnSingleCount )||(mnOperatorGroup[0] == ocSub && mnSvDoubleCount==1))
+ {
+ double delta = 0.0;
+ const double *pArrayToSubtractOneElementFrom;
+ const double *pGroundWaterDataArray;
+ uint nSrcData = 0;
+ if( mnSvDoubleCount!=1 )
+ {
+ pArrayToSubtractOneElementFrom= mSingleArray[0]->mdpInputLeftData;
+ pGroundWaterDataArray= mDoubleArray[0]->mdpInputData;
+ nSrcData = mDoubleArray[0]->mnInputDataSize;
+ }
+ else
+ {
+ pArrayToSubtractOneElementFrom= mSingleArray[0]->mdpInputLeftData;
+ pGroundWaterDataArray=NULL;
+ delta = mdpSvdouble[0];
+ }
+ ocl_calc.oclGroundWaterGroup( mnOperatorGroup,mnOperatorCount,pGroundWaterDataArray,pArrayToSubtractOneElementFrom,nSrcData,mnRowSize,delta,mnpOclStartPos,mnpOclEndPos,dpResult);
+ }
+ else if( isStockHistory() )
+ {
+ return false;
+ }
+ else if(((mnSvDoubleCount==0)&&(mnSingleCount==0)&&(mnDoubleCount==1)) &&
+ ((mnOperatorGroup[0] == ocAverage)||(mnOperatorGroup[0] == ocMax)||(mnOperatorGroup[0] == ocMin)))
+ {
+ if(mnOperatorGroup[0] == ocAverage)
+ strcpy(mcHostName,"oclFormulaAverage");
+ if(mnOperatorGroup[0] == ocMax)
+ strcpy(mcHostName,"oclFormulaMax");
+ if(mnOperatorGroup[0] == ocMin)
+ strcpy(mcHostName,"oclFormulaMin");
+ DoubleVectorFormula * doubleTemp = mDoubleArray[--mnDoubleCount];
+ nSrcDataSize = doubleTemp->mnInputDataSize;
+ dpOclSrcData = doubleTemp->mdpInputData;
+ if ( ocl_calc.getOpenclState())
+ {
+ if ( ocl_calc.gpuEnv.mnKhrFp64Flag==1 || ocl_calc.gpuEnv.mnAmdFp64Flag == 1 )
+ {
+ ocl_calc.createFormulaBuf64Bits( nSrcDataSize, mnRowSize );
+ ocl_calc.mapAndCopy64Bits( dpOclSrcData,mnpOclStartPos,mnpOclEndPos,nSrcDataSize,mnRowSize );
+ ocl_calc.oclHostFormulaStatistics64Bits( mcHostName, dpResult, mnRowSize );
+ }
+ else
+ {
+ ocl_calc.createFormulaBuf32Bits( nSrcDataSize, mnPositonLen );
+ ocl_calc.mapAndCopy32Bits( dpOclSrcData, mnpOclStartPos, mnpOclEndPos, nSrcDataSize, mnRowSize);
+ ocl_calc.oclHostFormulaStatistics32Bits( mcHostName, dpResult, mnRowSize );
+ }
+ }
+ }
+ else if((mnSvDoubleCount==0)&&(mnSingleCount==1)&&(mnDoubleCount==0))
+ {
+ dpLeftData = mSingleArray[0]->mdpInputLeftData;
+ dpRightData = mSingleArray[0]->mdpInputRightData;
+ if(mnOperatorGroup[0] == ocAdd)
+ strcpy(mcHostName,"oclSignedAdd");
+ if(mnOperatorGroup[0] == ocSub)
+ strcpy(mcHostName,"oclSignedSub");
+ if(mnOperatorGroup[0] == ocMul)
+ strcpy(mcHostName,"oclSignedMul");
+ if(mnOperatorGroup[0] == ocDiv)
+ strcpy(mcHostName,"oclSignedDiv");
+ if ( ocl_calc.getOpenclState())
+ {
+ if ( ocl_calc.gpuEnv.mnKhrFp64Flag == 1 || ocl_calc.gpuEnv.mnAmdFp64Flag == 1 )
+ {
+ ocl_calc.createArithmeticOptBuf64Bits( mnRowSize );
+ ocl_calc.mapAndCopy64Bits(dpLeftData,dpRightData,mnRowSize);
+ ocl_calc.oclHostArithmeticOperator64Bits( mcHostName,dpResult,mnRowSize );
+ }
+ else
+ {
+ ocl_calc.createArithmeticOptBuf32Bits( mnRowSize );
+ ocl_calc.mapAndCopy32Bits(dpLeftData,dpRightData,mnRowSize);
+ ocl_calc.oclHostArithmeticOperator32Bits( mcHostName,dpResult,mnRowSize );
+ }
+ }
+ }
+ else if( (mnSingleCount>1) && (mnSvDoubleCount==0) && (mnDoubleCount==0) )
+ {
+ const double* dpArray[100] = {};
+ int j=0;
+ for( uint i = 0; i < mnSingleCount; i++ )
+ {
+ dpArray[j++] = mSingleArray[i]->mdpInputLeftData;
+ if( NULL != mSingleArray[i]->mdpInputRightData )
+ dpArray[j++] = mSingleArray[i]->mdpInputRightData;
+ }
+ double *dpMoreColArithmetic = (double *)malloc( sizeof(double) * j * mnRowSize );
+ if( NULL == dpMoreColArithmetic )
+ {
+ printf("Memory alloc error!\n");
+ return false;
+ }
+ for( uint i = 0; i < j*mnRowSize; i++ )
+ {
+ dpMoreColArithmetic[i] = dpArray[i/mnRowSize][i%mnRowSize];
+ }
+ if ( ocl_calc.getOpenclState())
+ {
+ if ( ocl_calc.gpuEnv.mnKhrFp64Flag == 1 || ocl_calc.gpuEnv.mnAmdFp64Flag == 1 )
+ {
+ ocl_calc.createMoreColArithmeticBuf64Bits( j * mnRowSize, mnOperatorCount );
+ ocl_calc.mapAndCopyMoreColArithmetic64Bits( dpMoreColArithmetic, mnRowSize * j, mnOperatorGroup, mnOperatorCount );
+ ocl_calc.oclMoreColHostArithmeticOperator64Bits( mnRowSize, mnOperatorCount, dpResult,mnRowSize );
+ }
+ else
+ {
+ ocl_calc.createMoreColArithmeticBuf32Bits( j* mnRowSize, mnOperatorCount );
+ ocl_calc.mapAndCopyMoreColArithmetic32Bits(dpMoreColArithmetic, mnRowSize * j, mnOperatorGroup, mnOperatorCount);
+ ocl_calc.oclMoreColHostArithmeticOperator32Bits( mnRowSize, mnOperatorCount, dpResult, mnRowSize );
+ }
+ }
+ }
+ else
+ {
+ return false;
+ }
+ return true;
+}
+
+class agency
+{
+public:
+ double *calculate(int nOclOp,int rowSize,OclCalc &ocl_calc,uint *npOclStartPos,uint *npOclEndPos,FormulaGroupInterpreterOpenCL *formulaInterprt);
};
-ScMatrixRef FormulaGroupInterpreterOpenCL::inverseMatrix(const ScMatrix& rMat)
+double * agency::calculate( int nOclOp,int rowSize,OclCalc &ocl_calc,uint *npOclStartPos,uint *npOclEndPos,FormulaGroupInterpreterOpenCL *formulaInterprt)
+{
+ const double *dpLeftData = NULL;
+ const double *dpRightData = NULL;
+ const double *dpOclSrcData=NULL;
+ if ( ocl_calc.gpuEnv.mnKhrFp64Flag == 1 || ocl_calc.gpuEnv.mnAmdFp64Flag == 1 )
+ {
+ switch( nOclOp )
+ {
+ case ocAdd:
+ {
+ unsigned int nDataSize = 0;
+ SourceData *temp = formulaInterprt->srdDataPop();
+ SourceData *temp2 = formulaInterprt->srdDataPop();
+ nDataSize = temp2->getDataSize();
+ dpLeftData = temp2->getDouleData();
+ dpRightData = temp->getDouleData();
+ nDataSize = temp2->getDataSize();
+ double *rResult = NULL; // Point to the output data from GPU
+ rResult = (double *)malloc( sizeof(double) * nDataSize );
+ memset(rResult,0,rowSize);
+ ocl_calc.oclHostArithmeticStash64Bits( "oclSignedAdd",dpLeftData,dpRightData,rResult,temp->getDataSize() );
+ formulaInterprt->srdDataPush( new SourceData( rResult,nDataSize ) );
+ break;
+ }
+ case ocSub:
+ {
+ unsigned int nDataSize = 0;
+ SourceData *temp = formulaInterprt->srdDataPop();
+ SourceData *temp2 = formulaInterprt->srdDataPop();
+ nDataSize = temp2->getDataSize();
+ dpLeftData = temp2->getDouleData();
+ dpRightData = temp->getDouleData();
+ nDataSize = temp2->getDataSize();
+ double *rResult = NULL; // Point to the output data from GPU
+ rResult = ( double * )malloc( sizeof(double) * nDataSize );
+ memset( rResult,0,rowSize );
+ ocl_calc.oclHostArithmeticStash64Bits( "oclSignedSub",dpLeftData,dpRightData,rResult,temp->getDataSize() );
+ formulaInterprt->srdDataPush( new SourceData(rResult,nDataSize) );
+ break;
+ }
+ case ocMul:
+ {
+ unsigned int nDataSize = 0;
+ SourceData *temp = formulaInterprt->srdDataPop();
+ SourceData *temp2 = formulaInterprt->srdDataPop();
+ nDataSize = temp2->getDataSize();
+ dpLeftData = temp2->getDouleData();
+ dpRightData = temp->getDouleData();
+ nDataSize = temp2->getDataSize();
+ double *rResult = NULL; // Point to the output data from GPU
+ rResult = (double *)malloc( sizeof(double) * nDataSize );
+ memset( rResult,0,rowSize );
+ ocl_calc.oclHostArithmeticStash64Bits( "oclSignedMul",dpLeftData,dpRightData,rResult,temp->getDataSize() );
+ formulaInterprt->srdDataPush( new SourceData( rResult,nDataSize ) );
+ break;
+ }
+ case ocDiv:
+ {
+ unsigned int nDataSize = 0;
+ SourceData *temp = formulaInterprt->srdDataPop();
+ SourceData *temp2 = formulaInterprt->srdDataPop();
+ nDataSize = temp2->getDataSize();
+ dpLeftData = temp2->getDouleData();
+ dpRightData = temp->getDouleData();
+ nDataSize = temp2->getDataSize();
+ double *rResult = NULL; // Point to the output data from GPU
+ rResult = ( double * )malloc( sizeof(double) * nDataSize );
+ memset( rResult,0,rowSize );
+ ocl_calc.oclHostArithmeticStash64Bits( "oclSignedDiv",dpLeftData,dpRightData,rResult,temp->getDataSize() );
+ formulaInterprt->srdDataPush( new SourceData( rResult,nDataSize ) );
+ break;
+ }
+ case ocMax:
+ {
+ unsigned int nDataSize = 0;
+ SourceData *temp = formulaInterprt->srdDataPop();
+ nDataSize = temp->getDataSize();
+ dpOclSrcData = temp->getDouleData();
+ double *rResult = NULL; // Point to the output data from GPU
+ rResult = (double *)malloc( sizeof(double) * rowSize );
+ memset( rResult,0,rowSize );
+ ocl_calc.oclHostFormulaStash64Bits( "oclFormulaMax",dpOclSrcData,npOclStartPos,npOclEndPos,rResult,nDataSize,rowSize );
+ formulaInterprt->srdDataPush( new SourceData( rResult,rowSize ) );
+ break;
+ }
+ case ocMin:
+ {
+ unsigned int nDataSize = 0;
+ SourceData *temp = formulaInterprt->srdDataPop();
+ nDataSize = temp->getDataSize();
+ dpOclSrcData = temp->getDouleData();
+ double *rResult = NULL; // Point to the output data from GPU
+ rResult = (double *)malloc( sizeof(double) * rowSize );
+ memset( rResult,0,rowSize );
+ ocl_calc.oclHostFormulaStash64Bits( "oclFormulaMin",dpOclSrcData,npOclStartPos,npOclEndPos,rResult,nDataSize,rowSize );
+ formulaInterprt->srdDataPush( new SourceData( rResult,rowSize ) );
+ break;
+ }
+ case ocAverage:
+ {
+ unsigned int nDataSize = 0;
+ SourceData *temp = formulaInterprt->srdDataPop();
+ nDataSize = temp->getDataSize();
+ dpOclSrcData = temp->getDouleData();
+ double *rResult = NULL; // Point to the output data from GPU
+ rResult = (double *)malloc( sizeof(double) * rowSize );
+ memset( rResult,0,rowSize );
+ ocl_calc.oclHostFormulaStash64Bits( "oclFormulaAverage",dpOclSrcData,npOclStartPos,npOclEndPos,rResult,nDataSize,rowSize );
+ formulaInterprt->srdDataPush( new SourceData( rResult,rowSize ) );
+ break;
+ }
+ default:
+ fprintf( stderr,"No OpenCL function for this calculation.\n" );
+ break;
+ }
+ }
+ else
+ {
+ switch( nOclOp )
+ {
+ case ocAdd:
+ {
+ unsigned int nDataSize = 0;
+ SourceData *temp = formulaInterprt->srdDataPop();
+ SourceData *temp2 = formulaInterprt->srdDataPop();
+ nDataSize = temp2->getDataSize();
+ dpLeftData = temp2->getDouleData();
+ dpRightData = temp->getDouleData();
+ nDataSize = temp2->getDataSize();
+ double *rResult = NULL; // Point to the output data from GPU
+ rResult = (double *)malloc( sizeof(double) * nDataSize );
+ memset(rResult,0,rowSize);
+ ocl_calc.oclHostArithmeticStash32Bits( "oclSignedAdd", dpLeftData, dpRightData, rResult, temp->getDataSize() );
+ formulaInterprt->srdDataPush( new SourceData(rResult, nDataSize) );
+ break;
+ }
+ case ocSub:
+ {
+ unsigned int nDataSize = 0;
+ SourceData *temp = formulaInterprt->srdDataPop();
+ SourceData *temp2 = formulaInterprt->srdDataPop();
+ nDataSize = temp2->getDataSize();
+ dpLeftData = temp2->getDouleData();
+ dpRightData = temp->getDouleData();
+ nDataSize = temp2->getDataSize();
+ double *rResult = NULL; // Point to the output data from GPU
+ rResult = (double *)malloc( sizeof(double) * nDataSize );
+ memset( rResult, 0, rowSize );
+ ocl_calc.oclHostArithmeticStash32Bits( "oclSignedSub", dpLeftData, dpRightData, rResult, temp->getDataSize() );
+ formulaInterprt->srdDataPush( new SourceData( rResult,nDataSize ) );
+ break;
+ }
+ case ocMul:
+ {
+ unsigned int nDataSize = 0;
+ SourceData *temp = formulaInterprt->srdDataPop();
+ SourceData *temp2 = formulaInterprt->srdDataPop();
+ nDataSize = temp2->getDataSize();
+ dpLeftData = temp2->getDouleData();
+ dpRightData = temp->getDouleData();
+ nDataSize = temp2->getDataSize();
+ double *rResult = NULL; // Point to the output data from GPU
+ rResult = (double *)malloc(sizeof(double) * nDataSize );
+ memset( rResult, 0, rowSize );
+ ocl_calc.oclHostArithmeticStash32Bits( "oclSignedMul", dpLeftData, dpRightData, rResult, temp->getDataSize() );
+ formulaInterprt->srdDataPush( new SourceData( rResult, nDataSize ) );
+ break;
+ }
+ case ocDiv:
+ {
+ unsigned int nDataSize = 0;
+ SourceData *temp = formulaInterprt->srdDataPop();
+ SourceData *temp2 = formulaInterprt->srdDataPop();
+ nDataSize = temp2->getDataSize();
+ dpLeftData = temp2->getDouleData();
+ dpRightData = temp->getDouleData();
+ nDataSize = temp2->getDataSize();
+ double *rResult = NULL; // Point to the output data from GPU
+ rResult = (double *)malloc( sizeof(double) * nDataSize );
+ memset( rResult, 0, rowSize );
+ ocl_calc.oclHostArithmeticStash32Bits( "oclSignedDiv", dpLeftData, dpRightData, rResult, temp->getDataSize() );
+ formulaInterprt->srdDataPush( new SourceData(rResult, nDataSize) );
+ break;
+ }
+ case ocMax:
+ {
+ unsigned int nDataSize = 0;
+ SourceData *temp = formulaInterprt->srdDataPop();
+ nDataSize = temp->getDataSize();
+ dpOclSrcData = temp->getDouleData();
+ double *rResult = NULL; // Point to the output data from GPU
+ rResult = (double *)malloc(sizeof(double) * nDataSize );
+ memset(rResult,0,rowSize);
+ ocl_calc.oclHostFormulaStash32Bits( "oclFormulaMax", dpOclSrcData, npOclStartPos, npOclEndPos, rResult,nDataSize, rowSize );
+ formulaInterprt->srdDataPush( new SourceData( rResult, rowSize ) );
+ break;
+ }
+ case ocMin:
+ {
+ unsigned int nDataSize = 0;
+ SourceData *temp = formulaInterprt->srdDataPop();
+ nDataSize = temp->getDataSize();
+ dpOclSrcData = temp->getDouleData();
+ double *rResult = NULL; // Point to the output data from GPU
+ rResult = (double *)malloc( sizeof(double) * nDataSize );
+ memset( rResult, 0, rowSize );
+ ocl_calc.oclHostFormulaStash32Bits( "oclFormulaMin", dpOclSrcData, npOclStartPos, npOclEndPos, rResult, nDataSize, rowSize );
+ formulaInterprt->srdDataPush( new SourceData( rResult, rowSize) );
+ break;
+ }
+ case ocAverage:
+ {
+ unsigned int nDataSize = 0;
+ SourceData *temp = formulaInterprt->srdDataPop();
+ nDataSize = temp->getDataSize();
+ dpOclSrcData = temp->getDouleData();
+ double *rResult = NULL; // Point to the output data from GPU
+ rResult = (double *)malloc( sizeof(double) * nDataSize );
+ memset( rResult, 0, rowSize);
+ ocl_calc.oclHostFormulaStash32Bits( "oclFormulaAverage", dpOclSrcData, npOclStartPos, npOclEndPos, rResult, nDataSize, rowSize );
+ formulaInterprt->srdDataPush( new SourceData( rResult, rowSize) );
+ break;
+ }
+ default:
+ fprintf(stderr,"No OpenCL function for this calculation.\n");
+ break;
+ }
+ }
+ return NULL;
+}
+
+ScMatrixRef FormulaGroupInterpreterOpenCL::inverseMatrix( const ScMatrix& rMat )
{
SCSIZE nC, nR;
- rMat.GetDimensions(nC, nR);
- if (nC != nR || nC == 0)
+ rMat.GetDimensions( nC, nR );
+ if ( nC != nR || nC == 0 )
// Input matrix must be square. Return an empty matrix on failure and
// the caller will calculate it via CPU.
return ScMatrixRef();
@@ -70,29 +676,30 @@ ScMatrixRef FormulaGroupInterpreterOpenCL::inverseMatrix(const ScMatrix& rMat)
// the last, chained together in a single array.
std::vector<double> aDoubles;
rMat.GetDoubleArray(aDoubles);
+
float * fpOclMatrixSrc = NULL;
float * fpOclMatrixDst = NULL;
double * dpOclMatrixSrc = NULL;
double * dpOclMatrixDst = NULL;
uint nMatrixSize = nC * nR;
static OclCalc aOclCalc;
- if ( aOclCalc.GetOpenclState() )
+ if ( aOclCalc.getOpenclState() )
{
if ( aOclCalc.gpuEnv.mnKhrFp64Flag == 1 || aOclCalc.gpuEnv.mnAmdFp64Flag == 1 )
{
- aOclCalc.CreateBuffer64Bits( dpOclMatrixSrc, dpOclMatrixDst, nMatrixSize );
+ aOclCalc.createBuffer64Bits( dpOclMatrixSrc, dpOclMatrixDst, nMatrixSize );
for ( uint i = 0; i < nC; i++ )
for ( uint j = 0; j < nR; j++ )
dpOclMatrixSrc[i*nC+j] = aDoubles[j*nR+i];
- aOclCalc.OclHostMatrixInverse64Bits( "oclFormulaMtxInv", dpOclMatrixSrc, dpOclMatrixDst,aDoubles, nR );
+ aOclCalc.oclHostMatrixInverse64Bits( "oclFormulaMtxInv", dpOclMatrixSrc, dpOclMatrixDst,aDoubles, nR );
}
else
{
- aOclCalc.CreateBuffer32Bits( fpOclMatrixSrc, fpOclMatrixDst, nMatrixSize );
+ aOclCalc.createBuffer32Bits( fpOclMatrixSrc, fpOclMatrixDst, nMatrixSize );
for ( uint i = 0; i < nC; i++ )
for ( uint j = 0; j < nR; j++ )
fpOclMatrixSrc[i*nC+j] = (float) aDoubles[j*nR+i];
- aOclCalc.OclHostMatrixInverse32Bits( "oclFormulaMtxInv", fpOclMatrixSrc, fpOclMatrixDst, aDoubles, nR );
+ aOclCalc.oclHostMatrixInverse32Bits( "oclFormulaMtxInv", fpOclMatrixSrc, fpOclMatrixDst, aDoubles, nR );
}
}
@@ -111,374 +718,242 @@ ScMatrixRef FormulaGroupInterpreterOpenCL::inverseMatrix(const ScMatrix& rMat)
#else
// Another way is to put the values one column at a time.
const double* p = &aDoubles[0];
- for (SCSIZE i = 0; i < nC; ++i)
+ for( SCSIZE i = 0; i < nC; ++i )
{
- xInv->PutDouble(p, nR, i, 0);
+ xInv->PutDouble( p, nR, i, 0 );
p += nR;
}
#endif
return xInv;
}
-
-bool FormulaGroupInterpreterOpenCL::interpret(ScDocument& rDoc, const ScAddress& rTopPos,
- const ScFormulaCellGroupRef& xGroup, ScTokenArray& rCode)
+bool FormulaGroupInterpreterOpenCL::interpret( ScDocument& rDoc, const ScAddress& rTopPos,
+ const ScFormulaCellGroupRef& xGroup, ScTokenArray& rCode )
{
- generateRPNCode(rDoc, rTopPos, rCode);
-
- size_t rowSize = xGroup->mnLength;
- fprintf(stderr,"rowSize at begin is ...%ld.\n",(long)rowSize);
+ generateRPNCode( rDoc, rTopPos, rCode );
+ mnRowSize = xGroup->mnLength;
+ fprintf( stderr,"mnRowSize at begin is ...%ld.\n",(long)mnRowSize );
// The row quantity can be gotten from p2->GetArrayLength()
- uint nCount1 = 0, nCount2 = 0, nCount3 = 0;
int nOclOp = 0;
- double *rResult = NULL; // Point to the output data from GPU
- rResult = (double *)malloc(sizeof(double) * rowSize*2);// For 2 columns(B,C)
- if(NULL==rResult)
- {
- printf("malloc err\n");
- return false;
- }
- memset(rResult,0,rowSize);
- float * fpOclSrcData = NULL; // Point to the input data from CPU
- double * dpOclSrcData = NULL;
- uint * npOclStartPos = NULL; // The first position for calculation,for example,the A1 in (=MAX(A1:A100))
- uint * npOclEndPos = NULL; // The last position for calculation,for example, the A100 in (=MAX(A1:A100))
- float * fpLeftData = NULL; // Left input for binary operator(+,-,*,/),for example,(=leftData+rightData)
- float * fpRightData = NULL; // Right input for binary operator(+,-,*,/),for example,(=leftData/rightData)
- // The rightData can't be zero for "/"
- double * dpLeftData = NULL;
- double * dpRightData = NULL;
-
- float * fpSaveData=NULL; //It is a temp pointer point the preparing memory;
- float * fpSumProMergeLfData = NULL; //It merge the more col to one col is the left operator
- float * fpSumProMergeRtData = NULL; //It merge the more col to one col is the right operator
- double * dpSaveData=NULL;
- double * dpSumProMergeLfData = NULL;
- double * dpSumProMergeRtData = NULL;
- uint * npSumSize=NULL; //It is a array to save the matix sizt(col *row)
- int nSumproductSize=0; //It is the merge array size
- bool aIsAlloc=false; //It is a flag to judge the fpSumProMergeLfData existed
- unsigned int nCountMatix=0; //It is a count to save the calculate times
+ const double * dpOclSrcData = NULL;
+ const double * dpBinaryData = NULL;
static OclCalc ocl_calc;
- bool isSumProduct=false;
- if(ocl_calc.GetOpenclState())
+ unsigned int nSrcDataSize = 0;
+
+ const double *dpResult = NULL;
+ double *pResult = (double *)malloc(sizeof(double) * mnRowSize);
+ double *dpSvDouble = NULL;
+ bool isSample = false;
+
+ mnSingleCount = 0;
+ mnDoubleCount = 0;
+ mnSvDoubleCount = 0;
+ mnOperatorCount = 0;
+ mnPositonLen = 0;
+ if ( ocl_calc.getOpenclState() )
{
- // Don't know how large the size will be applied previously, so create them as the rowSize or 65536
- // Don't know which formulae will be used previously, so create buffers for different formulae used probably
- if(ocl_calc.gpuEnv.mnKhrFp64Flag==1 || ocl_calc.gpuEnv.mnAmdFp64Flag==1)
+ getPosition(rCode,xGroup,mnRowSize,mnpOclStartPos,mnpOclEndPos,&mnPositonLen);
+ const formula::FormulaToken* p = rCode.FirstRPN();
+
+ bool isSingle = false;
+ int nCountNum=0;
+ do
{
- ocl_calc.CreateBuffer64Bits(dpOclSrcData,npOclStartPos,npOclEndPos,rowSize);
- ocl_calc.CreateBuffer64Bits(dpLeftData,dpRightData,rowSize);
- }
- else
+ if ( ocPush != p->GetOpCode())
+ {
+ nOclOp = p->GetOpCode();
+ mnOperatorGroup[mnOperatorCount++] = nOclOp;
+ }
+ else if( ocPush == p->GetOpCode() && formula::svSingleVectorRef == p->GetType() )
+ {
+ mnSingleCount++;
+ }
+ if ( ocPush == p->GetOpCode() && formula::svDouble == p->GetType() )
+ {
+ mnSvDoubleCount++;
+ }
+ } while ( NULL != ( p = rCode.NextRPN() ) );
+ if( isGroundWater() )
{
- ocl_calc.CreateBuffer32Bits(fpOclSrcData,npOclStartPos,npOclEndPos,rowSize);
- ocl_calc.CreateBuffer32Bits(fpLeftData,fpRightData,rowSize);
+ isSample = true;
}
- //printf("pptrr is %d,%d,%d\n",fpOclSrcData,npOclStartPos,npOclEndPos);
- }
-///////////////////////////////////////////////////////////////////////////////////////////
-
- // Until we implement group calculation for real, decompose the group into
- // individual formula token arrays for individual calculation.
- ScAddress aTmpPos = rTopPos;
- for (sal_Int32 i = 0; i < xGroup->mnLength; ++i)
- {
- aTmpPos.SetRow(xGroup->mnStart + i);
- ScTokenArray aCode2;
- for (const formula::FormulaToken* p = rCode.First(); p; p = rCode.Next())
+ mnOperatorCount = 0;
+ mnSingleCount = 0;
+ mnSvDoubleCount = 0;
+ p = rCode.FirstRPN();
+ if(isSample)
{
- switch (p->GetType())
+ do
{
- case formula::svSingleVectorRef:
+ if ( ocPush == p->GetOpCode() && formula::svDouble == p->GetType() )
{
- const formula::SingleVectorRefToken* p2 = static_cast<const formula::SingleVectorRefToken*>(p);
- const double* pArray = p2->GetArray();
- aCode2.AddDouble(static_cast<size_t>(i) < p2->GetArrayLength() ? pArray[i] : 0.0);
+ mdpSvdouble[mnSvDoubleCount++] = p->GetDouble();
}
- break;
- case formula::svDoubleVectorRef:
+ else if( ocPush == p->GetOpCode() && formula::svDoubleVectorRef == p->GetType())
{
- const formula::DoubleVectorRefToken* p2 = static_cast<const formula::DoubleVectorRefToken*>(p);
- const std::vector<const double*>& rArrays = p2->GetArrays();
- size_t nColSize = rArrays.size();
- size_t nRowStart = p2->IsStartFixed() ? 0 : i;
- size_t nRowEnd = p2->GetRefRowSize() - 1;
- if (!p2->IsEndFixed())
- nRowEnd += i;
- size_t nRowSize = nRowEnd - nRowStart + 1;
- //store the a matix`s rowsize and colsize,use it to calculate the matix`s size
- ocl_calc.nFormulaRowSize = nRowSize;
- ocl_calc.nFormulaColSize = nColSize;
- ScMatrixRef pMat(new ScMatrix(nColSize, nRowSize, 0.0));
- if(ocl_calc.GetOpenclState())
+ const formula::DoubleVectorRefToken* pDvr = static_cast< const formula::DoubleVectorRefToken* >( p );
+ const std::vector< const double* >& rArrays = pDvr->GetArrays();
+ uint rArraysSize = rArrays.size();
+ int nMoreColSize = 0;
+ DoubleVectorFormula *SvDoubleTemp = new DoubleVectorFormula();
+ if( rArraysSize > 1 )
{
- npOclStartPos[i] = nRowStart; // record the start position
- npOclEndPos[i] = nRowEnd; // record the end position
- }
- int nTempOpcode;
- const formula::FormulaToken* pTemp = p;
- pTemp=aCode2.Next();
- nTempOpcode=pTemp->GetOpCode();
- while(1)
- {
- nTempOpcode=pTemp->GetOpCode();
- if(nTempOpcode!=ocOpen && nTempOpcode!=ocPush)
- break;
- pTemp=aCode2.Next();
- }
- if((!aIsAlloc) && (ocl_calc.GetOpenclState())&& (nTempOpcode == ocSumProduct))
- {
- //nColSize * rowSize is the data size , but except the the head of data will use less the nRowSize
- //the other all use nRowSize times . and it must aligen so add nRowSize-1.
- nSumproductSize = nRowSize+nColSize * rowSize*nRowSize-1;
- if(ocl_calc.gpuEnv.mnKhrFp64Flag==1 || ocl_calc.gpuEnv.mnAmdFp64Flag==1)
- ocl_calc.CreateBuffer64Bits(dpSumProMergeLfData,dpSumProMergeRtData,npSumSize,nSumproductSize,rowSize);
- else
- ocl_calc.CreateBuffer32Bits(fpSumProMergeLfData,fpSumProMergeRtData,npSumSize,nSumproductSize,rowSize);
- aIsAlloc = true;
- isSumProduct=true;
- }
- if(isSumProduct)
- {
- if(ocl_calc.gpuEnv.mnKhrFp64Flag==1 || ocl_calc.gpuEnv.mnAmdFp64Flag==1)
- {
- if(nCountMatix%2==0)
- dpSaveData = dpSumProMergeLfData;
- else
- dpSaveData = dpSumProMergeRtData;
- }
- else
+ double *dpMoreColData = NULL;
+ for ( uint loop=0; loop < rArraysSize; loop++ )
{
- if(nCountMatix%2==0)
- fpSaveData = fpSumProMergeLfData;
- else
- fpSaveData = fpSumProMergeRtData;
- }
- }
- for (size_t nCol = 0; nCol < nColSize; ++nCol)
- {
- const double* pArray = rArrays[nCol];
- if( NULL==pArray )
- {
- fprintf(stderr,"Error: pArray is NULL!\n");
- free(rResult);
- return false;
- }
- if(ocl_calc.GetOpenclState())
- {
- for( size_t u=nRowStart; u<=nRowEnd; u++ )
- {
- if(ocl_calc.gpuEnv.mnKhrFp64Flag==1 || ocl_calc.gpuEnv.mnAmdFp64Flag==1)
- {
- dpOclSrcData[u] = pArray[u];
- //fprintf(stderr,"dpOclSrcData[%d] is %f.\n",u,dpOclSrcData[u]);
- if(isSumProduct)
- dpSaveData[u+nRowSize*nCol + nRowStart* nColSize * nRowSize-nRowStart] = pArray[u];
- }
- else
- {
- // Many video cards can't support double type in kernel, so need transfer the double to float
- fpOclSrcData[u] = (float)pArray[u];
- //fprintf(stderr,"fpOclSrcData[%d] is %f.\n",u,fpOclSrcData[u]);
- if(isSumProduct)
- fpSaveData[u+nRowSize*nCol + nRowStart* nColSize * nRowSize-nRowStart] = (float)pArray[u];
- }
- }
- }
-
- for (size_t nRow = 0; nRow < nRowSize; ++nRow)
- {
- if (nRowStart + nRow < p2->GetArrayLength())
+ dpOclSrcData = rArrays[loop];
+ nSrcDataSize = pDvr->GetArrayLength();
+ nMoreColSize += nSrcDataSize;
+ dpMoreColData = (double *) realloc(dpMoreColData,nMoreColSize * sizeof(double));
+ for ( uint j = nMoreColSize - nSrcDataSize, i = 0; i < nSrcDataSize; i++, j++ )
{
- double fVal = pArray[nRowStart+nRow];
- pMat->PutDouble(fVal, nCol, nRow);
+ dpMoreColData[j] = dpOclSrcData[i];
}
}
+ dpOclSrcData = dpMoreColData;
+ nSrcDataSize = nMoreColSize;
}
-
- ScMatrixToken aTok(pMat);
- aCode2.AddToken(aTok);
- if(isSumProduct)
+ else
{
- npSumSize[nCountMatix/2] =nRowSize*nColSize;
- nCountMatix++;
+ dpOclSrcData = rArrays[0];
+ nSrcDataSize = pDvr->GetArrayLength();
+ SvDoubleTemp->mdpInputData = dpOclSrcData;
+ SvDoubleTemp->mnInputDataSize = nSrcDataSize;
+ SvDoubleTemp->mnInputStartPosition = mnpOclStartPos[nCountNum*mnRowSize];
+ SvDoubleTemp->mnInputEndPosition = mnpOclEndPos[nCountNum*mnRowSize];
+ SvDoubleTemp->mnInputStartOffset = mnpOclStartPos[nCountNum*mnRowSize+1]-mnpOclStartPos[nCountNum*mnRowSize];
+ SvDoubleTemp->mnInputEndOffset = mnpOclEndPos[nCountNum*mnRowSize+1]-mnpOclEndPos[nCountNum*mnRowSize];
+ mDoubleArray[mnDoubleCount++] = SvDoubleTemp;
+ nCountNum++;
}
}
- break;
- default:
- aCode2.AddToken(*p);
- }
- }
-
- ScFormulaCell* pDest = rDoc.GetFormulaCell(aTmpPos);
- if (!pDest)
- {
- free(rResult);
- return false;
+ else if( ocPush == p->GetOpCode() && formula::svSingleVectorRef == p->GetType() )
+ {
+ const formula::SingleVectorRefToken* pSvr = static_cast<const formula::SingleVectorRefToken*>( p );
+ dpBinaryData = pSvr->GetArray();
+ uint nArrayLen = pSvr->GetArrayLength();
+ SingleVectorFormula *SignleTemp = new SingleVectorFormula() ;
+ if(isSingle)
+ {
+ SignleTemp = mSingleArray[--mnSingleCount];
+ SignleTemp->mdpInputRightData = dpBinaryData;
+ SignleTemp->mnInputRightDataSize = nArrayLen;
+ SignleTemp->mnInputRightStartPosition = 0;
+ SignleTemp->mnInputRightOffset = 0;
+ isSingle = false;
+ }
+ else
+ {
+ SignleTemp = new SingleVectorFormula();
+ SignleTemp->mdpInputLeftData = dpBinaryData;
+ SignleTemp->mnInputLeftDataSize = nArrayLen;
+ SignleTemp->mdpInputRightData = NULL;
+ SignleTemp->mnInputRightDataSize = 0;
+ SignleTemp->mnInputLeftStartPosition = 0;
+ SignleTemp->mnInputLeftOffset = 0;
+ isSingle = true;
+ }
+ mSingleArray[mnSingleCount++] = SignleTemp;
+ }
+ else
+ {
+ nOclOp = p->GetOpCode();
+ mnOperatorGroup[mnOperatorCount++] = nOclOp;
+ }
+ } while ( NULL != ( p = rCode.NextRPN() ) );
+ if ( !chooseFunction( ocl_calc, pResult ) )
+ return false;
+ else
+ dpResult = pResult;
}
- if(ocl_calc.GetOpenclState())
+ else
{
- const formula::FormulaToken *pCur = aCode2.First();
- aCode2.Reset();
- while( ( pCur = aCode2.Next() ) != NULL )
+ agency aChooseAction;
+
+ do
{
- OpCode eOp = pCur->GetOpCode();
- if(eOp==0)
+ if ( ocPush == p->GetOpCode() && formula::svDouble == p->GetType() )
+ {
+ dpSvDouble = (double *) malloc( sizeof(double ) * mnRowSize );
+ double dTempValue = p->GetDouble();
+ for ( uint i = 0; i < mnRowSize; i++ )
+ dpSvDouble[i] = dTempValue;
+ srdDataPush( new SourceData( dpSvDouble, mnRowSize ) );
+ collectDoublePointers( dpSvDouble );
+ }
+ else if( ocPush == p->GetOpCode() && formula::svDoubleVectorRef == p->GetType())
{
- if(ocl_calc.gpuEnv.mnKhrFp64Flag==1 || ocl_calc.gpuEnv.mnAmdFp64Flag==1)
+ const formula::DoubleVectorRefToken* pDvr = static_cast< const formula::DoubleVectorRefToken* >( p );
+ const std::vector< const double* >& rArrays = pDvr->GetArrays();
+ unsigned int rArraysSize = rArrays.size();
+ int nMoreColSize = 0;
+ if(rArraysSize > 1)
{
- if(nCount3%2==0)
- dpLeftData[nCount1++] = pCur->GetDouble();
- else
- dpRightData[nCount2++] = pCur->GetDouble();
- nCount3++;
+ double *dpMoreColData = NULL;
+ for( uint loop=0; loop < rArraysSize; loop++ )
+ {
+ dpOclSrcData = rArrays[loop];
+ nSrcDataSize = pDvr->GetArrayLength();
+ nMoreColSize += nSrcDataSize;
+ dpMoreColData = (double *) realloc(dpMoreColData,nMoreColSize * sizeof(double));
+ for(uint j=nMoreColSize-nSrcDataSize,i=0;i<nSrcDataSize;i++,j++)
+ {
+ dpMoreColData[j] = dpOclSrcData[i];
+ }
+ }
+ dpOclSrcData = dpMoreColData;
+ nSrcDataSize = nMoreColSize;
+ collectDoublePointers( dpMoreColData );
}
else
{
- if(nCount3%2==0)
- fpLeftData[nCount1++] = (float)pCur->GetDouble();
- else
- fpRightData[nCount2++] = (float)pCur->GetDouble();
- nCount3++;
+ dpOclSrcData = rArrays[0];
+ nSrcDataSize = pDvr->GetArrayLength();
}
+ srdDataPush( new SourceData( dpOclSrcData,nSrcDataSize,rArraysSize ) );
}
- else if( eOp!=ocOpen && eOp!=ocClose &&eOp != ocSep)
- nOclOp = eOp;
-
-// if(count1>0){//dbg
-// fprintf(stderr,"leftData is %f.\n",fpLeftData[count1-1]);
-// count1--;
-// }
-// if(count2>0){//dbg
-// fprintf(stderr,"rightData is %f.\n",fpRightData[count2-1]);
-// count2--;
-// }
- }
- }
-
- if(!getenv("SC_GPU")||!ocl_calc.GetOpenclState())
- {
- //fprintf(stderr,"ccCPU flow...\n\n");
- generateRPNCode(rDoc, aTmpPos, aCode2);
- ScInterpreter aInterpreter(pDest, &rDoc, aTmpPos, aCode2);
- aInterpreter.Interpret();
- pDest->SetResultToken(aInterpreter.GetResultToken().get());
- pDest->ResetDirty();
- pDest->SetChanged(true);
+ else if( ocPush == p->GetOpCode() && formula::svSingleVectorRef == p->GetType() )
+ {
+ const formula::SingleVectorRefToken* pSvr = static_cast<const formula::SingleVectorRefToken*>( p );
+ dpBinaryData = pSvr->GetArray();
+ nSrcDataSize = pSvr->GetArrayLength();
+ srdDataPush( new SourceData( dpBinaryData, nSrcDataSize ) );
+ }
+ else
+ {
+ nOclOp = p->GetOpCode();
+ aChooseAction.calculate(nOclOp,mnRowSize,ocl_calc,mnpOclStartPos,mnpOclEndPos,this);
+ mnSingleCount = 0;
+ mnDoubleCount = 0;
+ mnSvDoubleCount = 0;
+ mnOperatorCount = 0;
+ mnPositonLen = 0;
+ }
+ } while ( NULL != ( p = rCode.NextRPN() ) );
+ SourceData * sResult = srdDataPop();
+ dpResult = sResult->getDouleData();
}
- } // for loop end (xGroup->mnLength)
-
- // For GPU calculation
- if(getenv("SC_GPU")&&ocl_calc.GetOpenclState())
- {
- fprintf(stderr,"ggGPU flow...\n\n");
- printf(" oclOp is... %d\n",nOclOp);
- osl_getSystemTime(&aTimeBefore); //timer
- if(ocl_calc.gpuEnv.mnKhrFp64Flag==1 || ocl_calc.gpuEnv.mnAmdFp64Flag==1)
+ rDoc.SetFormulaResults( rTopPos, dpResult, mnRowSize );
+ freeDoublePointers();
+ if ( pResult )
{
- fprintf(stderr,"ggGPU double precision flow...\n\n");
- //double precision
- switch(nOclOp)
- {
- case ocAdd:
- ocl_calc.OclHostArithmeticOperator64Bits("oclSignedAdd",dpLeftData,dpRightData,rResult,nCount1);
- break;
- case ocSub:
- ocl_calc.OclHostArithmeticOperator64Bits("oclSignedSub",dpLeftData,dpRightData,rResult,nCount1);
- break;
- case ocMul:
- ocl_calc.OclHostArithmeticOperator64Bits("oclSignedMul",dpLeftData,dpRightData,rResult,nCount1);
- break;
- case ocDiv:
- ocl_calc.OclHostArithmeticOperator64Bits("oclSignedDiv",dpLeftData,dpRightData,rResult,nCount1);
- break;
- case ocMax:
- ocl_calc.OclHostFormulaStatistics64Bits("oclFormulaMax",dpOclSrcData,npOclStartPos,npOclEndPos,rResult,rowSize);
- break;
- case ocMin:
- ocl_calc.OclHostFormulaStatistics64Bits("oclFormulaMin",dpOclSrcData,npOclStartPos,npOclEndPos,rResult,rowSize);
- break;
- case ocAverage:
- ocl_calc.OclHostFormulaStatistics64Bits("oclFormulaAverage",dpOclSrcData,npOclStartPos,npOclEndPos,rResult,rowSize);
- break;
- case ocSum:
- ocl_calc.OclHostFormulaStatistics64Bits("oclFormulaSum",dpOclSrcData,npOclStartPos,npOclEndPos,rResult,rowSize);
- break;
- case ocCount:
- ocl_calc.OclHostFormulaCount64Bits(npOclStartPos,npOclEndPos,rResult,rowSize);
- break;
- case ocSumProduct:
- ocl_calc.OclHostFormulaSumProduct64Bits(dpSumProMergeLfData,dpSumProMergeRtData,npSumSize,rResult,rowSize);
- break;
- default:
- fprintf(stderr,"No OpenCL function for this calculation.\n");
- break;
- }
+ free( pResult );
+ pResult = NULL;
}
- else
+ if ( mnpOclStartPos )
{
- fprintf(stderr,"ggGPU float precision flow...\n\n");
- //float precision
- switch(nOclOp)
- {
- case ocAdd:
- ocl_calc.OclHostArithmeticOperator32Bits("oclSignedAdd",fpLeftData,fpRightData,rResult,nCount1);
- break;
- case ocSub:
- ocl_calc.OclHostArithmeticOperator32Bits("oclSignedSub",fpLeftData,fpRightData,rResult,nCount1);
- break;
- case ocMul:
- ocl_calc.OclHostArithmeticOperator32Bits("oclSignedMul",fpLeftData,fpRightData,rResult,nCount1);
- break;
- case ocDiv:
- ocl_calc.OclHostArithmeticOperator32Bits("oclSignedDiv",fpLeftData,fpRightData,rResult,nCount1);
- break;
- case ocMax:
- ocl_calc.OclHostFormulaStatistics32Bits("oclFormulaMax",fpOclSrcData,npOclStartPos,npOclEndPos,rResult,rowSize);
- break;
- case ocMin:
- ocl_calc.OclHostFormulaStatistics32Bits("oclFormulaMin",fpOclSrcData,npOclStartPos,npOclEndPos,rResult,rowSize);
- break;
- case ocAverage:
- ocl_calc.OclHostFormulaStatistics32Bits("oclFormulaAverage",fpOclSrcData,npOclStartPos,npOclEndPos,rResult,rowSize);
- break;
- case ocSum:
- ocl_calc.OclHostFormulaStatistics32Bits("oclFormulaSum",fpOclSrcData,npOclStartPos,npOclEndPos,rResult,rowSize);
- break;
- case ocCount:
- ocl_calc.OclHostFormulaCount32Bits(npOclStartPos,npOclEndPos,rResult,rowSize);
- break;
- case ocSumProduct:
- ocl_calc.OclHostFormulaSumProduct32Bits(fpSumProMergeLfData,fpSumProMergeRtData,npSumSize,rResult,rowSize);
- break;
- default:
- fprintf(stderr,"No OpenCL function for this calculation.\n");
- break;
- }
+ free( mnpOclStartPos );
+ mnpOclStartPos = NULL;
}
-
- /////////////////////////////////////////////////////
- osl_getSystemTime(&aTimeAfter);
- double diff = getTimeDiff(aTimeAfter, aTimeBefore);
- //if (diff >= 1.0)
+ if ( mnpOclEndPos )
{
- fprintf(stderr,"OpenCL,diff...%f.\n",diff);
+ free( mnpOclEndPos );
+ mnpOclEndPos = NULL;
}
-/////////////////////////////////////////////////////
-
-//rResult[i];
-// for(sal_Int32 i = 0; i < rowSize; ++i){//dbg output results
-// fprintf(stderr,"After GPU,rRsults[%d] is ...%f\n",i,rResult[i]);
-// }
-
- // Insert the double data, in rResult[i] back into the document
- rDoc.SetFormulaResults(rTopPos, rResult, xGroup->mnLength);
- }
-
- free(rResult);
-
- return true;
+ return true;
+ } // getOpenclState() End
+ else
+ return false;
}
/// Special case of formula compiler for groundwatering
@@ -489,11 +964,11 @@ public:
FormulaGroupInterpreterSoftware()
{
fprintf(stderr,"\n\n ***** Groundwater Backend *****\n\n\n");
- OclCalc::InitEnv();
+ OclCalc::initEnv();
}
virtual ~FormulaGroupInterpreterGroundwater()
{
- OclCalc::ReleaseOpenclRunEnv();
+ OclCalc::releaseOpenclRunEnv();
}
virtual ScMatrixRef inverseMatrix(const ScMatrix& /* rMat */) { return ScMatrixRef(); }
@@ -569,7 +1044,7 @@ bool FormulaGroupInterpreterGroundwater::interpretCL(ScDocument& rDoc, const ScA
fprintf (stderr, "Calculate !");
- double *pResult = ocl_calc.OclSimpleDeltaOperation( eOp, pGroundWaterDataArray,
+ double *pResult = ocl_calc.oclSimpleDeltaOperation( eOp, pGroundWaterDataArray,
pArrayToSubtractOneElementFrom,
(size_t) xGroup->mnLength, delta );
RETURN_IF_FAIL(pResult != NULL, "buffer alloc / calculaton failed");
diff --git a/sc/source/core/opencl/oclkernels.hxx b/sc/source/core/opencl/oclkernels.hxx
index bcd7db0..53917b3 100644
--- a/sc/source/core/opencl/oclkernels.hxx
+++ b/sc/source/core/opencl/oclkernels.hxx
@@ -7,8 +7,8 @@
* file, You can obtain one at http://mozilla.org/MPL/2.0/.
*/
-#ifndef _OCL_KERNEL_H_
-#define _OCL_KERNEL_H_
+#ifndef SC_OCLKERNELS_HXX
+#define SC_OCLKERNELS_HXX
#ifndef USE_EXTERNAL_KERNEL
#define KERNEL( ... )# __VA_ARGS__
@@ -24,6 +24,97 @@ const char *kernel_src = KERNEL(
\n#pragma OPENCL EXTENSION cl_amd_fp64 : enable\n
\n#else\n
\n#endif\n
+inline fp_t oclAverage( const uint id,__global fp_t *values,__global uint *startArray,__global uint *endArray)
+{
+ uint start = startArray[id];
+ uint end = endArray[id];
+ fp_t fSum = 0.0;
+ fp_t zero[16] = {0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f,0.0f};
+ fp_t16 vSum=vload16(0,zero);
+ fp_t16 ptr;
+ __global fp_t *p = values;
+ p+= start;
+
+ for(int i = 0; i < (end - start + 1)/16; ++i)
+ {
+ ptr=vload16(0,p);
+ vSum += ptr;
+ p+=16;
+ }
+ int lastData = (end-start+1)%16;
+ for(int i = 0; i <lastData; i++)
+ {
+ fSum += *p;
+ p+=1;
+ }
+ vSum.s01234567 = vSum.s01234567+vSum.s89abcdef;
+ vSum.s0123 = vSum.s0123+vSum.s4567;
+ vSum.s01 = vSum.s01+vSum.s23;
+ vSum.s0 = vSum.s0+vSum.s1;
+ fSum = vSum.s0+fSum;
+ fp_t fVal = fSum/(end-start+1);
+ return fVal;
+}
+inline fp_t oclMax( const uint id,__global fp_t *values,__global uint *startArray,__global uint *endArray)
+{
+ uint start = startArray[id];
+ uint end = endArray[id];
+ fp_t fMax = values[start];
+ fp_t zero[16] = {fMax,fMax,fMax,fMax,fMax,fMax,fMax,fMax,fMax,fMax,fMax,fMax,fMax,fMax,fMax,fMax};
+ fp_t16 vMax=vload16(0,zero);
+ //Max
+ fp_t16 ptr;
+ __global fp_t *p = values;
+ p+= start;
+ for(int i = 0; i < (end - start + 1)/16; ++i)
+ {
+ ptr=vload16(0,p);
+ vMax = fmax(vMax,ptr);
+ p+=16;
+ }
+ int lastData = (end-start+1)%16;
+ for(int i = 0; i <lastData; i++)
+ {
+ fMax = fmax(fMax,*p);
+ p+=1;
+ }
+ vMax.s01234567 = fmax(vMax.s01234567, vMax.s89abcdef);
+ vMax.s0123 = fmax(vMax.s0123, vMax.s4567);
+ vMax.s01 = fmax(vMax.s01, vMax.s23);
+ vMax.s0 = fmax(vMax.s0, vMax.s1);
+ fMax = fmax(vMax.s0, fMax);
+ return fMax;
+}
+inline fp_t oclMin( const uint id,__global fp_t *values,__global uint *startArray,__global uint *endArray)
+{
+ uint start = startArray[id];
+ uint end = endArray[id];
+ fp_t fMin = values[start];
+ fp_t zero[16] = {fMin,fMin,fMin,fMin,fMin,fMin,fMin,fMin,fMin,fMin,fMin,fMin,fMin,fMin,fMin,fMin};
+ fp_t16 vMin=vload16(0,zero);
+ //Min
+ fp_t16 ptr;
+ __global fp_t *p = values;
+ p+= start;
+ for(int i = 0; i < (end - start + 1)/16; ++i)
+ {
+ ptr=vload16(0,p);
+ vMin = fmin(vMin,ptr);
+ p+=16;
+ }
+ int lastData = (end-start+1)%16;
+ for(int i = 0; i <lastData; i++)
+ {
+ fMin = fmin(fMin,*p);
+ p+=1;
+ }
+ vMin.s01234567 = fmin(vMin.s01234567, vMin.s89abcdef);
+ vMin.s0123 = fmin(vMin.s0123, vMin.s4567);
+ vMin.s01 = fmin(vMin.s01, vMin.s23);
+ vMin.s0 = fmin(vMin.s0, vMin.s1);
+ fMin = fmin(vMin.s0, fMin);
+ return fMin;
+}
__kernel void oclSignedAdd(__global fp_t *ltData,__global fp_t *rtData,__global fp_t *otData)
{
@@ -31,7 +122,6 @@ __kernel void oclSignedAdd(__global fp_t *ltData,__global fp_t *rtData,__global
otData[id] = ltData[id] + rtData[id];
}
-
__kernel void oclSignedSub(__global fp_t *ltData,__global fp_t *rtData,__global fp_t *otData)
{
const unsigned int id = get_global_id(0);
@@ -41,39 +131,31 @@ __kernel void oclSignedSub(__global fp_t *ltData,__global fp_t *rtData,__global
__kernel void oclSignedMul(__global fp_t *ltData,__global fp_t *rtData,__global fp_t *otData)
{
int id = get_global_id(0);
- otData[id] =ltData[id] * rtData[id];
+ otData[id] = ltData[id] * rtData[id];
}
__kernel void oclSignedDiv(__global fp_t *ltData,__global fp_t *rtData,__global fp_t *otData)
{
const unsigned int id = get_global_id(0);
- otData[id] = ltData[id] / rtData[id];
+ fp_t divisor = rtData[id];
+ if ( divisor != 0 )
+ otData[id] = ltData[id] / divisor;
+ else
+ otData[id] = 0.0;
}
__kernel void oclFormulaMin(__global fp_t *input,__global uint *start,__global uint *end,__global fp_t *output)
{
const unsigned int id = get_global_id(0);
- unsigned int startFlag = start[id];
- unsigned int endFlag = end[id];
- fp_t fMinVal = input[startFlag];
- for(int i=startFlag;i<=endFlag;i++)
- {
- fMinVal = fmin( fMinVal, input[i] );
- }
- output[id] = fMinVal;
+ fp_t fVal = oclMin(id,input,start,end);
+ output[id] = fVal ;
}
__kernel void oclFormulaMax(__global fp_t *input,__global uint *start,__global uint *end,__global fp_t *output)
{
const unsigned int id = get_global_id(0);
- unsigned int startFlag = start[id];
- unsigned int endFlag = end[id];
- fp_t fMaxVal = input[startFlag];
- for ( int i = startFlag; i <= endFlag; i++ )
- {
- fMaxVal = fmax( fMaxVal, input[i] );
- }
- output[id] = fMaxVal;
+ fp_t fVal = oclMax(id,input,start,end);
+ output[id] = fVal ;
}
//Sum
__kernel void oclFormulaSum(__global fp_t *input,__global uint *start,__global uint *end,__global fp_t *output)
@@ -94,12 +176,10 @@ __kernel void oclFormulaCount(__global uint *start,__global uint *end,__global f
__kernel void oclFormulaAverage(__global fp_t *input,__global uint *start,__global uint *end,__global fp_t *output)
{
const unsigned int id = get_global_id(0);
- fp_t sum=0.0;
- for(int i = start[id];i<=end[id];i++)
- sum += input[i];
- output[id] = sum / (end[id]-start[id]+1);
-}
+ fp_t fVal = oclAverage(id,input,start,end);
+ output[id] = fVal ;
+}
//Sumproduct
__kernel void oclFormulaSumproduct(__global fp_t *firstCol,__global uint* npSumSize,__global fp_t *output,uint nMatixSize)
{
@@ -147,7 +227,7 @@ __kernel void oclMinDelta(__global fp_t *values, __global fp_t *subtract, uint s
// Min
fp_t fMinVal = values[start];
- for(int i=start+1;i < end;i++)
+ for ( int i = start + 1; i < end; i++ )
{
if(values[i]<fMinVal)
fMinVal = values[i];
@@ -177,14 +257,14 @@ __kernel void oclFormulaMtxInv(__global fp_t * fpMatrixInput, __global fp_t * fp
fpP[nOffset*nDimension+nId] = fpP[nMax*nDimension+nId];
fpP[nMax*nDimension+nId] = dMovebuffer;
}
-__kernel void oclMatrixSolve(__global fp_t * fpMatrixInput,__global fp_t * fpMatrixOutput,__global fp_t * fpP,__global fp_t * fpY)
+__kernel void oclMatrixSolve(__global fp_t * fpMatrixInput,__global fp_t * fpMatrixOutput,__global fp_t * fpP,__global fp_t * fpY,__global uint* npDim)
{
int nId = get_global_id(0);
- int nDimension = get_global_size(0);
-
+ int nDimension = npDim[nId];
+ fp_t fsum = 0.0;
for ( int yi=0; yi < nDimension; yi++ )
{
- fp_t fsum = 0.0;
+ fsum = 0.0;
for ( int yj=0; yj < nDimension; yj++ )
{
fsum += fpMatrixInput[yi*nDimension+yj] * fpY[nId+yj*nDimension];
@@ -194,7 +274,7 @@ __kernel void oclMatrixSolve(__global fp_t * fpMatrixInput,__global fp_t * fpMat
}
for ( int xi = nDimension - 1; xi >= 0; xi-- )
{
- fp_t fsum = 0.0;
+ fsum = 0.0;
for ( int xj = 0; xj < nDimension; xj++ )
{
fsum += fpMatrixInput[xi*nDimension+xj] * fpMatrixOutput[nId+nDimension*xj];
@@ -203,6 +283,101 @@ __kernel void oclMatrixSolve(__global fp_t * fpMatrixInput,__global fp_t * fpMat
}
}
+__kernel void oclAverageAdd(__global fp_t *values,__global fp_t *addend, __global uint *startArray, __global uint *endArray, __global fp_t *output)
+{
+ const unsigned int id = get_global_id(0);
+ fp_t fVal = oclAverage(id,values,startArray,endArray);
+ output[id] = fVal + addend[id];
+}
+
+__kernel void oclAverageSub(__global fp_t *values,__global fp_t *subtract, __global uint *startArray, __global uint *endArray, __global fp_t *output)
+{
+ const unsigned int id = get_global_id(0);
+ fp_t fVal = oclAverage(id,values,startArray,endArray);
+ output[id] = fVal - subtract[id];
+}
+
+__kernel void oclAverageMul(__global fp_t *values,__global fp_t *multiplier, __global uint *startArray, __global uint *endArray, __global fp_t *output)
+{
+ const unsigned int id = get_global_id(0);
+ fp_t fVal = oclAverage(id,values,startArray,endArray);
+ output[id] = fVal * multiplier[id];
+}
+__kernel void oclAverageDiv(__global fp_t *values,__global fp_t *div, __global uint *startArray, __global uint *endArray, __global fp_t *output)
+{
+ const unsigned int id = get_global_id(0);
+ fp_t fVal = oclAverage(id,values,startArray,endArray);
+ fp_t divisor = div[id];
+ if ( divisor != 0 )
+ output[id] = fVal / divisor;
+ else
+ output[id] = 0.0;
+}
+
+__kernel void oclMinAdd(__global fp_t *values, __global fp_t *addend, __global uint *startArray, __global uint *endArray, __global fp_t *output)
+{
+ const unsigned int id = get_global_id(0);
+ fp_t fMin = oclMin(id,values,startArray,endArray);
+ output[id] = fMin + addend[id];
+}
+
+__kernel void oclMinSub(__global fp_t *values, __global fp_t *subtract, __global uint *startArray, __global uint *endArray, __global fp_t *output)
+{
+ const unsigned int id = get_global_id(0);
+ fp_t fMin = oclMin(id,values,startArray,endArray);
+ output[id] = fMin - subtract[id];
+}
+__kernel void oclMinMul(__global fp_t *values, __global fp_t *multiplier, __global uint *startArray, __global uint *endArray, __global fp_t *output)
+{
+ const unsigned int id = get_global_id(0);
+ fp_t fMin = oclMin(id,values,startArray,endArray);
+ output[id] = fMin * multiplier[id];
+}
+__kernel void oclMinDiv(__global fp_t *values, __global fp_t *div, __global uint *startArray, __global uint *endArray, __global fp_t *output)
+{
+ const unsigned int id = get_global_id(0);
+ fp_t fMin = oclMin(id,values,startArray,endArray);
+ fp_t divisor = div[id];
+ if ( divisor != 0 )
+ output[id] = fMin / divisor;
+ else
+ output[id] = 0.0;
+}
+__kernel void oclMaxAdd(__global fp_t *values, __global fp_t *addend, __global uint *startArray, __global uint *endArray, __global fp_t *output)
+{
+ const unsigned int id = get_global_id(0);
+ fp_t fMax = oclMax(id,values,startArray,endArray);
+ output[id] = fMax + addend[id];
+}
+
+__kernel void oclMaxSub(__global fp_t *values, __global fp_t *subtract, __global uint *startArray, __global uint *endArray, __global fp_t *output)
+{
+ const unsigned int id = get_global_id(0);
+ fp_t fMax = oclMax(id,values,startArray,endArray);
+ output[id] = fMax - subtract[id];
+}
+__kernel void oclMaxMul(__global fp_t *values, __global fp_t *multiplier, __global uint *startArray, __global uint *endArray, __global fp_t *output)
+{
+ const unsigned int id = get_global_id(0);
+ fp_t fMax = oclMax(id,values,startArray,endArray);
+ output[id] = fMax * multiplier[id];
+}
+__kernel void oclMaxDiv(__global fp_t *values, __global fp_t *div, __global uint *startArray, __global uint *endArray, __global fp_t *output)
+{
+ const unsigned int id = get_global_id(0);
+ fp_t fMax = oclMax(id,values,startArray,endArray);
+ fp_t divisor = div[id];
+ if ( divisor != 0 )
+ output[id] = fMax / divisor;
+ else
+ output[id] = 0.0;
+}
+
+__kernel void oclSub( fp_t ltData, __global fp_t *rtData, __global fp_t *outData )
+{
+ const unsigned int id = get_global_id(0);
+ outData[id] = ltData - rtData[id];
+}
);
#endif // USE_EXTERNAL_KERNEL
diff --git a/sc/source/core/opencl/openclwrapper.cxx b/sc/source/core/opencl/openclwrapper.cxx
index 47b906b..6db498b 100644
--- a/sc/source/core/opencl/openclwrapper.cxx
+++ b/sc/source/core/opencl/openclwrapper.cxx
@@ -7,19 +7,19 @@
* file, You can obtain one at http://mozilla.org/MPL/2.0/.
*/
-#include "openclwrapper.hxx"
-
+#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <cmath>
#include "sal/config.h"
#include "random.hxx"
+#include "openclwrapper.hxx"
#include "oclkernels.hxx"
#ifdef SAL_WIN32
#include <Windows.h>
#endif
-//#define USE_KERNEL_FILE
-
+//#define USE_MAP_BUFFER
+using namespace std;
GPUEnv OpenclDevice::gpuEnv;
int OpenclDevice::isInited =0;
@@ -44,9 +44,9 @@ int OpenclDevice::isInited =0;
HINSTANCE HOpenclDll = NULL;
void * OpenclDll = NULL;
-int OpenclDevice::LoadOpencl()
+int OpenclDevice::loadOpencl()
{
- //fprintf(stderr, " LoadOpenclDllxx... \n");
+ //fprintf(stderr, " loadOpenclDllxx... \n");
OpenclDll = static_cast<HINSTANCE>( HOpenclDll );
OpenclDll = LoadLibrary( OPENCL_DLL_NAME );
if ( !static_cast<HINSTANCE>( OpenclDll ) )
@@ -59,7 +59,7 @@ int OpenclDevice::LoadOpencl()
return OCLSUCCESS;
}
-void OpenclDevice::FreeOpenclDll()
+void OpenclDevice::freeOpenclDll()
{
fprintf(stderr, " Free opencllo.dll ... \n");
if ( !static_cast<HINSTANCE>( OpenclDll ) )
@@ -67,39 +67,39 @@ void OpenclDevice::FreeOpenclDll()
}
#endif
-int OpenclDevice::InitEnv()
+int OpenclDevice::initEnv()
{
#ifdef SAL_WIN32
while( 1 )
{
- if( 1 == LoadOpencl() )
+ if( 1 == loadOpencl() )
break;
}
#endif
- InitOpenclRunEnv( 0 );
+ initOpenclRunEnv( 0 );
return 1;
}
-int OpenclDevice::ReleaseOpenclRunEnv()
+int OpenclDevice::releaseOpenclRunEnv()
{
- ReleaseOpenclEnv( &gpuEnv );
+ releaseOpenclEnv( &gpuEnv );
#ifdef SAL_WIN32
- FreeOpenclDll();
+ freeOpenclDll();
#endif
return 1;
}
///////////////////////////////////////////////////////
///////////////////////////////////////////////////////
-inline int OpenclDevice::AddKernelConfig( int kCount, const char *kName )
+inline int OpenclDevice::addKernelConfig( int kCount, const char *kName )
{
if ( kCount < 1 )
- fprintf(stderr,"Error: ( KCount < 1 )" SAL_DETAIL_WHERE "AddKernelConfig\n" );
+ fprintf(stderr,"Error: ( KCount < 1 )" SAL_DETAIL_WHERE "addKernelConfig\n" );
strcpy( gpuEnv.mArrykernelNames[kCount-1], kName );
gpuEnv.mnKernelCount++;
return 0;
}
-int OpenclDevice::RegistOpenclKernel()
+int OpenclDevice::registOpenclKernel()
{
if ( !gpuEnv.mnIsUserCreated )
memset( &gpuEnv, 0, sizeof(gpuEnv) );
@@ -107,37 +107,58 @@ int OpenclDevice::RegistOpenclKernel()
gpuEnv.mnFileCount = 0; //argc;
gpuEnv.mnKernelCount = 0UL;
- AddKernelConfig( 1, (const char*) "oclFormulaMin" );
- AddKernelConfig( 2, (const char*) "oclFormulaMax" );
- AddKernelConfig( 3, (const char*) "oclFormulaSum" );
- AddKernelConfig( 4, (const char*) "oclFormulaCount" );
- AddKernelConfig( 5, (const char*) "oclFormulaAverage" );
- AddKernelConfig( 6, (const char*) "oclFormulaSumproduct" );
- AddKernelConfig( 7, (const char*) "oclFormulaMtxInv" );
-
- AddKernelConfig( 8, (const char*) "oclSignedAdd" );
- AddKernelConfig( 9, (const char*) "oclSignedSub" );
- AddKernelConfig( 10, (const char*) "oclSignedMul" );
- AddKernelConfig( 11, (const char*) "oclSignedDiv" );
- AddKernelConfig( 12, (const char*) "oclAverageDelta" );
- AddKernelConfig( 13, (const char*) "oclMaxDelta" );
- AddKernelConfig( 14, (const char*) "oclMinDelta" );
- AddKernelConfig( 15, (const char*) "oclSubDelta" );
- AddKernelConfig( 16, (const char*) "oclLUDecomposition" );
+ addKernelConfig( 1, (const char*) "oclFormulaMin" );
+ addKernelConfig( 2, (const char*) "oclFormulaMax" );
+ addKernelConfig( 3, (const char*) "oclFormulaSum" );
+ addKernelConfig( 4, (const char*) "oclFormulaCount" );
+ addKernelConfig( 5, (const char*) "oclFormulaAverage" );
+ addKernelConfig( 6, (const char*) "oclFormulaSumproduct" );
+ addKernelConfig( 7, (const char*) "oclFormulaMtxInv" );
+
+ addKernelConfig( 8, (const char*) "oclSignedAdd" );
+ addKernelConfig( 9, (const char*) "oclSignedSub" );
+ addKernelConfig( 10, (const char*) "oclSignedMul" );
+ addKernelConfig( 11, (const char*) "oclSignedDiv" );
+ addKernelConfig( 12, (const char*) "oclAverageDelta" );
+ addKernelConfig( 13, (const char*) "oclMaxDelta" );
+ addKernelConfig( 14, (const char*) "oclMinDelta" );
+ addKernelConfig( 15, (const char*) "oclSubDelta" );
+ addKernelConfig( 16, (const char*) "oclLUDecomposition" );
+ addKernelConfig( 17, (const char*) "oclAverageDeltaRPN" );
+ addKernelConfig( 18, (const char*) "oclMaxDeltaRPN" );
+ addKernelConfig( 19, (const char*) "oclMinDeltaRPN" );
+ addKernelConfig( 20, (const char*) "oclMoreColArithmeticOperator" );
+ addKernelConfig( 21, (const char*) "oclColumnH" );
+ addKernelConfig( 22, (const char*) "oclColumnL" );
+ addKernelConfig( 23, (const char*) "oclColumnN" );
+ addKernelConfig( 24, (const char*) "oclColumnJ" );
+ addKernelConfig( 25, (const char*) "oclMaxSub" );
+ addKernelConfig( 26, (const char*) "oclAverageSub" );
+ addKernelConfig( 27, (const char*) "oclMinSub" );
+ addKernelConfig( 28, (const char*) "oclMaxAdd" );
+ addKernelConfig( 29, (const char*) "oclAverageAdd" );
+ addKernelConfig( 30, (const char*) "oclMinAdd" );
+ addKernelConfig( 31, (const char*) "oclMaxMul" );
+ addKernelConfig( 32, (const char*) "oclAverageMul" );
+ addKernelConfig( 33, (const char*) "oclMinMul" );
+ addKernelConfig( 34, (const char*) "oclMaxDiv" );
+ addKernelConfig( 35, (const char*) "oclAverageDiv" );
+ addKernelConfig( 36, (const char*) "oclMinDiv" );
+ addKernelConfig( 37, (const char*) "oclSub" );// for svDouble type
return 0;
}
OpenclDevice::OpenclDevice()
{
- //InitEnv();
+ //initEnv();
}
OpenclDevice::~OpenclDevice()
{
- //ReleaseOpenclRunEnv();
+ //releaseOpenclRunEnv();
}
-int OpenclDevice::SetKernelEnv( KernelEnv *envInfo )
+int OpenclDevice::setKernelEnv( KernelEnv *envInfo )
{
envInfo->mpkContext = gpuEnv.mpContext;
envInfo->mpkCmdQueue = gpuEnv.mpCmdQueue;
@@ -146,12 +167,12 @@ int OpenclDevice::SetKernelEnv( KernelEnv *envInfo )
return 1;
}
-int OpenclDevice::CheckKernelName( KernelEnv *envInfo, const char *kernelName )
+int OpenclDevice::checkKernelName( KernelEnv *envInfo, const char *kernelName )
{
- //printf("CheckKernelName,total count of kernels...%d\n", gpuEnv.kernelCount);
+ //printf("checkKernelName,total count of kernels...%d\n", gpuEnv.kernelCount);
int kCount;
int nFlag = 0;
- for ( kCount = 0; kCount < gpuEnv.mnKernelCount; kCount++ )
+ for ( kCount=0; kCount < gpuEnv.mnKernelCount; kCount++ )
{
if ( strcasecmp( kernelName, gpuEnv.mArrykernelNames[kCount]) == 0 )
{
@@ -174,7 +195,7 @@ int OpenclDevice::CheckKernelName( KernelEnv *envInfo, const char *kernelName )
return 1;
}
-int OpenclDevice::ConvertToString( const char *filename, char **source )
+int OpenclDevice::convertToString( const char *filename, char **source )
{
int file_size;
size_t result;
@@ -211,7 +232,7 @@ int OpenclDevice::ConvertToString( const char *filename, char **source )
return 0;
}
-int OpenclDevice::BinaryGenerated( const char * clFileName, FILE ** fhandle )
+int OpenclDevice::binaryGenerated( const char * clFileName, FILE ** fhandle )
{
unsigned int i = 0;
cl_int clStatus;
@@ -219,16 +240,28 @@ int OpenclDevice::BinaryGenerated( const char * clFileName, FILE ** fhandle )
char *str = NULL;
FILE *fd = NULL;
cl_uint numDevices=0;
- clStatus = clGetDeviceIDs(gpuEnv.mpPlatformID, // platform
- CL_DEVICE_TYPE_GPU, // device_type
- 0, // num_entries
- NULL, // devices ID
- &numDevices);
+ if ( getenv("SC_OPENCLCPU") )
+ {
+ clStatus = clGetDeviceIDs(gpuEnv.mpPlatformID, // platform
+ CL_DEVICE_TYPE_CPU, // device_type for CPU device
+ 0, // num_entries
+ NULL, // devices ID
+ &numDevices);
+ }
+ else
+ {
+ clStatus = clGetDeviceIDs(gpuEnv.mpPlatformID, // platform
+ CL_DEVICE_TYPE_GPU, // device_type for GPU device
+ 0, // num_entries
+ NULL, // devices ID
+ &numDevices);
+ }
+ CHECK_OPENCL( clStatus, "clGetDeviceIDs" );
for ( i = 0; i < numDevices; i++ )
{
+ char fileName[256] = { 0 }, cl_name[128] = { 0 };
if ( gpuEnv.mpArryDevsID[i] != 0 )
{
- char fileName[256] = { 0 }, cl_name[128] = { 0 };
char deviceName[1024];
clStatus = clGetDeviceInfo( gpuEnv.mpArryDevsID[i], CL_DEVICE_NAME, sizeof(deviceName), deviceName, NULL );
CHECK_OPENCL( clStatus, "clGetDeviceInfo" );
@@ -248,7 +281,7 @@ int OpenclDevice::BinaryGenerated( const char * clFileName, FILE ** fhandle )
}
-int OpenclDevice::WriteBinaryToFile( const char* fileName, const char* birary, size_t numBytes )
+int OpenclDevice::writeBinaryToFile( const char* fileName, const char* birary, size_t numBytes )
{
FILE *output = NULL;
output = fopen( fileName, "wb" );
@@ -264,7 +297,7 @@ int OpenclDevice::WriteBinaryToFile( const char* fileName, const char* birary, s
}
-int OpenclDevice::GeneratBinFromKernelSource( cl_program program, const char * clFileName )
+int OpenclDevice::generatBinFromKernelSource( cl_program program, const char * clFileName )
{
unsigned int i = 0;
cl_int clStatus;
@@ -307,12 +340,6 @@ int OpenclDevice::GeneratBinFromKernelSource( cl_program program, const char * c
binaries[i] = (char*) malloc( sizeof(char) * binarySizes[i] );
if ( binaries[i] == NULL )
{
- for ( unsigned int j = 0; j < i ; j++)
- {
- if (binaries[j])
- free(binaries[j]);
- }
- free(binaries);
return 0;
}
}
@@ -329,9 +356,10 @@ int OpenclDevice::GeneratBinFromKernelSource( cl_program program, const char * c
/* dump out each binary into its own separate file. */
for ( i = 0; i < numDevices; i++ )
{
+ char fileName[256] = { 0 }, cl_name[128] = { 0 };
+
if ( binarySizes[i] != 0 )
{
- char fileName[256] = { 0 }, cl_name[128] = { 0 };
char deviceName[1024];
clStatus = clGetDeviceInfo(mpArryDevsID[i], CL_DEVICE_NAME,
sizeof(deviceName), deviceName, NULL);
@@ -342,7 +370,7 @@ int OpenclDevice::GeneratBinFromKernelSource( cl_program program, const char * c
cl_name[str - clFileName] = '\0';
sprintf( fileName, "./%s-%s.bin", cl_name, deviceName );
- if ( !WriteBinaryToFile( fileName, binaries[i], binarySizes[i] ) )
+ if ( !writeBinaryToFile( fileName, binaries[i], binarySizes[i] ) )
{
printf("opencl-wrapper: write binary[%s] failds\n", fileName);
return 0;
@@ -381,7 +409,7 @@ int OpenclDevice::GeneratBinFromKernelSource( cl_program program, const char * c
return 1;
}
-int OpenclDevice::InitOpenclAttr( OpenCLEnv * env )
+int OpenclDevice::initOpenclAttr( OpenCLEnv * env )
{
if ( gpuEnv.mnIsUserCreated )
return 1;
@@ -396,7 +424,7 @@ int OpenclDevice::InitOpenclAttr( OpenCLEnv * env )
return 0;
}
-int OpenclDevice::CreateKernel( char * kernelname, KernelEnv * env )
+int OpenclDevice::createKernel( char * kernelname, KernelEnv * env )
{
int clStatus;
@@ -406,13 +434,13 @@ int OpenclDevice::CreateKernel( char * kernelname, KernelEnv * env )
return clStatus != CL_SUCCESS ? 1 : 0;
}
-int OpenclDevice::ReleaseKernel( KernelEnv * env )
+int OpenclDevice::releaseKernel( KernelEnv * env )
{
int clStatus = clReleaseKernel( env->mpkKernel );
return clStatus != CL_SUCCESS ? 1 : 0;
}
-int OpenclDevice::ReleaseOpenclEnv( GPUEnv *gpuInfo )
+int OpenclDevice::releaseOpenclEnv( GPUEnv *gpuInfo )
{
int i = 0;
int clStatus = 0;
@@ -447,18 +475,18 @@ int OpenclDevice::ReleaseOpenclEnv( GPUEnv *gpuInfo )
return 1;
}
-int OpenclDevice::RunKernelWrapper( cl_kernel_function function, const char * kernelName, void **usrdata )
+int OpenclDevice::runKernelWrapper( cl_kernel_function function, const char * kernelName, void **usrdata )
{
- printf("oclwrapper:RunKernel_wrapper...\n");
- if ( RegisterKernelWrapper( kernelName, function ) != 1 )
+ printf("oclwrapper:runKernel_wrapper...\n");
+ if ( registerKernelWrapper( kernelName, function ) != 1 )
{
- fprintf(stderr, "Error:RunKernel_wrapper:RegisterKernelWrapper fail!\n");
+ fprintf(stderr, "Error:runKernel_wrapper:registerKernelWrapper fail!\n");
return -1;
}
- return ( RunKernel( kernelName, usrdata ) );
+ return ( runKernel( kernelName, usrdata ) );
}
-int OpenclDevice::CachedOfKernerPrg( const GPUEnv *gpuEnvCached, const char * clFileName )
+int OpenclDevice::cachedOfKernerPrg( const GPUEnv *gpuEnvCached, const char * clFileName )
{
int i;
for ( i = 0; i < gpuEnvCached->mnFileCount; i++ )
@@ -475,19 +503,20 @@ int OpenclDevice::CachedOfKernerPrg( const GPUEnv *gpuEnvCached, const char * cl
return 0;
}
-int OpenclDevice::CompileKernelFile( GPUEnv *gpuInfo, const char *buildOption )
+int OpenclDevice::compileKernelFile( GPUEnv *gpuInfo, const char *buildOption )
{
cl_int clStatus = 0;
size_t length;
- char *binary;
+ char *buildLog = NULL, *binary;
const char *source;
size_t source_size[1];
- int binary_status, binaryExisted, idx;
- cl_uint numDevices;
- FILE *fd;
+ int b_error, binary_status, binaryExisted, idx;
+ size_t numDevices;
+ cl_device_id *mpArryDevsID;
+ FILE *fd, *fd1;
const char* filename = "kernel.cl";
- fprintf(stderr, "CompileKernelFile ... \n");
- if ( CachedOfKernerPrg(gpuInfo, filename) == 1 )
+ fprintf(stderr, "compileKernelFile ... \n");
+ if ( cachedOfKernerPrg(gpuInfo, filename) == 1 )
{
return 1;
}
@@ -498,33 +527,31 @@ int OpenclDevice::CompileKernelFile( GPUEnv *gpuInfo, const char *buildOption )
source_size[0] = strlen( source );
binaryExisted = 0;
- if ( ( binaryExisted = BinaryGenerated( filename, &fd ) ) == 1 )
+ if ( ( binaryExisted = binaryGenerated( filename, &fd ) ) == 1 )
{
clStatus = clGetContextInfo( gpuInfo->mpContext, CL_CONTEXT_NUM_DEVICES,
sizeof(numDevices), &numDevices, NULL );
CHECK_OPENCL( clStatus, "clGetContextInfo" );
- cl_device_id *mpArryDevsID = (cl_device_id*) malloc( sizeof(cl_device_id) * numDevices );
+ mpArryDevsID = (cl_device_id*) malloc( sizeof(cl_device_id) * numDevices );
if ( mpArryDevsID == NULL )
{
return 0;
}
- int b_error = 0;
+ b_error = 0;
length = 0;
b_error |= fseek( fd, 0, SEEK_END ) < 0;
b_error |= ( length = ftell(fd) ) <= 0;
b_error |= fseek( fd, 0, SEEK_SET ) < 0;
if ( b_error )
{
- free(mpArryDevsID);
return 0;
}
binary = (char*) malloc( length + 2 );
if ( !binary )
{
- free(mpArryDevsID);
return 0;
}
@@ -537,12 +564,7 @@ int OpenclDevice::CompileKernelFile( GPUEnv *gpuInfo, const char *buildOption )
// grab the handles to all of the devices in the context.
clStatus = clGetContextInfo( gpuInfo->mpContext, CL_CONTEXT_DEVICES,
sizeof( cl_device_id ) * numDevices, mpArryDevsID, NULL );
- if (clStatus != CL_SUCCESS)
- {
- fprintf (stderr, "OpenCL error code is %d at " SAL_DETAIL_WHERE " when clGetContextInfo .\n", clStatus);
- free(binary);
- return 0;
- }
+ CHECK_OPENCL( clStatus, "clGetContextInfo" );
fprintf(stderr, "Create kernel from binary\n");
gpuInfo->mpArryPrograms[idx] = clCreateProgramWithBinary( gpuInfo->mpContext,numDevices,
@@ -600,7 +622,7 @@ int OpenclDevice::CompileKernelFile( GPUEnv *gpuInfo, const char *buildOption )
printf("opencl create build log fail\n");
return 0;
}
- char* buildLog = (char*) malloc( length );
+ buildLog = (char*) malloc( length );
if ( buildLog == (char*) NULL )
{
return 0;
@@ -618,11 +640,10 @@ int OpenclDevice::CompileKernelFile( GPUEnv *gpuInfo, const char *buildOption )
if ( clStatus != CL_SUCCESS )
{
printf("opencl program build info fail\n");
- free(buildLog);
return 0;
}
- FILE *fd1 = fopen( "kernel-build.log", "w+" );
+ fd1 = fopen( "kernel-build.log", "w+" );
if ( fd1 != NULL )
{
fwrite( buildLog, sizeof(char), length, fd1 );
@@ -636,17 +657,17 @@ int OpenclDevice::CompileKernelFile( GPUEnv *gpuInfo, const char *buildOption )
strcpy( gpuEnv.mArryKnelSrcFile[idx], filename );
if ( binaryExisted == 0 )
- GeneratBinFromKernelSource( gpuEnv.mpArryPrograms[idx], filename );
+ generatBinFromKernelSource( gpuEnv.mpArryPrograms[idx], filename );
gpuInfo->mnFileCount += 1;
return 1;
}
-int OpenclDevice::GetKernelEnvAndFunc( const char *kernelName, KernelEnv *env, cl_kernel_function *function)
+int OpenclDevice::getKernelEnvAndFunc( const char *kernelName, KernelEnv *env, cl_kernel_function *function)
{
int i;
- //printf("----------------OpenclDevice::GetKernelEnvAndFunc\n");
+ //printf("----------------OpenclDevice::getKernelEnvAndFunc\n");
for ( i = 0; i < gpuEnv.mnKernelCount; i++ )
{
if ( strcasecmp( kernelName, gpuEnv.mArrykernelNames[i]) == 0 )
@@ -662,14 +683,14 @@ int OpenclDevice::GetKernelEnvAndFunc( const char *kernelName, KernelEnv *env, c
return 0;
}
-int OpenclDevice::RunKernel( const char *kernelName, void **userdata)
+int OpenclDevice::runKernel( const char *kernelName, void **userdata)
{
KernelEnv kEnv;
cl_kernel_function function;
int status;
memset( &kEnv, 0, sizeof( KernelEnv ) );
- status = GetKernelEnvAndFunc( kernelName, &kEnv, &function );
+ status = getKernelEnvAndFunc( kernelName, &kEnv, &function );
strcpy( kEnv.mckKernelName, kernelName );
if ( status == 1 )
{
@@ -680,8 +701,9 @@ int OpenclDevice::RunKernel( const char *kernelName, void **userdata)
return 0;
}
-int OpenclDevice::InitOpenclRunEnv( int argc )
+int OpenclDevice::initOpenclRunEnv( int argc )
{
+ int status = 0;
if ( MAX_CLKERNEL_NUM <= 0 )
{
return 1;
@@ -691,9 +713,9 @@ int OpenclDevice::InitOpenclRunEnv( int argc )
if ( !isInited )
{
- RegistOpenclKernel();
+ registOpenclKernel();
//initialize devices, context, comand_queue
- int status = InitOpenclRunEnv( &gpuEnv );
+ status = initOpenclRunEnv( &gpuEnv );
if ( status )
{
printf("init_opencl_env failed.\n");
@@ -709,36 +731,38 @@ int OpenclDevice::InitOpenclRunEnv( int argc )
if( gpuEnv.mnKhrFp64Flag )
{
printf("----use khr double type in kernel----\n");
- status = CompileKernelFile( &gpuEnv, "-D KHR_DP_EXTENSION -Dfp_t=double" );
+ status = compileKernelFile( &gpuEnv, "-D KHR_DP_EXTENSION -Dfp_t=double -Dfp_t4=double4 -Dfp_t16=double16" );
}
else if( gpuEnv.mnAmdFp64Flag )
{
printf("----use amd double type in kernel----\n");
- status = CompileKernelFile( &gpuEnv, "-D AMD_DP_EXTENSION -Dfp_t=double" );
+ status = compileKernelFile( &gpuEnv, "-D AMD_DP_EXTENSION -Dfp_t=double -Dfp_t4=double4 -Dfp_t16=double16" );
}
else
{
printf("----use float type in kernel----\n");
- status = CompileKernelFile( &gpuEnv, "-Dfp_t=float" );
+ status = compileKernelFile( &gpuEnv, "-Dfp_t=float -Dfp_t4=float4 -Dfp_t16=float16" );
}
if ( status == 0 || gpuEnv.mnKernelCount == 0 )
{
- printf("CompileKernelFile failed.\n");
+ printf("compileKernelFile failed.\n");
return 1;
}
- printf("CompileKernelFile successed.\n");
+ printf("compileKernelFile successed.\n");
isInited = 1;
}
return 0;
}
-int OpenclDevice::InitOpenclRunEnv( GPUEnv *gpuInfo )
+int OpenclDevice::initOpenclRunEnv( GPUEnv *gpuInfo )
{
size_t length;
cl_int clStatus;
cl_uint numPlatforms, numDevices;
cl_platform_id *platforms;
cl_context_properties cps[3];
+ char platformName[256];
+ unsigned int i;
// Have a look at the available platforms.
@@ -765,8 +789,7 @@ int OpenclDevice::InitOpenclRunEnv( GPUEnv *gpuInfo )
return 1;
}
- char platformName[256];
- for ( size_t i = 0; i < numPlatforms; i++ )
+ for ( i = 0; i < numPlatforms; i++ )
{
clStatus = clGetPlatformInfo( platforms[i], CL_PLATFORM_VENDOR,
sizeof( platformName ), platformName, NULL );
@@ -781,13 +804,22 @@ int OpenclDevice::InitOpenclRunEnv( GPUEnv *gpuInfo )
//if( !strcmp( platformName, "Advanced Micro Devices, Inc." ))
{
gpuInfo->mpPlatformID = platforms[i];
-
- clStatus = clGetDeviceIDs(gpuInfo->mpPlatformID, // platform
- CL_DEVICE_TYPE_GPU, // device_type
- 0, // num_entries
- NULL, // devices
- &numDevices);
-
+ if ( getenv("SC_OPENCLCPU") )
+ {
+ clStatus = clGetDeviceIDs(gpuInfo->mpPlatformID, // platform
+ CL_DEVICE_TYPE_CPU, // device_type for CPU device
+ 0, // num_entries
+ NULL, // devices
+ &numDevices);
+ }
+ else
+ {
+ clStatus = clGetDeviceIDs(gpuInfo->mpPlatformID, // platform
+ CL_DEVICE_TYPE_GPU, // device_type for GPU device
+ 0, // num_entries
+ NULL, // devices
+ &numDevices);
+ }
if ( clStatus != CL_SUCCESS )
continue;
@@ -806,8 +838,15 @@ int OpenclDevice::InitOpenclRunEnv( GPUEnv *gpuInfo )
cps[0] = CL_CONTEXT_PLATFORM;
cps[1] = (cl_context_properties) gpuInfo->mpPlatformID;
cps[2] = 0;
- // Check for GPU.
- gpuInfo->mDevType = CL_DEVICE_TYPE_GPU;
+ // Set device type for OpenCL
+ if ( getenv("SC_OPENCLCPU") )
+ {
+ gpuInfo->mDevType = CL_DEVICE_TYPE_CPU;
+ }
+ else
+ {
+ gpuInfo->mDevType = CL_DEVICE_TYPE_GPU;
+ }
gpuInfo->mpContext = clCreateContextFromType( cps, gpuInfo->mDevType, NULL, NULL, &clStatus );
if ( ( gpuInfo->mpContext == (cl_context) NULL) || ( clStatus != CL_SUCCESS ) )
@@ -845,7 +884,6 @@ int OpenclDevice::InitOpenclRunEnv( GPUEnv *gpuInfo )
}
clStatus = clGetCommandQueueInfo( gpuInfo->mpCmdQueue, CL_QUEUE_THREAD_HANDLE_AMD, 0, NULL, NULL );
- CHECK_OPENCL( clStatus, "GetCommandQueueInfo" );
// Check device extensions for double type
size_t aDevExtInfoSize = 0;
@@ -875,10 +913,10 @@ int OpenclDevice::InitOpenclRunEnv( GPUEnv *gpuInfo )
return 0;
}
-int OpenclDevice::RegisterKernelWrapper( const char *kernelName, cl_kernel_function function )
+int OpenclDevice::registerKernelWrapper( const char *kernelName, cl_kernel_function function )
{
int i;
- //printf("oclwrapper:RegisterKernelWrapper...%d\n", gpuEnv.mnKernelCount);
+ //printf("oclwrapper:registerKernelWrapper...%d\n", gpuEnv.mnKernelCount);
for ( i = 0; i < gpuEnv.mnKernelCount; i++ )
{
if ( strcasecmp( kernelName, gpuEnv.mArrykernelNames[i]) == 0 )
@@ -890,13 +928,13 @@ int OpenclDevice::RegisterKernelWrapper( const char *kernelName, cl_kernel_funct
return 0;
}
-void OpenclDevice::SetOpenclState( int state )
+void OpenclDevice::setOpenclState( int state )
{
//printf("OpenclDevice::setOpenclState...\n");
isInited = state;
}
-int OpenclDevice::GetOpenclState()
+int OpenclDevice::getOpenclState()
{
return isInited;
}
@@ -906,54 +944,46 @@ OclCalc::OclCalc()
fprintf(stderr,"OclCalc:: init opencl ...\n");
nFormulaColSize = 0;
nFormulaRowSize = 0;
+ nArithmeticLen = 0;
+ nFormulaLen = 0;
+ mpClmemSrcData = NULL;
+ mpClmemStartPos = NULL;
+ mpClmemEndPos = NULL;
+ mpClmemLeftData = NULL;
+ mpClmemRightData = NULL;
+ mpClmemMergeLfData = NULL;
+ mpClmemMergeRtData = NULL;
+ mpClmemMatixSumSize = NULL;
+ mpClmemeOp = NULL;
}
OclCalc::~OclCalc()
{
- fprintf(stderr,"OclCalc:: opencl end ...\n");
+ releaseOclBuffer();
}
-
-/////////////////////////////////////////////////////////////////////////////
-int OclCalc::CreateBuffer64Bits( double *&dpSrcData, uint *&npStartPos, uint *&npEndPos, int nBufferSize )
+int OclCalc::releaseOclBuffer(void)
{
cl_int clStatus = 0;
- SetKernelEnv( &kEnv );
-
- mpClmemSrcData = clCreateBuffer( kEnv.mpkContext, (cl_mem_flags) (CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR),
- nBufferSize * sizeof(double), NULL, &clStatus );
- CHECK_OPENCL( clStatus, "clCreateBuffer" );
- mpClmemStartPos = clCreateBuffer( kEnv.mpkContext, (cl_mem_flags) (CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR),
- nBufferSize * sizeof(unsigned int), NULL, &clStatus );
- CHECK_OPENCL( clStatus, "clCreateBuffer" );
- mpClmemEndPos = clCreateBuffer( kEnv.mpkContext, (cl_mem_flags) (CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR),
- nBufferSize * sizeof(unsigned int), NULL, &clStatus );
- CHECK_OPENCL( clStatus, "clCreateBuffer" );
-
- dpSrcData = (double *)clEnqueueMapBuffer( kEnv.mpkCmdQueue, mpClmemSrcData, CL_TRUE, CL_MAP_WRITE_INVALIDATE_REGION, 0,
- nBufferSize * sizeof(double), 0, NULL, NULL, &clStatus );
- CHECK_OPENCL(clStatus,"clEnqueueMapBuffer");
- clFinish(kEnv.mpkCmdQueue);
- npStartPos = (uint *)clEnqueueMapBuffer( kEnv.mpkCmdQueue, mpClmemStartPos, CL_TRUE, CL_MAP_WRITE_INVALIDATE_REGION, 0,
- nBufferSize * sizeof(uint), 0, NULL, NULL, &clStatus );
- CHECK_OPENCL( clStatus, "clEnqueueMapBuffer" );
- clFinish( kEnv.mpkCmdQueue );
- npEndPos = (uint *)clEnqueueMapBuffer( kEnv.mpkCmdQueue, mpClmemEndPos, CL_TRUE, CL_MAP_WRITE_INVALIDATE_REGION, 0,
- nBufferSize * sizeof(uint), 0, NULL, NULL, &clStatus );
- CHECK_OPENCL( clStatus,"clEnqueueMapBuffer" );
- clFinish( kEnv.mpkCmdQueue );
- //printf("In CreateBuffer, pptrr is %d,%d,%d\n",fpSrcData,npStartPos,npEndPos);
- return 0;
+ CHECK_OPENCL_RELEASE( clStatus, mpClmemSrcData );
+ CHECK_OPENCL_RELEASE( clStatus, mpClmemStartPos );
+ CHECK_OPENCL_RELEASE( clStatus, mpClmemEndPos );
+ CHECK_OPENCL_RELEASE( clStatus, mpClmemLeftData );
+ CHECK_OPENCL_RELEASE( clStatus, mpClmemRightData );
+ fprintf(stderr,"OclCalc:: opencl end ...\n");
+ return 1;
}
-int OclCalc::CreateBuffer64Bits( double *&dpLeftData, double *&dpRightData, int nBufferSize )
+/////////////////////////////////////////////////////////////////////////////
+
+int OclCalc::createBuffer64Bits( double *&dpLeftData, double *&dpRightData, int nBufferSize )
{
cl_int clStatus = 0;
- SetKernelEnv( &kEnv );
+ setKernelEnv( &kEnv );
- mpClmemLeftData = clCreateBuffer( kEnv.mpkContext, (cl_mem_flags) (CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR),
+ mpClmemLeftData = clCreateBuffer( kEnv.mpkContext, (cl_mem_flags) (CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR ),
... etc. - the rest is truncated
More information about the Libreoffice-commits
mailing list