[Libreoffice-commits] core.git: include/clew include/opencl opencl/source sc/source
Tor Lillqvist
tml at collabora.com
Tue Sep 15 08:44:39 PDT 2015
include/clew/clew.h | 1
include/opencl/openclwrapper.hxx | 1
opencl/source/openclwrapper.cxx | 5 +
sc/source/core/data/formulacell.cxx | 134 +++++++++++++++++++++++++++++++-----
4 files changed, 123 insertions(+), 18 deletions(-)
New commits:
commit d6a5aac0f903b292db57bb4a613e603aa029c78b
Author: Tor Lillqvist <tml at collabora.com>
Date: Thu Sep 10 21:58:28 2015 +0300
Split formula group for OpenCL up into smaller bits when necessary
Will make it less demanding on low-end hardware, where the device
driver is unresponsive for too long when an OpenCL kernel handling lots
of data is executing. This makes Windows restart the driver which is
problematic.
I tried several approaches of splitting, both at higher levels in sc
and at the lowest level just before creating and executing the OpenCL
kernel(s). This seems to be the most minimal and local approach. Doing
it at the lower level would have required too much poking into our
obscure OpenCL code, like passing an offset parameter to every kernel.
Use a simple heuristic to find out whether to split. On the
problematic low-end devices, CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT is
4, while for more performant devices it is 1 or 8.
diff --git a/include/clew/clew.h b/include/clew/clew.h
index 94b6c29..e5cfaf0 100644
--- a/include/clew/clew.h
+++ b/include/clew/clew.h
@@ -416,6 +416,7 @@ typedef struct _cl_image_format {
// cl_device_info
#define CL_DEVICE_MAX_COMPUTE_UNITS 0x1002
+#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT 0x100A
#define CL_DEVICE_MAX_CLOCK_FREQUENCY 0x100C
#define CL_DEVICE_GLOBAL_MEM_SIZE 0x101F
#define CL_DEVICE_NAME 0x102B
diff --git a/include/opencl/openclwrapper.hxx b/include/opencl/openclwrapper.hxx
index 04fe1e3..fe67772 100644
--- a/include/opencl/openclwrapper.hxx
+++ b/include/opencl/openclwrapper.hxx
@@ -51,6 +51,7 @@ struct GPUEnv
int mnCmdQueuePos;
bool mnKhrFp64Flag;
bool mnAmdFp64Flag;
+ cl_uint mnPreferredVectorWidthFloat;
};
extern OPENCL_DLLPUBLIC GPUEnv gpuEnv;
diff --git a/opencl/source/openclwrapper.cxx b/opencl/source/openclwrapper.cxx
index ea0f3f8..79aabba 100644
--- a/opencl/source/openclwrapper.cxx
+++ b/opencl/source/openclwrapper.cxx
@@ -501,6 +501,11 @@ bool initOpenCLRunEnv( GPUEnv *gpuInfo )
gpuInfo->mnKhrFp64Flag = bKhrFp64;
gpuInfo->mnAmdFp64Flag = bAmdFp64;
+ gpuInfo->mnPreferredVectorWidthFloat = 0;
+
+ clGetDeviceInfo(gpuInfo->mpArryDevsID[0], CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT, sizeof(cl_uint),
+ &gpuInfo->mnPreferredVectorWidthFloat, NULL);
+
return false;
}
diff --git a/sc/source/core/data/formulacell.cxx b/sc/source/core/data/formulacell.cxx
index 27796ae..14d3d44 100644
--- a/sc/source/core/data/formulacell.cxx
+++ b/sc/source/core/data/formulacell.cxx
@@ -20,6 +20,7 @@
#include <sal/config.h>
#include <cassert>
+#include <cstdlib>
#include "formulacell.hxx"
#include "grouptokenconverter.hxx"
@@ -54,6 +55,7 @@
#include "types.hxx"
#include "scopetools.hxx"
#include "refupdatecontext.hxx"
+#include <opencl/openclwrapper.hxx>
#include <tokenstringcontext.hxx>
#include <refhint.hxx>
#include <listenerquery.hxx>
@@ -3899,6 +3901,36 @@ ScFormulaCell::CompareState ScFormulaCell::CompareByTokenArray( ScFormulaCell& r
return bInvariant ? EqualInvariant : EqualRelativeRef;
}
+namespace {
+
+// Split N into optimally equal-sized pieces, each not larger than K.
+// Return value P is number of pieces. A returns the number of pieces
+// one larger than N/P, 0..P-1.
+
+int splitup(int N, int K, int& A)
+{
+ assert(N > 0);
+ assert(K > 0);
+
+ A = 0;
+
+ if (N <= K)
+ return 1;
+
+ const int ideal_num_parts = N / K;
+ if (ideal_num_parts * K == N)
+ return ideal_num_parts;
+
+ const int num_parts = ideal_num_parts + 1;
+ const int nominal_part_size = N / num_parts;
+
+ A = N - num_parts * nominal_part_size;
+
+ return num_parts;
+}
+
+} // anonymous namespace
+
bool ScFormulaCell::InterpretFormulaGroup()
{
if (!officecfg::Office::Common::Misc::UseOpenCL::get())
@@ -3934,28 +3966,94 @@ bool ScFormulaCell::InterpretFormulaGroup()
if (mxGroup->mbInvariant && false)
return InterpretInvariantFormulaGroup();
- ScTokenArray aCode;
- ScGroupTokenConverter aConverter(aCode, *pDocument, *this, mxGroup->mpTopCell->aPos);
- std::vector<ScTokenArray*> aLoopControl;
- if (!aConverter.convert(*pCode, aLoopControl))
- {
- SAL_INFO("sc.opencl", "conversion of group " << this << " failed, disabling");
- mxGroup->meCalcState = sc::GroupCalcDisabled;
- return false;
- }
+ int nMaxGroupLength = INT_MAX;
+
+#ifdef WNT
+ // Heuristic: Certain old low-end OpenCL implementations don't
+ // work for us with too large group lengths. 1000 was determined
+ // empirically to be a good compromise. Looking at the preferred
+ // float vector width seems to be a way to detect these devices.
+ if (opencl::gpuEnv.mnPreferredVectorWidthFloat == 4)
+ nMaxGroupLength = 1000;
+#endif
+
+ if (std::getenv("SC_MAX_GROUP_LENGTH"))
+ nMaxGroupLength = std::atoi(std::getenv("SC_MAX_GROUP_LENGTH"));
+
+ int nNumOnePlus;
+ const int nNumParts = splitup(GetSharedLength(), nMaxGroupLength, nNumOnePlus);
- // The converted code does not have RPN tokens yet. The interpreter will
- // generate them.
- mxGroup->meCalcState = sc::GroupCalcRunning;
- sc::FormulaGroupInterpreter *pInterpreter = sc::FormulaGroupInterpreter::getStatic();
- if (pInterpreter == NULL ||
- !pInterpreter->interpret(*pDocument, mxGroup->mpTopCell->aPos, mxGroup, aCode))
+ int nOffset = 0;
+ int nCurChunkSize;
+ ScAddress aOrigPos = mxGroup->mpTopCell->aPos;
+ for (int i = 0; i < nNumParts; i++, nOffset += nCurChunkSize)
{
- SAL_INFO("sc.opencl", "interpreting group " << mxGroup << " (state " << (int) mxGroup->meCalcState << ") failed, disabling");
- mxGroup->meCalcState = sc::GroupCalcDisabled;
- return false;
+ nCurChunkSize = GetSharedLength()/nNumParts + (i < nNumOnePlus ? 1 : 0);
+
+ ScFormulaCellGroupRef xGroup;
+
+ if (nNumParts == 1)
+ xGroup = mxGroup;
+ else
+ {
+ // Ugly hack
+ xGroup = new ScFormulaCellGroup();
+ xGroup->mpTopCell = mxGroup->mpTopCell;
+ xGroup->mpTopCell->aPos = aOrigPos;
+ xGroup->mpTopCell->aPos.IncRow(nOffset);
+ xGroup->mbInvariant = mxGroup->mbInvariant;
+ xGroup->mnLength = nCurChunkSize;
+ xGroup->mpCode = mxGroup->mpCode;
+ }
+
+ ScTokenArray aCode;
+ ScGroupTokenConverter aConverter(aCode, *pDocument, *this, xGroup->mpTopCell->aPos);
+ std::vector<ScTokenArray*> aLoopControl;
+ if (!aConverter.convert(*pCode, aLoopControl))
+ {
+ SAL_INFO("sc.opencl", "conversion of group " << this << " failed, disabling");
+ mxGroup->meCalcState = sc::GroupCalcDisabled;
+
+ // Undo the hack above
+ if (nNumParts > 1)
+ {
+ mxGroup->mpTopCell->aPos = aOrigPos;
+ xGroup->mpTopCell = NULL;
+ xGroup->mpCode = NULL;
+ }
+
+ return false;
+ }
+
+ // The converted code does not have RPN tokens yet. The interpreter will
+ // generate them.
+ xGroup->meCalcState = mxGroup->meCalcState = sc::GroupCalcRunning;
+ sc::FormulaGroupInterpreter *pInterpreter = sc::FormulaGroupInterpreter::getStatic();
+ if (pInterpreter == NULL ||
+ !pInterpreter->interpret(*pDocument, xGroup->mpTopCell->aPos, xGroup, aCode))
+ {
+ SAL_INFO("sc.opencl", "interpreting group " << mxGroup << " (state " << (int) mxGroup->meCalcState << ") failed, disabling");
+ mxGroup->meCalcState = sc::GroupCalcDisabled;
+
+ // Undo the hack above
+ if (nNumParts > 1)
+ {
+ mxGroup->mpTopCell->aPos = aOrigPos;
+ xGroup->mpTopCell = NULL;
+ xGroup->mpCode = NULL;
+ }
+
+ return false;
+ }
+ if (nNumParts > 1)
+ {
+ xGroup->mpTopCell = NULL;
+ xGroup->mpCode = NULL;
+ }
}
+ if (nNumParts > 1)
+ mxGroup->mpTopCell->aPos = aOrigPos;
mxGroup->meCalcState = sc::GroupCalcEnabled;
return true;
}
More information about the Libreoffice-commits
mailing list