[Libreoffice-commits] core.git: Branch 'private/tml/splitgroup' - 2 commits - include/clew include/opencl opencl/source sc/source

Tue Sep 15 05:45:35 PDT 2015

Rebased ref, commits from common ancestor:
commit 5a16406bf276bffce7b493ceaccf1085d42554c3
Author: Tor Lillqvist <tml at collabora.com>
Date:   Tue Sep 15 13:27:18 2015 +0300

    Use heuristic to find out whether to split formula groups for OpenCL
    
    It is necessary to avoid performing overly large OpenCL computations
    on some low-end devices on Windows, as the driver will otherwise be
    unresponsive for too long. On these devices, the
    CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT is 4, while for more performant
    devices it is 1 or 8.

diff --git a/include/clew/clew.h b/include/clew/clew.h
index 94b6c29..e5cfaf0 100644
--- a/include/clew/clew.h
+++ b/include/clew/clew.h
@@ -416,6 +416,7 @@ typedef struct _cl_image_format {
 
 // cl_device_info
 #define CL_DEVICE_MAX_COMPUTE_UNITS                 0x1002
+#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT      0x100A
 #define CL_DEVICE_MAX_CLOCK_FREQUENCY               0x100C
 #define CL_DEVICE_GLOBAL_MEM_SIZE                   0x101F
 #define CL_DEVICE_NAME                              0x102B
diff --git a/include/opencl/openclwrapper.hxx b/include/opencl/openclwrapper.hxx
index 75ecbc8..e3f967e 100644
--- a/include/opencl/openclwrapper.hxx
+++ b/include/opencl/openclwrapper.hxx
@@ -52,6 +52,7 @@ struct GPUEnv
     int mnCmdQueuePos;
     bool mnKhrFp64Flag;
     bool mnAmdFp64Flag;
+    cl_uint mnPreferredVectorWidthFloat;
 };
 
 extern OPENCL_DLLPUBLIC GPUEnv gpuEnv;
diff --git a/opencl/source/openclwrapper.cxx b/opencl/source/openclwrapper.cxx
index 5574d2c..9d03a27 100644
--- a/opencl/source/openclwrapper.cxx
+++ b/opencl/source/openclwrapper.cxx
@@ -501,6 +501,11 @@ bool initOpenCLRunEnv( GPUEnv *gpuInfo )
     gpuInfo->mnKhrFp64Flag = bKhrFp64;
     gpuInfo->mnAmdFp64Flag = bAmdFp64;
 
+    gpuInfo->mnPreferredVectorWidthFloat = 0;
+
+    clGetDeviceInfo(gpuInfo->mpArryDevsID[0], CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT, sizeof(cl_uint),
+                    &gpuInfo->mnPreferredVectorWidthFloat, NULL);
+
     return false;
 }
 
diff --git a/sc/source/core/data/formulacell.cxx b/sc/source/core/data/formulacell.cxx
index 774aed3..7f8180f 100644
--- a/sc/source/core/data/formulacell.cxx
+++ b/sc/source/core/data/formulacell.cxx
@@ -55,6 +55,7 @@
 #include "types.hxx"
 #include "scopetools.hxx"
 #include "refupdatecontext.hxx"
+#include <opencl/openclwrapper.hxx>
 #include <tokenstringcontext.hxx>
 #include <refhint.hxx>
 #include <listenerquery.hxx>
@@ -3838,10 +3839,21 @@ bool ScFormulaCell::InterpretFormulaGroup()
 
     // Should obviously be based on some heuristics based on the kind
     // of OpenCL device or some of its properties.
-    const int MAXGROUPLENGTH = (std::getenv("MAXGROUPLENGTH") ? std::atoi(std::getenv("MAXGROUPLENGTH")) : 1000);
+    int nMaxGroupLength = INT_MAX;
+
+#ifdef WNT
+    // Heuristic: Certain old low-end OpenCL implementations don't
+    // work for us with too large group lengths. 1000 was determined
+    // empirically to be a good compromise.
+    if (opencl::gpuEnv.mnPreferredVectorWidthFloat == 4)
+        nMaxGroupLength = 1000;
+#endif
+
+    if (std::getenv("SC_MAX_GROUP_LENGTH"))
+        nMaxGroupLength = std::atoi(std::getenv("SC_MAX_GROUP_LENGTH"));
 
     int nNumOnePlus;
-    const int nNumParts = splitup(GetSharedLength(), MAXGROUPLENGTH, nNumOnePlus);
+    const int nNumParts = splitup(GetSharedLength(), nMaxGroupLength, nNumOnePlus);
 
     int nOffset = 0;
     int nCurChunkSize;
@@ -3856,7 +3868,7 @@ bool ScFormulaCell::InterpretFormulaGroup()
             xGroup = mxGroup;
         else
         {
-            // Possibly incorrect hack
+            // Ugly hack
             xGroup = new ScFormulaCellGroup();
             xGroup->mpTopCell = mxGroup->mpTopCell;
             xGroup->mpTopCell->aPos = aOrigPos;
commit 3a01670c881cd51417aa48eb1347df04c7b2f9b1
Author: Tor Lillqvist <tml at collabora.com>
Date:   Thu Sep 10 21:58:28 2015 +0300

    Split formula group for OpenCL up into smaller bits
    
    Will make it less demanding on low-end hardware. For now split into
    pieces at most 1000 rows long, but this will need to use some
    heuristic that actually detects when it is necessary to split.
    
    I tried several approaches of splitting, both at higher levels in sc
    and at the lowest level just before creating and executing the OpenCL
    kernel(s). This seems to be the most minimal and local approach. Doing
    it at a lower level would have required too much poking into our
    obscure OpenCL code, like passing an "offset" parameter to every
    kernel.
    
    Change-Id: Iec1416441e0323724c9b64589310faa61a7da5f0

diff --git a/sc/source/core/data/formulacell.cxx b/sc/source/core/data/formulacell.cxx
index eb2b374..774aed3 100644
--- a/sc/source/core/data/formulacell.cxx
+++ b/sc/source/core/data/formulacell.cxx
@@ -20,6 +20,7 @@
 #include <sal/config.h>
 
 #include <cassert>
+#include <cstdlib>
 
 #include "formulacell.hxx"
 #include "grouptokenconverter.hxx"
@@ -3770,6 +3771,36 @@ ScFormulaCell::CompareState ScFormulaCell::CompareByTokenArray( ScFormulaCell& r
     return bInvariant ? EqualInvariant : EqualRelativeRef;
 }
 
+namespace {
+
+// Split N into optimally equal-sized pieces, each not larger than K.
+// Return value P is number of pieces. A returns the number of pieces
+// one larger than N/P, 0..P-1.
+
+int splitup(int N, int K, int& A)
+{
+    assert(N > 0);
+    assert(K > 0);
+
+    A = 0;
+
+    if (N <= K)
+        return 1;
+
+    const int ideal_num_parts = N / K;
+    if (ideal_num_parts * K == N)
+        return ideal_num_parts;
+
+    const int num_parts = ideal_num_parts + 1;
+    const int nominal_part_size = N / num_parts;
+
+    A = N - num_parts * nominal_part_size;
+
+    return num_parts;
+}
+
+} // anonymous namespace
+
 bool ScFormulaCell::InterpretFormulaGroup()
 {
     if (!officecfg::Office::Common::Misc::UseOpenCL::get())
@@ -3805,30 +3836,84 @@ bool ScFormulaCell::InterpretFormulaGroup()
     if (mxGroup->mbInvariant && false)
         return InterpretInvariantFormulaGroup();
 
-    ScTokenArray aCode;
-    ScAddress aTopPos = aPos;
-    aTopPos.SetRow(mxGroup->mpTopCell->aPos.Row());
-    ScGroupTokenConverter aConverter(aCode, *pDocument, *this, mxGroup->mpTopCell->aPos);
-    std::vector<ScTokenArray*> aLoopControl;
-    if (!aConverter.convert(*pCode, aLoopControl))
-    {
-        SAL_INFO("sc.opencl", "conversion of group " << this << " failed, disabling");
-        mxGroup->meCalcState = sc::GroupCalcDisabled;
-        return false;
-    }
+    // Should obviously be based on some heuristics based on the kind
+    // of OpenCL device or some of its properties.
+    const int MAXGROUPLENGTH = (std::getenv("MAXGROUPLENGTH") ? std::atoi(std::getenv("MAXGROUPLENGTH")) : 1000);
 
-    // The converted code does not have RPN tokens yet.  The interpreter will
-    // generate them.
-    mxGroup->meCalcState = sc::GroupCalcRunning;
-    sc::FormulaGroupInterpreter *pInterpreter = sc::FormulaGroupInterpreter::getStatic();
-    if (pInterpreter == NULL ||
-        !pInterpreter->interpret(*pDocument, mxGroup->mpTopCell->aPos, mxGroup, aCode))
+    int nNumOnePlus;
+    const int nNumParts = splitup(GetSharedLength(), MAXGROUPLENGTH, nNumOnePlus);
+
+    int nOffset = 0;
+    int nCurChunkSize;
+    ScAddress aOrigPos = mxGroup->mpTopCell->aPos;
+    for (int i = 0; i < nNumParts; i++, nOffset += nCurChunkSize)
     {
-        SAL_INFO("sc.opencl", "interpreting group " << mxGroup << " (state " << (int) mxGroup->meCalcState << ") failed, disabling");
-        mxGroup->meCalcState = sc::GroupCalcDisabled;
-        return false;
+        nCurChunkSize = GetSharedLength()/nNumParts + (i < nNumOnePlus ? 1 : 0);
+
+        ScFormulaCellGroupRef xGroup;
+
+        if (nNumParts == 1)
+            xGroup = mxGroup;
+        else
+        {
+            // Possibly incorrect hack
+            xGroup = new ScFormulaCellGroup();
+            xGroup->mpTopCell = mxGroup->mpTopCell;
+            xGroup->mpTopCell->aPos = aOrigPos;
+            xGroup->mpTopCell->aPos.IncRow(nOffset);
+            xGroup->mbInvariant = mxGroup->mbInvariant;
+            xGroup->mnLength = nCurChunkSize;
+            xGroup->mpCode = mxGroup->mpCode;
+        }
+
+        ScTokenArray aCode;
+        ScGroupTokenConverter aConverter(aCode, *pDocument, *this, xGroup->mpTopCell->aPos);
+        std::vector<ScTokenArray*> aLoopControl;
+        if (!aConverter.convert(*pCode, aLoopControl))
+        {
+            SAL_INFO("sc.opencl", "conversion of group " << this << " failed, disabling");
+            mxGroup->meCalcState = sc::GroupCalcDisabled;
+
+            // Undo the hack above
+            if (nNumParts > 1)
+            {
+                mxGroup->mpTopCell->aPos = aOrigPos;
+                xGroup->mpTopCell = NULL;
+                xGroup->mpCode = NULL;
+            }
+
+            return false;
+        }
+
+        // The converted code does not have RPN tokens yet.  The interpreter will
+        // generate them.
+        xGroup->meCalcState = mxGroup->meCalcState = sc::GroupCalcRunning;
+        sc::FormulaGroupInterpreter *pInterpreter = sc::FormulaGroupInterpreter::getStatic();
+        if (pInterpreter == NULL ||
+            !pInterpreter->interpret(*pDocument, xGroup->mpTopCell->aPos, xGroup, aCode))
+        {
+            SAL_INFO("sc.opencl", "interpreting group " << mxGroup << " (state " << (int) mxGroup->meCalcState << ") failed, disabling");
+            mxGroup->meCalcState = sc::GroupCalcDisabled;
+
+            // Undo the hack above
+            if (nNumParts > 1)
+            {
+                mxGroup->mpTopCell->aPos = aOrigPos;
+                xGroup->mpTopCell = NULL;
+                xGroup->mpCode = NULL;
+            }
+
+            return false;
+        }
+        if (nNumParts > 1)
+        {
+            xGroup->mpTopCell = NULL;
+            xGroup->mpCode = NULL;
+        }
     }
 
+    if (nNumParts > 1)
+        mxGroup->mpTopCell->aPos = aOrigPos;
     mxGroup->meCalcState = sc::GroupCalcEnabled;
     return true;
 }