[Libreoffice-commits] core.git: Branch 'feature/fixes12' - include/tools sc/source tools/Library_tl.mk tools/source

Tomaž Vajngerl tomaz.vajngerl at collabora.com
Fri Nov 13 07:01:36 PST 2015


 include/tools/cpuid.hxx                |   27 ++++++
 sc/source/core/inc/arraysumfunctor.hxx |  141 +++++++++++++++++++++++++++++++++
 sc/source/core/tool/interpr6.cxx       |   15 ---
 tools/Library_tl.mk                    |    1 
 tools/source/misc/cpuid.cxx            |   56 +++++++++++++
 5 files changed, 229 insertions(+), 11 deletions(-)

New commits:
commit a3f02cf37400265b60977f36c7d8bde45c3425ba
Author: Tomaž Vajngerl <tomaz.vajngerl at collabora.com>
Date:   Fri Nov 13 12:24:35 2015 +0100

    arraysumfunctor: fast sum a double array, use for SUM() in Calc
    
    Commits cherry-picked from master:
    
    1. tools: runtime SSE/SSE2 detection
    (commit 154bcd887d3772addc8196944044fa57738d3cf2)
    
    Change-Id: I29330061e2986ec2ae899c2f3a63d0eadd9cc194
    
    2. arraysumfunctor: fast sum a double array, use for SUM() in Calc
    (commit 5493402fb37a1def960c93f7c31aff36a5ab5f9e)
    
    This adds an array sum functor which sums a double array as
    fast as possible. There are 2 implementations: SSE2 and a
    simple unrolled implementation. The SSE2 implementation is used
    if SSE2 is detected at runtime.
    
    Additional info:
    The SSE2 implementation first processes the array element by
    element until it is aligned to a 16-byte boundary (this should
    take at most 1 element). Then the array is processed by summing
    8 values in one pass (using 4 variables that are 128 bits wide),
    as one SSE2 operation can process 2 double values in one call.
    
    Change-Id: I24494b08cae049aa3eabcb086867f1bdd4128374
    
    3. remove SSE detection code (but keep SSE2)
    (commit 726ce582abb800a809ac144f50a7aa20e3fadcef)
    
    For corner case CPUs out there that support SSE and not SSE2 it
    makes more sense to use the "fallback" code path instead of
    writing a SSE only version. For this reason detecting SSE is not
    relevant anymore - so removing it.
    
    Change-Id: I3f1425af2cb5cdf9fba699e2996014598a15b5c1

diff --git a/include/tools/cpuid.hxx b/include/tools/cpuid.hxx
new file mode 100644
index 0000000..2445129
--- /dev/null
+++ b/include/tools/cpuid.hxx
@@ -0,0 +1,27 @@
+/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
+/*
+ * This file is part of the LibreOffice project.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ */
+
+#ifndef INCLUDED_TOOLS_CPUID_HXX
+#define INCLUDED_TOOLS_CPUID_HXX
+
+#include <sal/config.h>
+#include <tools/toolsdllapi.h>
+
+namespace tools
+{
+namespace cpuid
+{
+    TOOLS_DLLPUBLIC bool hasSSE2(); // runtime check: true if the CPU reports SSE2 support
+}
+}
+
+#endif
+
+/* vim:set shiftwidth=4 softtabstop=4 expandtab: */
diff --git a/sc/source/core/inc/arraysumfunctor.hxx b/sc/source/core/inc/arraysumfunctor.hxx
new file mode 100644
index 0000000..776c514
--- /dev/null
+++ b/sc/source/core/inc/arraysumfunctor.hxx
@@ -0,0 +1,141 @@
+/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
+/*
+ * This file is part of the LibreOffice project.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ */
+
+#ifndef INCLUDED_SC_SOURCE_CORE_INC_ARRAYSUMFUNCTOR_HXX
+#define INCLUDED_SC_SOURCE_CORE_INC_ARRAYSUMFUNCTOR_HXX
+
+#include <emmintrin.h>
+#include <tools/cpuid.hxx>
+
+namespace sc
+{
+
+template<typename T, unsigned int N> // T: pointee type, N: alignment to test for, in bytes
+inline bool isAligned(const T* pointer)
+{
+    return 0 == (uintptr_t(pointer) % N); // true when the address is a multiple of N
+}
+
+/** Sums an array of doubles. Uses the SSE2 code path when tools::cpuid::hasSSE2()
+    reports SSE2 at runtime, and a plain 4x unrolled loop otherwise. */
+struct ArraySumFunctor
+{
+private:
+    const double* mpArray; // first element of the array to sum
+    size_t mnSize;         // number of elements in mpArray
+
+public:
+    ArraySumFunctor(const double* pArray, size_t nSize)
+        : mpArray(pArray)
+        , mnSize(nSize)
+    {
+    }
+
+    /** @return the sum of the mnSize elements starting at mpArray (0.0 if mnSize is 0) */
+    double operator() ()
+    {
+        static const bool hasSSE2 = tools::cpuid::hasSSE2();
+
+        double fSum = 0.0;
+        size_t i = 0;
+        const double* pCurrent = mpArray;
+
+        if (hasSSE2)
+        {
+            // _mm_load_pd requires a 16-byte aligned address, so add leading elements one
+            // by one until pCurrent is aligned - but never walk past the end of the array.
+            while (i < mnSize && !isAligned<double, 16>(pCurrent))
+            {
+                fSum += *pCurrent++;
+                i++;
+            }
+            fSum += executeSSE2(i, pCurrent);
+        }
+        else
+            fSum += executeUnrolled(i, pCurrent);
+
+        // sum the elements that did not fill a whole unrolled pass
+        for (; i < mnSize; ++i)
+            fSum += mpArray[i];
+
+        return fSum;
+    }
+
+private:
+    /** Sums blocks of 8 doubles starting at pCurrent, which must be 16-byte aligned and
+        equal to mpArray + i. Advances i past the summed elements and returns their sum. */
+    inline double executeSSE2(size_t& i, const double* pCurrent) const
+    {
+        double fSum = 0.0;
+        size_t nRealSize = mnSize - i;
+        size_t nUnrolledSize = nRealSize - (nRealSize % 8);
+
+        if (nUnrolledSize > 0)
+        {
+            // four separate accumulators, each holding two partial sums
+            __m128d sum1 = _mm_setzero_pd();
+            __m128d sum2 = _mm_setzero_pd();
+            __m128d sum3 = _mm_setzero_pd();
+            __m128d sum4 = _mm_setzero_pd();
+
+            // nUnrolledSize counts elements from the start offset i, so the end index is i + count
+            for (const size_t nEnd = i + nUnrolledSize; i < nEnd; i += 8)
+            {
+                sum1 = _mm_add_pd(sum1, _mm_load_pd(pCurrent));
+                sum2 = _mm_add_pd(sum2, _mm_load_pd(pCurrent + 2));
+                sum3 = _mm_add_pd(sum3, _mm_load_pd(pCurrent + 4));
+                sum4 = _mm_add_pd(sum4, _mm_load_pd(pCurrent + 6));
+                pCurrent += 8;
+            }
+            sum1 = _mm_add_pd(_mm_add_pd(sum1, sum2), _mm_add_pd(sum3, sum4));
+
+            // add the low and then the high double of sum1 into the scalar result
+            double temp;
+            _mm_storel_pd(&temp, sum1);
+            fSum += temp;
+
+            _mm_storeh_pd(&temp, sum1);
+            fSum += temp;
+        }
+        return fSum;
+    }
+
+    /** Fallback without SSE2: sums blocks of 4 doubles starting at pCurrent, which must
+        equal mpArray + i. Advances i past the summed elements and returns their sum. */
+    inline double executeUnrolled(size_t& i, const double* pCurrent) const
+    {
+        size_t nRealSize = mnSize - i;
+        size_t nUnrolledSize = nRealSize - (nRealSize % 4);
+
+        if (nUnrolledSize > 0)
+        {
+            double sum0 = 0.0;
+            double sum1 = 0.0;
+            double sum2 = 0.0;
+            double sum3 = 0.0;
+
+            for (const size_t nEnd = i + nUnrolledSize; i < nEnd; i += 4)
+            {
+                sum0 += *pCurrent++;
+                sum1 += *pCurrent++;
+                sum2 += *pCurrent++;
+                sum3 += *pCurrent++;
+            }
+            return sum0 + sum1 + sum2 + sum3;
+        }
+        return 0.0;
+    }
+};
+
+} // end namespace sc
+
+#endif
+
+/* vim:set shiftwidth=4 softtabstop=4 expandtab: */
diff --git a/sc/source/core/tool/interpr6.cxx b/sc/source/core/tool/interpr6.cxx
index a4a75f2..5bf4533 100644
--- a/sc/source/core/tool/interpr6.cxx
+++ b/sc/source/core/tool/interpr6.cxx
@@ -26,6 +26,8 @@
 #include "mtvcellfunc.hxx"
 #include "scmatrix.hxx"
 
+#include "arraysumfunctor.hxx"
+
 #include <formula/token.hxx>
 
 using namespace formula;
@@ -235,18 +237,9 @@ public:
                 if (nDataSize == 0)
                     return;
 
-                size_t nUnrolled = (nDataSize & 0x3) >> 2;
+                sc::ArraySumFunctor functor(p, nDataSize);
 
-                // Try to encourage the compiler/CPU to do something sensible for the next.
-                for (i = 0; i < nUnrolled; i+=4)
-                {
-                    mfRest += p[i];
-                    mfRest += p[i+1];
-                    mfRest += p[i+2];
-                    mfRest += p[i+3];
-                }
-                for (; i < nDataSize; ++i)
-                    mfRest += p[i];
+                mfRest += functor();
                 break;
             }
 
diff --git a/tools/Library_tl.mk b/tools/Library_tl.mk
index 2d105cd..65ba17c 100644
--- a/tools/Library_tl.mk
+++ b/tools/Library_tl.mk
@@ -69,6 +69,7 @@ $(eval $(call gb_Library_add_exception_objects,tl,\
     tools/source/memtools/multisel \
     tools/source/memtools/unqidx \
     tools/source/misc/appendunixshellword \
+    tools/source/misc/cpuid \
     tools/source/misc/extendapplicationenvironment \
     tools/source/misc/getprocessworkingdir \
     tools/source/misc/solarmutex \
diff --git a/tools/source/misc/cpuid.cxx b/tools/source/misc/cpuid.cxx
new file mode 100644
index 0000000..b4406be
--- /dev/null
+++ b/tools/source/misc/cpuid.cxx
@@ -0,0 +1,56 @@
+/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
+/*
+ * This file is part of the LibreOffice project.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ */
+
+#include <tools/cpuid.hxx>
+#include <cstdint>
+
+namespace tools
+{
+namespace cpuid
+{
+
+// Runtime detection is only implemented for MSVC or GCC-compatible compilers targeting x86 / x64
+#if (defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86))) || (defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__)))
+
+namespace
+{
+#if defined(_MSC_VER)
+#include <intrin.h>
+static void getCpuId(uint32_t array[4]) // MSVC: query CPUID function 1 into array
+{
+    __cpuid((int*)array, 1);
+}
+#else
+#include <cpuid.h>
+static void getCpuId(uint32_t array[4]) // GCC-compatible: query CPUID leaf 1 into array (eax, ebx, ecx, edx)
+{
+    __get_cpuid(1, array + 0, array + 1, array + 2, array + 3);
+}
+#endif
+}
+
+bool hasSSE2()
+{
+    uint32_t cpuInfoArray[] = {0, 0, 0, 0}; // stays all zero if the query writes nothing
+    getCpuId(cpuInfoArray);
+    return (cpuInfoArray[3] & (1 << 26)) != 0; // bit 26 of the fourth value (EDX) is the SSE2 flag
+}
+
+#else
+
+bool hasSSE2() { return false; } // other compilers / architectures: always report no SSE2
+
+#endif
+
+}
+}
+
+
+/* vim:set shiftwidth=4 softtabstop=4 expandtab: */


More information about the Libreoffice-commits mailing list