[Libreoffice-commits] core.git: Branch 'feature/fixes11' - sc/source

Tomaž Vajngerl tomaz.vajngerl at collabora.com
Wed Nov 4 15:37:21 PST 2015


 sc/source/core/inc/arraysumfunctor.hxx |   58 ++++++++++++++++++++++-----------
 1 file changed, 39 insertions(+), 19 deletions(-)

New commits:
commit f814b00bc908c5498156194f45bf8f9c0b8268ac
Author: Tomaž Vajngerl <tomaz.vajngerl at collabora.com>
Date:   Thu Nov 5 00:32:44 2015 +0100

    Fast array sum: aligned load, process 8 doubles per loop
    
    * isAligned checks whether a pointer address is aligned to the
    required number of bytes.
    
    * Process the array one element at a time until the pointer is
    16-byte aligned, then continue with aligned SSE2 loads.
    
    Change-Id: Id42a8a2628b2797f7870ec8cd29a183087f9911e

diff --git a/sc/source/core/inc/arraysumfunctor.hxx b/sc/source/core/inc/arraysumfunctor.hxx
index 9e4ce97..fc1b915 100644
--- a/sc/source/core/inc/arraysumfunctor.hxx
+++ b/sc/source/core/inc/arraysumfunctor.hxx
@@ -17,6 +17,12 @@
 namespace sc
 {
 
+template<typename T, unsigned int N>
+inline bool isAligned(const T* pointer)
+{
+    return 0 == (uintptr_t(pointer) % N);
+}
+
 struct ArraySumFunctor
 {
 private:
@@ -30,18 +36,26 @@ public:
     {
     }
 
-    double operator() () const
+    double operator() ()
     {
         static bool hasSSE2 = tools::cpuid::hasSSE2();
         printf("SSE used %d\n", hasSSE2);
 
         double fSum = 0.0;
         size_t i = 0;
+        const double* pCurrent = mpArray;
 
         if (hasSSE2)
-            fSum += executeSSE2(i);
+        {
+            while (!isAligned<double, 16>(pCurrent))
+            {
+                fSum += *pCurrent++;
+                i++;
+            }
+            fSum += executeSSE2(i, pCurrent);
+        }
         else
-            fSum += executeUnrolled(i);
+            fSum += executeUnrolled(i, pCurrent);
 
         // sum rest of the array
 
@@ -52,27 +66,34 @@ public:
     }
 
 private:
-    inline double executeSSE2(size_t& i) const
+    inline double executeSSE2(size_t& i, const double* pCurrent) const
     {
         double fSum = 0.0;
-        size_t nUnrolledSize = mnSize - (mnSize % 4);
+        size_t nRealSize = mnSize - i;
+        size_t nUnrolledSize = nRealSize - (nRealSize % 8);
 
         if (nUnrolledSize > 0)
         {
-            register __m128d sum1 = _mm_set_pd(0.0, 0.0);
-            register __m128d sum2 = _mm_set_pd(0.0, 0.0);
+            __m128d sum1 = _mm_setzero_pd();
+            __m128d sum2 = _mm_setzero_pd();
+            __m128d sum3 = _mm_setzero_pd();
+            __m128d sum4 = _mm_setzero_pd();
 
-            const double* pCurrent = mpArray;
-
-            for (; i < nUnrolledSize; i += 4)
+            for (; i < nUnrolledSize; i += 8)
             {
-                sum1 = _mm_add_pd(sum1, _mm_loadu_pd(pCurrent));
-                pCurrent += 2;
+                __m128d load1 = _mm_load_pd(&pCurrent[i]);
+                sum1 = _mm_add_pd(sum1, load1);
+
+                __m128d load2 = _mm_load_pd(&pCurrent[i + 2]);
+                sum2 = _mm_add_pd(sum2, load2);
 
-                sum2 = _mm_add_pd(sum2, _mm_loadu_pd(pCurrent));
-                pCurrent += 2;
+                __m128d load3 = _mm_load_pd(&pCurrent[i + 4]);
+                sum3 = _mm_add_pd(sum3, load3);
+
+                __m128d load4 = _mm_load_pd(&pCurrent[i + 6]);
+                sum4 = _mm_add_pd(sum4, load4);
             }
-            sum1 = _mm_add_pd(sum1, sum2);
+            sum1 = _mm_add_pd(_mm_add_pd(sum1, sum2), _mm_add_pd(sum3, sum4));
 
             double temp;
 
@@ -85,9 +106,10 @@ private:
         return fSum;
     }
 
-    inline double executeUnrolled(size_t& i) const
+    inline double executeUnrolled(size_t& i, const double* pCurrent) const
     {
-        size_t nUnrolledSize = mnSize - (mnSize % 4);
+        size_t nRealSize = mnSize - i;
+        size_t nUnrolledSize = nRealSize - (nRealSize % 4);
 
         if (nUnrolledSize > 0)
         {
@@ -96,8 +118,6 @@ private:
             double sum2 = 0.0;
             double sum3 = 0.0;
 
-            const double* pCurrent = mpArray;
-
             for (; i < nUnrolledSize; i += 4)
             {
                 sum0 += *pCurrent++;


More information about the Libreoffice-commits mailing list