[Libreoffice-commits] core.git: Branch 'feature/fixes11' - sc/source
Tomaž Vajngerl
tomaz.vajngerl at collabora.com
Wed Nov 4 15:37:21 PST 2015
sc/source/core/inc/arraysumfunctor.hxx | 58 ++++++++++++++++++++++-----------
1 file changed, 39 insertions(+), 19 deletions(-)
New commits:
commit f814b00bc908c5498156194f45bf8f9c0b8268ac
Author: Tomaž Vajngerl <tomaz.vajngerl at collabora.com>
Date: Thu Nov 5 00:32:44 2015 +0100
Fast array sum: aligned load, process 8 doubles per loop
* isAligned checks if a pointer address is aligned to the required
number of bytes.
* Process the array element by element until the pointer is 16-byte aligned.
Change-Id: Id42a8a2628b2797f7870ec8cd29a183087f9911e
diff --git a/sc/source/core/inc/arraysumfunctor.hxx b/sc/source/core/inc/arraysumfunctor.hxx
index 9e4ce97..fc1b915 100644
--- a/sc/source/core/inc/arraysumfunctor.hxx
+++ b/sc/source/core/inc/arraysumfunctor.hxx
@@ -17,6 +17,12 @@
namespace sc
{
+template<typename T, unsigned int N>
+inline bool isAligned(const T* pointer)
+{
+ return 0 == (uintptr_t(pointer) % N);
+}
+
struct ArraySumFunctor
{
private:
@@ -30,18 +36,26 @@ public:
{
}
- double operator() () const
+ double operator() ()
{
static bool hasSSE2 = tools::cpuid::hasSSE2();
printf("SSE used %d\n", hasSSE2);
double fSum = 0.0;
size_t i = 0;
+ const double* pCurrent = mpArray;
if (hasSSE2)
- fSum += executeSSE2(i);
+ {
+ while (!isAligned<double, 16>(pCurrent))
+ {
+ fSum += *pCurrent++;
+ i++;
+ }
+ fSum += executeSSE2(i, pCurrent);
+ }
else
- fSum += executeUnrolled(i);
+ fSum += executeUnrolled(i, pCurrent);
// sum rest of the array
@@ -52,27 +66,34 @@ public:
}
private:
- inline double executeSSE2(size_t& i) const
+ inline double executeSSE2(size_t& i, const double* pCurrent) const
{
double fSum = 0.0;
- size_t nUnrolledSize = mnSize - (mnSize % 4);
+ size_t nRealSize = mnSize - i;
+ size_t nUnrolledSize = nRealSize - (nRealSize % 8);
if (nUnrolledSize > 0)
{
- register __m128d sum1 = _mm_set_pd(0.0, 0.0);
- register __m128d sum2 = _mm_set_pd(0.0, 0.0);
+ __m128d sum1 = _mm_setzero_pd();
+ __m128d sum2 = _mm_setzero_pd();
+ __m128d sum3 = _mm_setzero_pd();
+ __m128d sum4 = _mm_setzero_pd();
- const double* pCurrent = mpArray;
-
- for (; i < nUnrolledSize; i += 4)
+ for (; i < nUnrolledSize; i += 8)
{
- sum1 = _mm_add_pd(sum1, _mm_loadu_pd(pCurrent));
- pCurrent += 2;
+ __m128d load1 = _mm_load_pd(&pCurrent[i]);
+ sum1 = _mm_add_pd(sum1, load1);
+
+ __m128d load2 = _mm_load_pd(&pCurrent[i + 2]);
+ sum2 = _mm_add_pd(sum2, load2);
- sum2 = _mm_add_pd(sum2, _mm_loadu_pd(pCurrent));
- pCurrent += 2;
+ __m128d load3 = _mm_load_pd(&pCurrent[i + 4]);
+ sum3 = _mm_add_pd(sum3, load3);
+
+ __m128d load4 = _mm_load_pd(&pCurrent[i + 6]);
+ sum4 = _mm_add_pd(sum4, load4);
}
- sum1 = _mm_add_pd(sum1, sum2);
+ sum1 = _mm_add_pd(_mm_add_pd(sum1, sum2), _mm_add_pd(sum3, sum4));
double temp;
@@ -85,9 +106,10 @@ private:
return fSum;
}
- inline double executeUnrolled(size_t& i) const
+ inline double executeUnrolled(size_t& i, const double* pCurrent) const
{
- size_t nUnrolledSize = mnSize - (mnSize % 4);
+ size_t nRealSize = mnSize - i;
+ size_t nUnrolledSize = nRealSize - (nRealSize % 4);
if (nUnrolledSize > 0)
{
@@ -96,8 +118,6 @@ private:
double sum2 = 0.0;
double sum3 = 0.0;
- const double* pCurrent = mpArray;
-
for (; i < nUnrolledSize; i += 4)
{
sum0 += *pCurrent++;
More information about the Libreoffice-commits
mailing list