[Mesa-dev] [PATCH 2/3] mesa/main: Add sse2 streaming clamping

Wed Nov 12 11:47:58 PST 2014

On 12.11.2014 19:36, Bruno Jimenez wrote:
> On Wed, 2014-11-12 at 14:50 +0200, Juha-Pekka Heikkila wrote:
>> Signed-off-by: Juha-Pekka Heikkila <juhapekka.heikkila at gmail.com>
>> ---
>>  src/mesa/Makefile.am          |   8 +++
>>  src/mesa/main/sse2_clamping.c | 138 ++++++++++++++++++++++++++++++++++++++++++
>>  src/mesa/main/sse2_clamping.h |  49 +++++++++++++++
>>  3 files changed, 195 insertions(+)
>>  create mode 100644 src/mesa/main/sse2_clamping.c
>>  create mode 100644 src/mesa/main/sse2_clamping.h
>>
>> diff --git a/src/mesa/Makefile.am b/src/mesa/Makefile.am
>> index 932db4f..43dbe87 100644
>> --- a/src/mesa/Makefile.am
>> +++ b/src/mesa/Makefile.am
>> @@ -111,6 +111,10 @@ if SSE41_SUPPORTED
>>  ARCH_LIBS += libmesa_sse41.la
>>  endif
>>  
>> +if SSE2_SUPPORTED
>> +ARCH_LIBS += libmesa_sse2.la
>> +endif
>> +
>>  MESA_ASM_FILES_FOR_ARCH =
>>  
>>  if HAVE_X86_ASM
>> @@ -155,6 +159,10 @@ libmesa_sse41_la_SOURCES = \
>>  	main/sse_minmax.c
>>  libmesa_sse41_la_CFLAGS = $(AM_CFLAGS) -msse4.1
>>  
>> +libmesa_sse2_la_SOURCES = \
>> +	main/sse2_clamping.c
>> +libmesa_sse2_la_CFLAGS = $(AM_CFLAGS) -msse2
>> +
>>  pkgconfigdir = $(libdir)/pkgconfig
>>  pkgconfig_DATA = gl.pc
>>  
>> diff --git a/src/mesa/main/sse2_clamping.c b/src/mesa/main/sse2_clamping.c
>> new file mode 100644
>> index 0000000..66c7dc7
>> --- /dev/null
>> +++ b/src/mesa/main/sse2_clamping.c
>> @@ -0,0 +1,138 @@
>> +/*
>> + * Copyright © 2014 Intel Corporation
>> + *
>> + * Permission is hereby granted, free of charge, to any person obtaining a
>> + * copy of this software and associated documentation files (the "Software"),
>> + * to deal in the Software without restriction, including without limitation
>> + * the rights to use, copy, modify, merge, publish, distribute, sublicense,
>> + * and/or sell copies of the Software, and to permit persons to whom the
>> + * Software is furnished to do so, subject to the following conditions:
>> + *
>> + * The above copyright notice and this permission notice (including the next
>> + * paragraph) shall be included in all copies or substantial portions of the
>> + * Software.
>> + *
>> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
>> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
>> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
>> + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
>> + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
>> + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
>> + * IN THE SOFTWARE.
>> + *
>> + * Authors:
>> + *    Juha-Pekka Heikkila <juhapekka.heikkila at gmail.com>
>> + *
>> + */
>> +
>> +#ifdef __SSE2__
>> +#include "main/macros.h"
>> +#include "main/sse2_clamping.h"
>> +#include <emmintrin.h>
>> +
>> +/**
>> + * Clamp four float values to [min,max]
>> + */
>> +static inline void
>> +_mesa_clamp_float_rgba(GLfloat src[4], GLfloat result[4], const float min,
>> +                       const float max)
>> +{
>> +   __m128  operand, minval, maxval;
>> +
>> +   operand = _mm_loadu_ps(src);
>> +   minval = _mm_set1_ps(min);
>> +   maxval = _mm_set1_ps(max);
>> +   operand = _mm_max_ps(operand, minval);
>> +   operand = _mm_min_ps(operand, maxval);
>> +   _mm_storeu_ps(result, operand);
>> +}
>> +
>> +
>> +/* Clamp n amount float rgba pixels to [min,max] using SSE2
>> + */
>> +__attribute__((optimize("unroll-loops")))
>> +void
>> +_mesa_streaming_clamp_float_rgba(const GLuint n, GLfloat rgba_src[][4],
>> +                                 GLfloat rgba_dst[][4], const GLfloat min,
>> +                                 const GLfloat max)
>> +{
>> +   int      c, prefetch_c;
>> +   float*   worker = &rgba_src[0][0];
>> +   __m128   operand[2], minval, maxval;
>> +
>> +   _mm_prefetch((char*) (((unsigned long)worker)|0x1f) + 65, _MM_HINT_T0);
>                                                    ^^^^    ^^^
> 
> Hi,
> 
> May I ask why precisely these numbers?

0x1f, as you note below, is a typo; it should be 0x0f. 65 is the cache line
length (64 bytes) plus one: the extra one evens out the |0x1f operation,
which leaves the address one byte short of the next boundary.

> 
>> +
>> +   minval = _mm_set1_ps(min);
>> +   maxval = _mm_set1_ps(max);
>> +
>> +   for (c = n*4; c > 0 && (((unsigned long)worker)&0x1f) != 0; c--, worker++) {
>                                                     ^^^^^
> 
> I guess that this is for alignment, but you only need to align to a 16
> bytes boundary, not 32. Or maybe I am missing something obvious.
> 

You are correct, 0x1f is a typo; it should be 0x0f.

>> +      operand[0] = _mm_load_ss(worker);
>> +      operand[0] = _mm_max_ss(operand[0], minval);
>> +      operand[0] = _mm_min_ss(operand[0], maxval);
>> +      _mm_store_ss(worker, operand[0]);
>> +   }
>> +
>> +   while (c >= 8) {
>> +      _mm_prefetch((char*) worker + 64, _MM_HINT_T0);
>                                       ^^^
>> +
>> +      for (prefetch_c = 64/8; prefetch_c > 0 && c >= 8; prefetch_c--, c-=8,
>                           ^^^^
> 
> May I also ask why these numbers?
> 

64 is the cache line length in bytes; 8 means this loop handles 8 floats in
one go: operand[0] gets 4 floats and the same goes for operand[1]. I found
that interleaving this way gives more performance; adding more operands did
not give any further gain, hence the 2 operands to work with, which turns
into the number 8. Now that you ask, I think 64 has to be divided by 32
and not 8 (8 elements, each 4 bytes -> 32 bytes per iteration).

The idea here is that we issue a prefetch for the next cache line before we
start working on the current line, so that on arriving at the next loop
iteration we already have the new cache line ready.

> 
>> +           worker += 8) {
>> +
>> +         operand[0] = _mm_load_ps(worker);
>> +         operand[1] = _mm_load_ps(worker+4);
>> +         operand[0] = _mm_max_ps(operand[0], minval);
>> +         operand[1] = _mm_max_ps(operand[1], minval);
>> +         operand[0] = _mm_min_ps(operand[0], maxval);
>> +         operand[1] = _mm_min_ps(operand[1], maxval);
>> +
>> +         _mm_store_ps(worker, operand[0]);
>> +         _mm_store_ps(worker+4, operand[1]);
>> +      }
>> +   }
>> +
>> +   for (; c > 0; c--, worker++) {
>> +      operand[0] = _mm_load_ss(worker);
>> +      operand[0] = _mm_max_ss(operand[0], minval);
>> +      operand[0] = _mm_min_ss(operand[0], maxval);
>> +      _mm_store_ss(worker, operand[0]);
>> +   }
>> +}
>> +
>> +
>> +/* Clamp n amount float rgba pixels to [min,max] using SSE2 and apply
>> + * scaling and mapping to components.
>> + *
>> + * this replace handling of [RGBA] channels:
>> + * rgba_temp[RCOMP] = CLAMP(rgba[i][RCOMP], 0.0F, 1.0F);
>> + * rgba[i][RCOMP] = rMap[F_TO_I(rgba_temp[RCOMP] * scale[RCOMP])];
>> + */
>> +void
>> +_mesa_clamp_float_rgba_scale_and_map(const GLuint n, GLfloat rgba_src[][4],
>> +                                     GLfloat rgba_dst[][4], const GLfloat min,
>> +                                     const GLfloat max,
>> +                                     const GLfloat scale[4],
>> +                                     const GLfloat* rMap, const GLfloat* gMap,
>> +                                     const GLfloat* bMap, const GLfloat* aMap)
>> +{
>> +   int i;
>> +   GLfloat __attribute__((aligned(16))) temp[4];
>> +   __m128  *operand = (__m128*) &temp, multiplier, mmove;
>> +   __m128i truncated_integers;
>> +
>> +   const unsigned int* map_p = (const unsigned int*) &truncated_integers;
>> +
>> +   multiplier = _mm_loadu_ps(scale);
>> +
>> +   for(i = 0; i < n; i++) {
>> +      _mesa_clamp_float_rgba(rgba_src[i], temp, min, max);
>> +
>> +      *operand = _mm_mul_ps(multiplier, *operand);
>> +      truncated_integers = _mm_cvttps_epi32(*operand);
>> +      mmove = _mm_set_ps(aMap[map_p[ACOMP]], bMap[map_p[BCOMP]],
>> +                         gMap[map_p[GCOMP]], rMap[map_p[RCOMP]] );
>> +
>> +      _mm_storeu_ps(rgba_dst[i], mmove);
>> +   }
>> +}
>> +
>> +#endif /* __SSE2__ */
>> diff --git a/src/mesa/main/sse2_clamping.h b/src/mesa/main/sse2_clamping.h
>> new file mode 100644
>> index 0000000..688fab7
>> --- /dev/null
>> +++ b/src/mesa/main/sse2_clamping.h
>> @@ -0,0 +1,49 @@
>> +/*
>> + * Copyright © 2014 Intel Corporation
>> + *
>> + * Permission is hereby granted, free of charge, to any person obtaining a
>> + * copy of this software and associated documentation files (the "Software"),
>> + * to deal in the Software without restriction, including without limitation
>> + * the rights to use, copy, modify, merge, publish, distribute, sublicense,
>> + * and/or sell copies of the Software, and to permit persons to whom the
>> + * Software is furnished to do so, subject to the following conditions:
>> + *
>> + * The above copyright notice and this permission notice (including the next
>> + * paragraph) shall be included in all copies or substantial portions of the
>> + * Software.
>> + *
>> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
>> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
>> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
>> + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
>> + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
>> + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
>> + * IN THE SOFTWARE.
>> + *
>> + * Authors:
>> + *    Juha-Pekka Heikkila <juhapekka.heikkila at gmail.com>
>> + *
>> + */
>> +
>> +#ifdef __SSE2__
>> +
>> +/* Clamp n amount float rgba pixels to [min,max] using SSE2
>> + */
>> +void
>> +_mesa_streaming_clamp_float_rgba(const GLuint n, GLfloat rgba_src[][4],
>> +                                 GLfloat rgba_dst[][4], const GLfloat min,
>> +                                 const GLfloat max);
>> +
>> +
>> +/* Clamp n amount float rgba pixels to [min,max] using SSE2 and apply
>> + * scaling and mapping to components.
>> + */
>> +void
>> +_mesa_clamp_float_rgba_scale_and_map(const GLuint n, GLfloat rgba_src[][4],
>> +                                     GLfloat rgba_dst[][4], const GLfloat min,
>> +                                     const GLfloat max,
>> +                                     const GLfloat scale[4],
>> +                                     const GLfloat* rMap, const GLfloat* gMap,
>> +                                     const GLfloat* bMap, const GLfloat* aMap);
>> +
>> +#endif /* __SSE2__ */
> 
>