[Mesa-dev] [PATCH 2/3][RFC v2] mesa/main/x86: Add sse2 streaming clamping

Roland Scheidegger sroland at vmware.com
Wed Nov 5 06:24:52 PST 2014


Am 05.11.2014 um 10:13 schrieb Juha-Pekka Heikkila:
> On 04.11.2014 23:24, Roland Scheidegger wrote:
>> Am 04.11.2014 um 13:05 schrieb Juha-Pekka Heikkila:
>>> Signed-off-by: Juha-Pekka Heikkila <juhapekka.heikkila at gmail.com>
>>> ---
>>>  src/mesa/Makefile.am              |   8 +++
>>>  src/mesa/main/x86/sse2_clamping.c | 103 ++++++++++++++++++++++++++++++++++++++
>>>  src/mesa/main/x86/sse2_clamping.h |  49 ++++++++++++++++++
>>>  3 files changed, 160 insertions(+)
>>>  create mode 100644 src/mesa/main/x86/sse2_clamping.c
>>>  create mode 100644 src/mesa/main/x86/sse2_clamping.h
>>>
>>> diff --git a/src/mesa/Makefile.am b/src/mesa/Makefile.am
>>> index e71bccb..5d3c6f5 100644
>>> --- a/src/mesa/Makefile.am
>>> +++ b/src/mesa/Makefile.am
>>> @@ -111,6 +111,10 @@ if SSE41_SUPPORTED
>>>  ARCH_LIBS += libmesa_sse41.la
>>>  endif
>>>  
>>> +if SSE2_SUPPORTED
>>> +ARCH_LIBS += libmesa_sse2.la
>>> +endif
>>> +
>>>  MESA_ASM_FILES_FOR_ARCH =
>>>  
>>>  if HAVE_X86_ASM
>>> @@ -154,6 +158,10 @@ libmesa_sse41_la_SOURCES = \
>>>  	main/streaming-load-memcpy.c
>>>  libmesa_sse41_la_CFLAGS = $(AM_CFLAGS) -msse4.1
>>>  
>>> +libmesa_sse2_la_SOURCES = \
>>> +	main/x86/sse2_clamping.c
>>> +libmesa_sse2_la_CFLAGS = $(AM_CFLAGS) -msse2
>>> +
>>>  pkgconfigdir = $(libdir)/pkgconfig
>>>  pkgconfig_DATA = gl.pc
>>>  
>>> diff --git a/src/mesa/main/x86/sse2_clamping.c b/src/mesa/main/x86/sse2_clamping.c
>>> new file mode 100644
>>> index 0000000..7df1c85
>>> --- /dev/null
>>> +++ b/src/mesa/main/x86/sse2_clamping.c
>>> @@ -0,0 +1,103 @@
>>> +/*
>>> + * Copyright © 2014 Intel Corporation
>>> + *
>>> + * Permission is hereby granted, free of charge, to any person obtaining a
>>> + * copy of this software and associated documentation files (the "Software"),
>>> + * to deal in the Software without restriction, including without limitation
>>> + * the rights to use, copy, modify, merge, publish, distribute, sublicense,
>>> + * and/or sell copies of the Software, and to permit persons to whom the
>>> + * Software is furnished to do so, subject to the following conditions:
>>> + *
>>> + * The above copyright notice and this permission notice (including the next
>>> + * paragraph) shall be included in all copies or substantial portions of the
>>> + * Software.
>>> + *
>>> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
>>> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
>>> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
>>> + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
>>> + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
>>> + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
>>> + * IN THE SOFTWARE.
>>> + *
>>> + * Authors:
>>> + *    Juha-Pekka Heikkila <juhapekka.heikkila at gmail.com>
>>> + *
>>> + */
>>> +
>>> +#ifdef __SSE2__
>>> +#include "main/macros.h"
>>> +#include "main/x86/sse2_clamping.h"
>>> +#include <emmintrin.h>
>>> +
>>> +/**
>>> + * Clamp four float values to [min,max]
>>> + */
>>> +static inline void
>>> +_mesa_clamp_float_rgba(GLfloat src[4], GLfloat result[4], const float min,
>>> +                       const float max)
>>> +{
>>> +   __m128  operand, minval, maxval;
>>> +
>>> +   operand = _mm_loadu_ps(src);
>>> +   minval = _mm_set1_ps(min);
>>> +   maxval = _mm_set1_ps(max);
>>> +   operand = _mm_max_ps(operand, minval);
>>> +   operand = _mm_min_ps(operand, maxval);
>>> +   _mm_storeu_ps(result, operand);
>>> +}
>>> +
>>> +
>>> +/* Clamp n amount float rgba pixels to [min,max] using SSE2
>>> + */
>>> +void
>>> +_mesa_streaming_clamp_float_rgba(const GLuint n, GLfloat rgba_src[][4],
>>> +                                 GLfloat rgba_dst[][4], const GLfloat min,
>>> +                                 const GLfloat max)
>>> +{
>>> +   int i;
>>> +
>>> +   for (i = 0; i < n; i++) {
>>> +      _mesa_clamp_float_rgba(rgba_src[i], rgba_dst[i], min, max);
>>> +   }
>>> +}
>>> +
>>> +
>>> +/* Clamp n amount float rgba pixels to [min,max] using SSE2 and apply
>>> + * scaling and mapping to components.
>>> + *
>>> + * this replace handling of [RGBA] channels:
>>> + * rgba_temp[RCOMP] = CLAMP(rgba[i][RCOMP], 0.0F, 1.0F);
>>> + * rgba[i][RCOMP] = rMap[F_TO_I(rgba_temp[RCOMP] * scale[RCOMP])];
>>> + */
>>> +void
>>> +_mesa_clamp_float_rgba_scale_and_map(const GLuint n, GLfloat rgba_src[][4],
>>> +                                     GLfloat rgba_dst[][4], const GLfloat min,
>>> +                                     const GLfloat max,
>>> +                                     const GLfloat scale[4],
>>> +                                     const GLfloat* rMap, const GLfloat* gMap,
>>> +                                     const GLfloat* bMap, const GLfloat* aMap)
>>> +{
>>> +   int i;
>>> +   GLfloat __attribute__((aligned(16))) temp[4];
>>> +   __m128  *operand = (__m128*) &temp, multiplier, mmove;
>>> +   __m128i truncated_integers;
>>> +
>>> +   const unsigned int* map_p = (const unsigned int*) &truncated_integers;
>>> +
>>> +   multiplier = _mm_loadu_ps(scale);
>>> +
>>> +   for(i = 0; i < n; i++) {
>>> +      _mesa_clamp_float_rgba(rgba_src[i], temp, min, max);
>>> +
>>> +      *operand = _mm_mul_ps(multiplier, *operand);
>>> +      truncated_integers = _mm_cvttps_epi32(*operand);
>>> +      mmove = _mm_set_ps(aMap[map_p[ACOMP]], bMap[map_p[BCOMP]],
>>> +                         gMap[map_p[GCOMP]], rMap[map_p[RCOMP]] );
>>> +
>>> +      _mm_storeu_ps(rgba_dst[i], mmove);
>> The sse2 code at the end looks counterproductive to me. Not sure what
>> gcc will generate but I'd suspect it involves some simd->int domain
>> transition for the table lookups, plus another int->simd transition to
>> get the values back into simd domain (alternatively it might use
>> stores/load here) just so you can store them again...
>> It would probably be better to just store the values directly after the
>> table lookups.
> 
> I did at first try to have this tail piece of code as simple C code as
> possible, but what gcc did was not so nice. This piece is compiled with
> the sse2 flag, and what gcc thought of this was to start moving elements
> around one by one using movss. "mmove = _mm_set_ps.." looks quite bad,
> but the code it creates is not so crappy in the end, at least when
> compared to what gcc originally did.
I guess it would work better if you'd use an explicit simd store instead
of casting the __m128i to ints in this case. But it doesn't really
matter in the end.

Roland


> 
>> But in any case, I'm actually beginning to suspect no one really cares
>> about performance anyway for that path (who the hell uses these
>> scale/map features?) so whatever works...
> 
> I agree, this function is of low interest. Clamping is the issue here
> which deserves to be fixed, to make the _mesa_streaming_clamp_float_rgba
> function reasonably fast, as it is used often.
> 
> There is an additional nuance to this particular file which I mentioned in
> the cover letter. Note where this file is stored. There was discussion
> in the previous RFC patch about where to store all these optimizations
> targeting a particular architecture; this is my suggestion.
> 
> /Juha-Pekka
> 
>>
>>
>>> +   }
>>> +}
>>> +
>>> +
>>> +#endif /* __SSE2__ */
>>> diff --git a/src/mesa/main/x86/sse2_clamping.h b/src/mesa/main/x86/sse2_clamping.h
>>> new file mode 100644
>>> index 0000000..688fab7
>>> --- /dev/null
>>> +++ b/src/mesa/main/x86/sse2_clamping.h
>>> @@ -0,0 +1,49 @@
>>> +/*
>>> + * Copyright © 2014 Intel Corporation
>>> + *
>>> + * Permission is hereby granted, free of charge, to any person obtaining a
>>> + * copy of this software and associated documentation files (the "Software"),
>>> + * to deal in the Software without restriction, including without limitation
>>> + * the rights to use, copy, modify, merge, publish, distribute, sublicense,
>>> + * and/or sell copies of the Software, and to permit persons to whom the
>>> + * Software is furnished to do so, subject to the following conditions:
>>> + *
>>> + * The above copyright notice and this permission notice (including the next
>>> + * paragraph) shall be included in all copies or substantial portions of the
>>> + * Software.
>>> + *
>>> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
>>> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
>>> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
>>> + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
>>> + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
>>> + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
>>> + * IN THE SOFTWARE.
>>> + *
>>> + * Authors:
>>> + *    Juha-Pekka Heikkila <juhapekka.heikkila at gmail.com>
>>> + *
>>> + */
>>> +
>>> +#ifdef __SSE2__
>>> +
>>> +/* Clamp n amount float rgba pixels to [min,max] using SSE2
>>> + */
>>> +void
>>> +_mesa_streaming_clamp_float_rgba(const GLuint n, GLfloat rgba_src[][4],
>>> +                                 GLfloat rgba_dst[][4], const GLfloat min,
>>> +                                 const GLfloat max);
>>> +
>>> +
>>> +/* Clamp n amount float rgba pixels to [min,max] using SSE2 and apply
>>> + * scaling and mapping to components.
>>> + */
>>> +void
>>> +_mesa_clamp_float_rgba_scale_and_map(const GLuint n, GLfloat rgba_src[][4],
>>> +                                     GLfloat rgba_dst[][4], const GLfloat min,
>>> +                                     const GLfloat max,
>>> +                                     const GLfloat scale[4],
>>> +                                     const GLfloat* rMap, const GLfloat* gMap,
>>> +                                     const GLfloat* bMap, const GLfloat* aMap);
>>> +
>>> +#endif /* __SSE2__ */
>>>
>>
> 



More information about the mesa-dev mailing list