[Mesa-dev] [PATCH V2] mesa: add SSE optimisation for glDrawElements

Roland Scheidegger sroland at vmware.com
Fri Oct 24 16:00:55 PDT 2014


Am 24.10.2014 um 23:06 schrieb Ian Romanick:
> On 10/24/2014 05:47 AM, Timothy Arceri wrote:
>> Makes use of SSE to speed up compute of min and max elements
>>
>> Callgrind cpu usage results from pts benchmarks:
>>
>> Openarena 0.8.8: 3.67% -> 1.03%
>> UrbanTerror: 2.36% -> 0.81%
>>
>> Signed-off-by: Timothy Arceri <t_arceri at yahoo.com.au>
>> ---
>>  src/mesa/Android.libmesa_dricore.mk |  3 +-
>>  src/mesa/Makefile.am                |  3 +-
>>  src/mesa/Makefile.sources           |  1 +
>>  src/mesa/main/sse_minmax.c          | 81 +++++++++++++++++++++++++++++++++++++
>>  src/mesa/main/sse_minmax.h          | 30 ++++++++++++++
>>  src/mesa/vbo/vbo_exec_array.c       | 13 ++++--
>>  6 files changed, 126 insertions(+), 5 deletions(-)
>>  create mode 100644 src/mesa/main/sse_minmax.c
>>  create mode 100644 src/mesa/main/sse_minmax.h
>>
>> This version includes all the suggestions from Brian and Matt, thanks for
>> the review guys.
>>
>> I haven't been able to do Matt's suggestion and compare this to what OpenMP
>> would generate as I only have one machine that supports SSE4.1 with Fedora 20 and
>> I don't want to have to upgrade to Fedora 21 alpha (gcc 4.9) just to test this
>> (although I did consider it). If people are happy with this code I will revisit
>> OpenMP for Mesa 10.5 and will look at using OpenMP for the short and byte support too.
>>
>> diff --git a/src/mesa/Android.libmesa_dricore.mk b/src/mesa/Android.libmesa_dricore.mk
>> index 1e6d948..52d626f 100644
>> --- a/src/mesa/Android.libmesa_dricore.mk
>> +++ b/src/mesa/Android.libmesa_dricore.mk
>> @@ -51,7 +51,8 @@ endif # MESA_ENABLE_ASM
>>  
>>  ifeq ($(ARCH_X86_HAVE_SSE4_1),true)
>>  LOCAL_SRC_FILES += \
>> -	$(SRCDIR)main/streaming-load-memcpy.c
>> +	$(SRCDIR)main/streaming-load-memcpy.c \
>> +	$(SRCDIR)main/sse_minmax.c
>>  LOCAL_CFLAGS := -msse4.1
>>  endif
>>  
>> diff --git a/src/mesa/Makefile.am b/src/mesa/Makefile.am
>> index e71bccb..932db4f 100644
>> --- a/src/mesa/Makefile.am
>> +++ b/src/mesa/Makefile.am
>> @@ -151,7 +151,8 @@ libmesagallium_la_LIBADD = \
>>  	$(ARCH_LIBS)
>>  
>>  libmesa_sse41_la_SOURCES = \
>> -	main/streaming-load-memcpy.c
>> +	main/streaming-load-memcpy.c \
>> +	main/sse_minmax.c
>>  libmesa_sse41_la_CFLAGS = $(AM_CFLAGS) -msse4.1
>>  
>>  pkgconfigdir = $(libdir)/pkgconfig
>> diff --git a/src/mesa/Makefile.sources b/src/mesa/Makefile.sources
>> index 4755018..dd10574 100644
>> --- a/src/mesa/Makefile.sources
>> +++ b/src/mesa/Makefile.sources
>> @@ -93,6 +93,7 @@ MAIN_FILES = \
>>  	$(SRCDIR)main/shaderobj.c \
>>  	$(SRCDIR)main/shader_query.cpp \
>>  	$(SRCDIR)main/shared.c \
>> +	$(SRCDIR)main/sse_minmax.c \
>>  	$(SRCDIR)main/state.c \
>>  	$(SRCDIR)main/stencil.c \
>>  	$(SRCDIR)main/syncobj.c \
>> diff --git a/src/mesa/main/sse_minmax.c b/src/mesa/main/sse_minmax.c
>> new file mode 100644
>> index 0000000..577f44e
>> --- /dev/null
>> +++ b/src/mesa/main/sse_minmax.c
>> @@ -0,0 +1,81 @@
>> +/*
>> + * Copyright © 2014 Timothy Arceri
>> + *
>> + * Permission is hereby granted, free of charge, to any person obtaining a
>> + * copy of this software and associated documentation files (the "Software"),
>> + * to deal in the Software without restriction, including without limitation
>> + * the rights to use, copy, modify, merge, publish, distribute, sublicense,
>> + * and/or sell copies of the Software, and to permit persons to whom the
>> + * Software is furnished to do so, subject to the following conditions:
>> + *
>> + * The above copyright notice and this permission notice (including the next
>> + * paragraph) shall be included in all copies or substantial portions of the
>> + * Software.
>> + *
>> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
>> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
>> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
>> + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
>> + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
>> + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
>> + * IN THE SOFTWARE.
>> + *
>> + * Author:
>> + *    Timothy Arceri <t_arceri at yahoo.com.au>
>> + *
>> + */
>> +
>> +#ifdef __SSE4_1__
>> +#include "main/glheader.h"
>> +#include "main/sse_minmax.h"
>> +#include <smmintrin.h>
>> +
>> +void
>> +_mesa_uint_array_min_max(const unsigned *ui_indices, unsigned *min_index,
>> +                         unsigned *max_index, const unsigned count)
>> +{
>> +   unsigned i = 0;
>> +   unsigned max_ui = 0;
>> +   unsigned min_ui = ~0U;
>> +
>> +   if (count >= 4) {
>> +      unsigned max_arr[4] __attribute__ ((aligned (16)));
>> +      unsigned min_arr[4] __attribute__ ((aligned (16)));
>> +      unsigned vec_count;
>> +      __m128i max_ui4 = _mm_setzero_si128();
>> +      __m128i min_ui4 = _mm_set1_epi32(~0U);
>> +      __m128i ui_indices4;
>> +      __m128i *ui_indices_ptr;
>> +
>> +      vec_count = count & ~0x3;
>> +      ui_indices_ptr = (__m128i*)ui_indices;
>> +      for (i = 0; i < vec_count / 4; i++) {
>> +         ui_indices4 = _mm_loadu_si128(&ui_indices_ptr[i]);
> 
> How does this fare with unaligned data?  My recollection is that
> _mm_loadu_si128 could be quite a bit slower than _mm_load_si128.  It
> might be worth handling the first few values without SSE until the
> pointer is aligned.
> 
> Or my memory might be wrong.

IIRC, even for unaligned memory movdqu doesn't have much of a
performance penalty on newer Core CPUs (Sandy Bridge and newer?) - only
some when actually crossing cache line boundaries, and even then it is not
too severe (though the optimization guides don't say how large it is).
There are probably lots of CPUs out there, though, where the penalty could
be quite large, so using aligned loads might be a good idea.

Roland

>> +         max_ui4 = _mm_max_epu32(ui_indices4, max_ui4);
>> +         min_ui4 = _mm_min_epu32(ui_indices4, min_ui4);
>> +      }
>> +
>> +      _mm_store_si128((__m128i*)max_arr, max_ui4);
>> +      _mm_store_si128((__m128i*)min_arr, min_ui4);
>> +
>> +      for (i = 0; i < 4; i++) {
>> +         if (max_arr[i] > max_ui)
>> +            max_ui = max_arr[i];
>> +         if (min_arr[i] < min_ui)
>> +            min_ui = min_arr[i];
>> +      }
>> +      i = vec_count;
>> +   }
>> +
>> +   for (; i < count; i++) {
>> +      if (ui_indices[i] > max_ui)
>> +         max_ui = ui_indices[i];
>> +      if (ui_indices[i] < min_ui)
>> +         min_ui = ui_indices[i];
>> +   }
>> +
>> +   *min_index = min_ui;
>> +   *max_index = max_ui;
>> +}
>> +
>> +#endif
>> diff --git a/src/mesa/main/sse_minmax.h b/src/mesa/main/sse_minmax.h
>> new file mode 100644
>> index 0000000..953c4e9
>> --- /dev/null
>> +++ b/src/mesa/main/sse_minmax.h
>> @@ -0,0 +1,30 @@
>> +/*
>> + * Copyright © 2014 Timothy Arceri
>> + *
>> + * Permission is hereby granted, free of charge, to any person obtaining a
>> + * copy of this software and associated documentation files (the "Software"),
>> + * to deal in the Software without restriction, including without limitation
>> + * the rights to use, copy, modify, merge, publish, distribute, sublicense,
>> + * and/or sell copies of the Software, and to permit persons to whom the
>> + * Software is furnished to do so, subject to the following conditions:
>> + *
>> + * The above copyright notice and this permission notice (including the next
>> + * paragraph) shall be included in all copies or substantial portions of the
>> + * Software.
>> + *
>> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
>> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
>> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
>> + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
>> + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
>> + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
>> + * IN THE SOFTWARE.
>> + *
>> + * Author:
>> + *    Timothy Arceri <t_arceri at yahoo.com.au>
>> + *
>> + */
>> +
>> +void
>> +_mesa_uint_array_min_max(const unsigned *ui_indices, unsigned *min_index,
>> +                         unsigned *max_index, const unsigned count);
>> diff --git a/src/mesa/vbo/vbo_exec_array.c b/src/mesa/vbo/vbo_exec_array.c
>> index 045dbb5..f857da4 100644
>> --- a/src/mesa/vbo/vbo_exec_array.c
>> +++ b/src/mesa/vbo/vbo_exec_array.c
>> @@ -36,6 +36,8 @@
>>  #include "main/enums.h"
>>  #include "main/macros.h"
>>  #include "main/transformfeedback.h"
>> +#include "main/sse_minmax.h"
>> +#include "x86/common_x86_asm.h"
>>  
>>  #include "vbo_context.h"
>>  
>> @@ -119,9 +121,14 @@ vbo_get_minmax_index(struct gl_context *ctx,
>>           }
>>        }
>>        else {
>> -         for (i = 0; i < count; i++) {
>> -            if (ui_indices[i] > max_ui) max_ui = ui_indices[i];
>> -            if (ui_indices[i] < min_ui) min_ui = ui_indices[i];
>> +         if (cpu_has_sse4_1) {
>> +            _mesa_uint_array_min_max(ui_indices, &min_ui, &max_ui, count);
>> +         }
>> +         else {
>> +            for (i = 0; i < count; i++) {
>> +               if (ui_indices[i] > max_ui) max_ui = ui_indices[i];
>> +               if (ui_indices[i] < min_ui) min_ui = ui_indices[i];
>> +            }
>>           }
>>        }
>>        *min_index = min_ui;
>>
>>
>>
>> _______________________________________________
>> mesa-dev mailing list
>> mesa-dev at lists.freedesktop.org
>> https://urldefense.proofpoint.com/v1/url?u=http://lists.freedesktop.org/mailman/listinfo/mesa-dev&k=oIvRg1%2BdGAgOoM1BIlLLqw%3D%3D%0A&r=F4msKE2WxRzA%2BwN%2B25muztFm5TSPwE8HKJfWfR2NgfY%3D%0A&m=LwBPXKJD2HNZ%2Be%2BmFAxLkmbWJ%2B7B0CRueVIXxSuOmv0%3D%0A&s=34e091c64218e9636106b996236961e26c46480a5064ca945475272cdaf818c8
>>
> 
> _______________________________________________
> mesa-dev mailing list
> mesa-dev at lists.freedesktop.org
> https://urldefense.proofpoint.com/v1/url?u=http://lists.freedesktop.org/mailman/listinfo/mesa-dev&k=oIvRg1%2BdGAgOoM1BIlLLqw%3D%3D%0A&r=F4msKE2WxRzA%2BwN%2B25muztFm5TSPwE8HKJfWfR2NgfY%3D%0A&m=LwBPXKJD2HNZ%2Be%2BmFAxLkmbWJ%2B7B0CRueVIXxSuOmv0%3D%0A&s=34e091c64218e9636106b996236961e26c46480a5064ca945475272cdaf818c8
> 



More information about the mesa-dev mailing list