[Mesa-dev] [PATCH RFC] mesa: add SSE optimisation for glDrawElements
Timothy Arceri
t_arceri at yahoo.com.au
Wed Oct 22 21:02:00 PDT 2014
Makes use of SSE to speed up compute of min and max elements
Callgrind cpu usage results from pts benchmarks:
Openarena 0.8.8: 3.67% -> 1.03%
UrbanTerror: 2.36% -> 0.81%
Signed-off-by: Timothy Arceri <t_arceri at yahoo.com.au>
---
src/mesa/Makefile.am | 3 +-
src/mesa/main/sse_minmax.c | 75 +++++++++++++++++++++++++++++++++++++++++++
src/mesa/main/sse_minmax.h | 29 +++++++++++++++++
src/mesa/vbo/vbo_exec_array.c | 8 +++++
4 files changed, 114 insertions(+), 1 deletion(-)
create mode 100644 src/mesa/main/sse_minmax.c
create mode 100644 src/mesa/main/sse_minmax.h
I almost wasn't going to bother sending this out since it uses SSE4.1
and its recommended to use glDrawRangeElements anyway. But since these games
are still ofter used for benchmarking I thought I'd see if anyone is
interested in this. I only optimised GL_UNSIGNED_INT as that was the
only place these games were hitting but I guess it wouldn't hurt
to optimse the other cases too.
I think it would probably make sense too to just combine
streaming-load-memcpy.c and sse_minmax.c into a single file
something like sse_opt.c for example.
As far a frame rates go I couldn't see any concrete improments
on my Ivybridge machine. Maybe it would help more on a high end radeon
card where cpu apparently is more of a concern??
Finally its seems to run fine but its only the second time I've tried
these type of optimisations so let me know if there are any obvious
mistakes or improvements.
diff --git a/src/mesa/Makefile.am b/src/mesa/Makefile.am
index e71bccb..932db4f 100644
--- a/src/mesa/Makefile.am
+++ b/src/mesa/Makefile.am
@@ -151,7 +151,8 @@ libmesagallium_la_LIBADD = \
$(ARCH_LIBS)
libmesa_sse41_la_SOURCES = \
- main/streaming-load-memcpy.c
+ main/streaming-load-memcpy.c \
+ main/sse_minmax.c
libmesa_sse41_la_CFLAGS = $(AM_CFLAGS) -msse4.1
pkgconfigdir = $(libdir)/pkgconfig
diff --git a/src/mesa/main/sse_minmax.c b/src/mesa/main/sse_minmax.c
new file mode 100644
index 0000000..1625407
--- /dev/null
+++ b/src/mesa/main/sse_minmax.c
@@ -0,0 +1,75 @@
+/*
+ * Copyright © 2014 Timothy Arceri
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * Author:
+ * Timothy Arceri <t_arceri at yahoo.com.au>
+ *
+ */
+
+#ifdef __SSE4_1__
+#include "main/glheader.h"
+#include "main/sse_minmax.h"
+#include <smmintrin.h>
+
+void
+sse_minmax(const GLuint *ui_indices, GLuint *min_index, GLuint *max_index, const GLuint count)
+{
+ GLuint i = 0;
+ GLuint max_ui = 0;
+ GLuint min_ui = ~0U;
+ GLuint max_arr[4] = {0};
+ GLuint min_arr[4] = {0};
+ GLuint vec_count;
+ __m128i max_ui4 = _mm_setzero_si128();
+ __m128i min_ui4 = _mm_set1_epi32(~0U);
+ __m128i ui_indices4;
+ __m128i *ui_indices_ptr;
+
+ if (count >= 4) {
+ vec_count = count - (count % 4);
+ ui_indices_ptr = (__m128i*)ui_indices;
+ for (i = 0; i < vec_count/4; i++) {
+ ui_indices4 = _mm_loadu_si128(&ui_indices_ptr[i]);
+ max_ui4 = _mm_max_epu32(ui_indices4, max_ui4);
+ min_ui4 = _mm_min_epu32(ui_indices4, min_ui4);
+ }
+
+ _mm_store_ps((float*)max_arr, _mm_castsi128_ps(max_ui4));
+ _mm_store_ps((float*)min_arr, _mm_castsi128_ps(min_ui4));
+
+ for (i = 0; i < 4; i++) {
+ if (max_arr[i] > max_ui) max_ui = max_arr[i];
+ if (min_arr[i] < min_ui) min_ui = min_arr[i];
+ }
+ i = vec_count;
+ }
+
+ for (; i < count; i++) {
+ if (ui_indices[i] > max_ui) max_ui = ui_indices[i];
+ if (ui_indices[i] < min_ui) min_ui = ui_indices[i];
+ }
+
+ *min_index = min_ui;
+ *max_index = max_ui;
+}
+
+#endif
diff --git a/src/mesa/main/sse_minmax.h b/src/mesa/main/sse_minmax.h
new file mode 100644
index 0000000..d14db26
--- /dev/null
+++ b/src/mesa/main/sse_minmax.h
@@ -0,0 +1,29 @@
+/*
+ * Copyright © 2014 Timothy Arceri
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * Authors:
+ * Timothy Arceri <t_arceri at yahoo.com.au>
+ *
+ */
+
+void
+sse_minmax(const GLuint *ui_indices, GLuint *min_index, GLuint *max_index, const GLuint count);
diff --git a/src/mesa/vbo/vbo_exec_array.c b/src/mesa/vbo/vbo_exec_array.c
index 045dbb5..0399348 100644
--- a/src/mesa/vbo/vbo_exec_array.c
+++ b/src/mesa/vbo/vbo_exec_array.c
@@ -36,6 +36,9 @@
#include "main/enums.h"
#include "main/macros.h"
#include "main/transformfeedback.h"
+#ifdef __SSE4_1__
+#include "main/sse_minmax.h"
+#endif
#include "vbo_context.h"
@@ -110,6 +113,7 @@ vbo_get_minmax_index(struct gl_context *ctx,
const GLuint *ui_indices = (const GLuint *)indices;
GLuint max_ui = 0;
GLuint min_ui = ~0U;
+
if (restart) {
for (i = 0; i < count; i++) {
if (ui_indices[i] != restartIndex) {
@@ -119,10 +123,14 @@ vbo_get_minmax_index(struct gl_context *ctx,
}
}
else {
+#ifdef __SSE4_1__
+ sse_minmax(ui_indices, &min_ui, &max_ui, count);
+#else
for (i = 0; i < count; i++) {
if (ui_indices[i] > max_ui) max_ui = ui_indices[i];
if (ui_indices[i] < min_ui) min_ui = ui_indices[i];
}
+#endif
}
*min_index = min_ui;
*max_index = max_ui;
--
1.9.3
More information about the mesa-dev
mailing list