[Mesa-dev] [PATCH 7/7] broadcom/vc4: Optimize vc4_load_utile/vc4_store_utile with sse for x86

Eric Anholt eric at anholt.net
Tue Aug 8 20:19:52 UTC 2017


From: Maxim Maslov <maslov at eltechs.com>

The Mesa vc4 driver can be built for x86 in order to run hw-accelerated
graphics inside virtual machines (QEMU, Exagear) on the Raspberry Pi.

Improves intro video playback in Diablo II inside Exagear, dropping CPU
usage from 20% to 11%.

v2: Runtime CPU detection by Maxim
v3: Fix up cross-compiling and make runtime CPU detection match NEON's, by
    anholt.
---
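Reviewer note (not part of the commit): the gpu_stride == 8 SSE load path
amounts to the plain-C data movement sketched below. This is only an
illustrative sketch using the patch's cpu/gpu/cpu_stride names, not code
from the patch: each unaligned 16-byte movdqu covers two 8-byte utile
rows, and the low/high halves land in consecutive cpu_stride-separated
raster scanlines.

        /* Sketch of what the gpu_stride == 8 asm in vc4_load_utile() does:
         * four 16-byte loads from the 64-byte utile, each split into two
         * 8-byte rows of the CPU-side (raster) layout.
         */
        for (uint32_t i = 0; i < 4; i++) {
                memcpy(cpu + (2 * i + 0) * cpu_stride, gpu + 16 * i + 0, 8);
                memcpy(cpu + (2 * i + 1) * cpu_stride, gpu + 16 * i + 8, 8);
        }

The store path is the mirror image: it gathers pairs of raster rows into
xmm registers with movlpd/movhpd and writes them back to the utile with
movdqu.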
 src/gallium/drivers/vc4/Makefile.am     | 11 ++++
 src/gallium/drivers/vc4/vc4_tiling.h    | 20 +++++++
 src/gallium/drivers/vc4/vc4_tiling_lt.c | 96 ++++++++++++++++++++++++++++++++-
 3 files changed, 126 insertions(+), 1 deletion(-)

diff --git a/src/gallium/drivers/vc4/Makefile.am b/src/gallium/drivers/vc4/Makefile.am
index 4c2b7486c522..1dd57c5bcd7e 100644
--- a/src/gallium/drivers/vc4/Makefile.am
+++ b/src/gallium/drivers/vc4/Makefile.am
@@ -51,6 +51,17 @@ libvc4_neon_la_SOURCES = $(NEON_C_SOURCES)
 libvc4_neon_la_CFLAGS = $(AM_CFLAGS) -mfpu=neon
 endif
 
+if HAVE_X86_64_ASM
+# Disable SSE build on x86-64, which also has HAVE_X86_ASM set.
+else
+if HAVE_X86_ASM
+noinst_LTLIBRARIES += libvc4_sse.la
+libvc4_la_LIBADD += libvc4_sse.la
+libvc4_sse_la_SOURCES = vc4_tiling_lt.c
+libvc4_sse_la_CFLAGS = $(AM_CFLAGS) -msse -DVC4_BUILD_SSE
+endif
+endif
+
 libvc4_la_LDFLAGS = $(SIM_LDFLAGS)
 
 EXTRA_DIST = kernel/README
diff --git a/src/gallium/drivers/vc4/vc4_tiling.h b/src/gallium/drivers/vc4/vc4_tiling.h
index 66767e7f1f83..7360ec1a9bca 100644
--- a/src/gallium/drivers/vc4/vc4_tiling.h
+++ b/src/gallium/drivers/vc4/vc4_tiling.h
@@ -75,6 +75,12 @@ void vc4_load_lt_image_neon(void *dst, uint32_t dst_stride,
 void vc4_store_lt_image_neon(void *dst, uint32_t dst_stride,
                              void *src, uint32_t src_stride,
                              int cpp, const struct pipe_box *box);
+void vc4_load_lt_image_sse(void *dst, uint32_t dst_stride,
+                           void *src, uint32_t src_stride,
+                           int cpp, const struct pipe_box *box);
+void vc4_store_lt_image_sse(void *dst, uint32_t dst_stride,
+                            void *src, uint32_t src_stride,
+                            int cpp, const struct pipe_box *box);
 void vc4_load_tiled_image(void *dst, uint32_t dst_stride,
                           void *src, uint32_t src_stride,
                           uint8_t tiling_format, int cpp,
@@ -96,6 +102,13 @@ vc4_load_lt_image(void *dst, uint32_t dst_stride,
                 return;
         }
 #endif
+#ifdef USE_X86_ASM
+        if (util_cpu_caps.has_sse2) {
+                vc4_load_lt_image_sse(dst, dst_stride, src, src_stride,
+                                      cpp, box);
+                return;
+        }
+#endif
         vc4_load_lt_image_base(dst, dst_stride, src, src_stride,
                                cpp, box);
 }
@@ -112,6 +125,13 @@ vc4_store_lt_image(void *dst, uint32_t dst_stride,
                 return;
         }
 #endif
+#ifdef USE_X86_ASM
+        if (util_cpu_caps.has_sse2) {
+                vc4_store_lt_image_sse(dst, dst_stride, src, src_stride,
+                                       cpp, box);
+                return;
+        }
+#endif
 
         vc4_store_lt_image_base(dst, dst_stride, src, src_stride,
                                 cpp, box);
diff --git a/src/gallium/drivers/vc4/vc4_tiling_lt.c b/src/gallium/drivers/vc4/vc4_tiling_lt.c
index 29328bf0d332..865a4629f416 100644
--- a/src/gallium/drivers/vc4/vc4_tiling_lt.c
+++ b/src/gallium/drivers/vc4/vc4_tiling_lt.c
@@ -34,9 +34,12 @@
 #include <string.h>
 #include "pipe/p_state.h"
 #include "vc4_tiling.h"
+#include "util/u_cpu_detect.h"
 
 #ifdef VC4_BUILD_NEON
 #define NEON_TAG(x) x ## _neon
+#elif defined(VC4_BUILD_SSE)
+#define NEON_TAG(x) x ## _sse
 #else
 #define NEON_TAG(x) x ## _base
 #endif
@@ -149,7 +152,53 @@ vc4_load_utile(void *cpu, void *gpu, uint32_t cpu_stride, uint32_t cpp)
                         : "r"(gpu), "r"(cpu), "r"(cpu + 8), "r"(cpu_stride)
                         : "q0", "q1", "q2", "q3");
         }
+#elif defined(VC4_BUILD_SSE)
+        if (gpu_stride == 8) {
+                __asm__ volatile (
+                        "movdqu 0(%1), %%xmm0;"
+                        "movdqu 0x10(%1), %%xmm1;"
+                        "movdqu 0x20(%1), %%xmm2;"
+                        "movdqu 0x30(%1), %%xmm3;"
+                        "movlpd %%xmm0, 0(%0);"
+                        "mov %2, %%ecx;"
+                        "movhpd %%xmm0, 0(%0,%%ecx,1);"
+                        "add %2, %%ecx;"
+                        "movlpd %%xmm1, 0(%0,%%ecx,1);"
+                        "add %2, %%ecx;"
+                        "movhpd %%xmm1, 0(%0,%%ecx,1);"
+                        "add %2, %%ecx;"
+                        "movlpd %%xmm2, 0(%0,%%ecx,1);"
+                        "add %2, %%ecx;"
+                        "movhpd %%xmm2, 0(%0,%%ecx,1);"
+                        "add %2, %%ecx;"
+                        "movlpd %%xmm3, 0(%0,%%ecx,1);"
+                        "add %2, %%ecx;"
+                        "movhpd %%xmm3, 0(%0,%%ecx,1);"
+                        :
+                        : "r"(cpu), "r"(gpu), "r"(cpu_stride)
+                        : "%xmm0",  "%xmm1",  "%xmm2",  "%xmm3", "%ecx");
+        } else {
+                assert(gpu_stride == 16);
+                __asm__ volatile (
+                        "movdqu 0(%1), %%xmm0;"
+                        "movdqu 0x10(%1), %%xmm1;"
+                        "movdqu 0x20(%1), %%xmm2;"
+                        "movdqu 0x30(%1), %%xmm3;"
+                        "movdqu %%xmm0, 0(%0);"
+                        "mov %2, %%ecx;"
+                        "movdqu %%xmm1, 0(%0,%%ecx,1);"
+                        "add %2, %%ecx;"
+                        "movdqu %%xmm2, 0(%0,%%ecx,1);"
+                        "add %2, %%ecx;"
+                        "movdqu %%xmm3, 0(%0,%%ecx,1);"
+                        :
+                        : "r"(cpu), "r"(gpu), "r"(cpu_stride)
+                        : "%xmm0",  "%xmm1",  "%xmm2",  "%xmm3", "%ecx");
+        }
 #else
+        /* This generic loop runs only if we don't have SIMD acceleration for
+         * this CPU.
+         */
         for (uint32_t gpu_offset = 0; gpu_offset < 64; gpu_offset += gpu_stride) {
                 memcpy(cpu, gpu + gpu_offset, gpu_stride);
                 cpu += cpu_stride;
@@ -244,13 +293,58 @@ vc4_store_utile(void *gpu, void *cpu, uint32_t cpu_stride, uint32_t cpp)
                         : "r"(gpu), "r"(cpu), "r"(cpu + 8), "r"(cpu_stride)
                         : "q0", "q1", "q2", "q3");
         }
+#elif defined(VC4_BUILD_SSE)
+        if (gpu_stride == 8) {
+                __asm__ volatile (
+                        "movlpd 0(%1), %%xmm0;"
+                        "mov %2, %%ecx;"
+                        "movhpd 0(%1,%%ecx,1), %%xmm0;"
+                        "add %2, %%ecx;"
+                        "movlpd 0(%1,%%ecx,1), %%xmm1;"
+                        "add %2, %%ecx;"
+                        "movhpd 0(%1,%%ecx,1), %%xmm1;"
+                        "add %2, %%ecx;"
+                        "movlpd 0(%1,%%ecx,1), %%xmm2;"
+                        "add %2, %%ecx;"
+                        "movhpd 0(%1,%%ecx,1), %%xmm2;"
+                        "add %2, %%ecx;"
+                        "movlpd 0(%1,%%ecx,1), %%xmm3;"
+                        "add %2, %%ecx;"
+                        "movhpd 0(%1,%%ecx,1), %%xmm3;"
+                        "movdqu %%xmm0, 0(%0);"
+                        "movdqu %%xmm1, 0x10(%0);"
+                        "movdqu %%xmm2, 0x20(%0);"
+                        "movdqu %%xmm3, 0x30(%0);"
+                        :
+                        : "r"(gpu), "r"(cpu), "r"(cpu_stride)
+                        : "%xmm0",  "%xmm1",  "%xmm2",  "%xmm3", "%ecx");
+        } else {
+                assert(gpu_stride == 16);
+                __asm__ volatile (
+                        "movdqu 0(%1), %%xmm0;"
+                        "mov %2, %%ecx;"
+                        "movdqu 0(%1,%%ecx,1), %%xmm1;"
+                        "add %2, %%ecx;"
+                        "movdqu 0(%1,%%ecx,1), %%xmm2;"
+                        "add %2, %%ecx;"
+                        "movdqu 0(%1,%%ecx,1), %%xmm3;"
+                        "movdqu %%xmm0, 0(%0);"
+                        "movdqu %%xmm1, 0x10(%0);"
+                        "movdqu %%xmm2, 0x20(%0);"
+                        "movdqu %%xmm3, 0x30(%0);"
+                        :
+                        : "r"(gpu), "r"(cpu), "r"(cpu_stride)
+                        : "%xmm0",  "%xmm1",  "%xmm2",  "%xmm3", "%ecx");
+        }
 #else
+        /* This generic loop runs only if we don't have SIMD acceleration for
+         * this CPU.
+         */
         for (uint32_t gpu_offset = 0; gpu_offset < 64; gpu_offset += gpu_stride) {
                 memcpy(gpu + gpu_offset, cpu, gpu_stride);
                 cpu += cpu_stride;
         }
 #endif
-
 }
 
 void
-- 
2.13.3