[Mesa-dev] [PATCH] vc4: Optimize vc4_load_utile/vc4_store_utile with SSE for x86 builds

Mon Apr 10 17:40:12 UTC 2017

From: Maxim Maslov <maslov at eltechs.com>

---
 src/gallium/drivers/vc4/vc4_tiling_lt.c | 93 +++++++++++++++++++++++++++++++--
 1 file changed, 90 insertions(+), 3 deletions(-)

diff --git a/src/gallium/drivers/vc4/vc4_tiling_lt.c b/src/gallium/drivers/vc4/vc4_tiling_lt.c
index c9cbc65..d291262 100644
--- a/src/gallium/drivers/vc4/vc4_tiling_lt.c
+++ b/src/gallium/drivers/vc4/vc4_tiling_lt.c
@@ -105,6 +105,49 @@ vc4_load_utile(void *cpu, void *gpu, uint32_t cpu_stride, uint32_t cpp)
                         : "r"(gpu), "r"(cpu), "r"(cpu + 8), "r"(cpu_stride)
                         : "q0", "q1", "q2", "q3");
         }
+#elif defined(USE_SSE_ASM)
+        if (gpu_stride == 8) {
+                __asm__ volatile (
+                        "movdqu 0(%1), %%xmm0;"
+                        "movdqu 0x10(%1), %%xmm1;"
+                        "movdqu 0x20(%1), %%xmm2;"
+                        "movdqu 0x30(%1), %%xmm3;"
+                        "movlpd %%xmm0, 0(%0);"
+                        "mov %2, %%ecx;"
+                        "movhpd %%xmm0, 0(%0,%%ecx,1);"
+                        "add %2, %%ecx;"
+                        "movlpd %%xmm1, 0(%0,%%ecx,1);"
+                        "add %2, %%ecx;"
+                        "movhpd %%xmm1, 0(%0,%%ecx,1);"
+                        "add %2, %%ecx;"
+                        "movlpd %%xmm2, 0(%0,%%ecx,1);"
+                        "add %2, %%ecx;"
+                        "movhpd %%xmm2, 0(%0,%%ecx,1);"
+                        "add %2, %%ecx;"
+                        "movlpd %%xmm3, 0(%0,%%ecx,1);"
+                        "add %2, %%ecx;"
+                        "movhpd %%xmm3, 0(%0,%%ecx,1);"
+                        :
+                        : "r"(cpu), "r"(gpu), "r"(cpu_stride)
+                        : "%xmm0",  "%xmm1",  "%xmm2",  "%xmm3", "%ecx");
+        } else {
+                assert(gpu_stride == 16);
+                __asm__ volatile (
+                        "movdqu 0(%1), %%xmm0;"
+                        "movdqu 0x10(%1), %%xmm1;"
+                        "movdqu 0x20(%1), %%xmm2;"
+                        "movdqu 0x30(%1), %%xmm3;"
+                        "movdqu %%xmm0, 0(%0);"
+                        "mov %2, %%ecx;"
+                        "movdqu %%xmm1, 0(%0,%%ecx,1);"
+                        "add %2, %%ecx;"
+                        "movdqu %%xmm2, 0(%0,%%ecx,1);"
+                        "add %2, %%ecx;"
+                        "movdqu %%xmm3, 0(%0,%%ecx,1);"
+                        :
+                        : "r"(cpu), "r"(gpu), "r"(cpu_stride)
+                        : "%xmm0",  "%xmm1",  "%xmm2",  "%xmm3", "%ecx");
+        }
 #else
         for (uint32_t gpu_offset = 0; gpu_offset < 64; gpu_offset += gpu_stride) {
                 memcpy(cpu, gpu + gpu_offset, gpu_stride);
@@ -160,13 +203,55 @@ vc4_store_utile(void *gpu, void *cpu, uint32_t cpu_stride, uint32_t cpp)
                         : "r"(gpu), "r"(cpu), "r"(cpu + 8), "r"(cpu_stride)
                         : "q0", "q1", "q2", "q3");
         }
+#elif defined(USE_SSE_ASM)
+        if (gpu_stride == 8) {
+                __asm__ volatile (
+                        "movlpd 0(%1), %%xmm0;"
+                        "mov %2, %%ecx;"
+                        "movhpd 0(%1,%%ecx,1), %%xmm0;"
+                        "add %2, %%ecx;"
+                        "movlpd 0(%1,%%ecx,1), %%xmm1;"
+                        "add %2, %%ecx;"
+                        "movhpd 0(%1,%%ecx,1), %%xmm1;"
+                        "add %2, %%ecx;"
+                        "movlpd 0(%1,%%ecx,1), %%xmm2;"
+                        "add %2, %%ecx;"
+                        "movhpd 0(%1,%%ecx,1), %%xmm2;"
+                        "add %2, %%ecx;"
+                        "movlpd 0(%1,%%ecx,1), %%xmm3;"
+                        "add %2, %%ecx;"
+                        "movhpd 0(%1,%%ecx,1), %%xmm3;"
+                        "movdqu %%xmm0, 0(%0);"
+                        "movdqu %%xmm1, 0x10(%0);"
+                        "movdqu %%xmm2, 0x20(%0);"
+                        "movdqu %%xmm3, 0x30(%0);"
+                        :
+                        : "r"(gpu), "r"(cpu), "r"(cpu_stride)
+                        : "%xmm0",  "%xmm1",  "%xmm2",  "%xmm3", "%ecx");
+        } else {
+                assert(gpu_stride == 16);
+                __asm__ volatile (
+                       "movdqu 0(%1), %%xmm0;"
+                       "mov %2, %%ecx;"
+                       "movdqu 0(%1,%%ecx,1), %%xmm1;"
+                       "add %2, %%ecx;"
+                       "movdqu 0(%1,%%ecx,1), %%xmm2;"
+                       "add %2, %%ecx;"
+                       "movdqu 0(%1,%%ecx,1), %%xmm3;"
+                       "movdqu %%xmm0, 0(%0);"
+                       "movdqu %%xmm1, 0x10(%0);"
+                       "movdqu %%xmm2, 0x20(%0);"
+                       "movdqu %%xmm3, 0x30(%0);"
+                       :
+                       : "r"(gpu), "r"(cpu), "r"(cpu_stride)
+                       : "%xmm0",  "%xmm1",  "%xmm2",  "%xmm3", "%ecx");
+        }
 #else
         for (uint32_t gpu_offset = 0; gpu_offset < 64; gpu_offset += gpu_stride) {
                 memcpy(gpu + gpu_offset, cpu, gpu_stride);
                 cpu += cpu_stride;
         }
 #endif
-
 }
 
 void
@@ -175,6 +260,7 @@ NEON_TAG(vc4_load_lt_image)(void *dst, uint32_t dst_stride,
                             int cpp, const struct pipe_box *box)
 {
         uint32_t utile_w = vc4_utile_width(cpp);
+        uint32_t xfactor = 64 / utile_w;
         uint32_t utile_h = vc4_utile_height(cpp);
         uint32_t xstart = box->x;
         uint32_t ystart = box->y;
@@ -184,7 +270,7 @@ NEON_TAG(vc4_load_lt_image)(void *dst, uint32_t dst_stride,
                         vc4_load_utile(dst + (dst_stride * y +
                                               x * cpp),
                                        src + ((ystart + y) * src_stride +
-                                              (xstart + x) * 64 / utile_w),
+                                              (xstart + x) * xfactor),
                                        dst_stride, cpp);
                 }
         }
@@ -196,6 +282,7 @@ NEON_TAG(vc4_store_lt_image)(void *dst, uint32_t dst_stride,
                              int cpp, const struct pipe_box *box)
 {
         uint32_t utile_w = vc4_utile_width(cpp);
+        uint32_t xfactor = 64 / utile_w;
         uint32_t utile_h = vc4_utile_height(cpp);
         uint32_t xstart = box->x;
         uint32_t ystart = box->y;
@@ -203,7 +290,7 @@ NEON_TAG(vc4_store_lt_image)(void *dst, uint32_t dst_stride,
         for (uint32_t y = 0; y < box->height; y += utile_h) {
                 for (int x = 0; x < box->width; x += utile_w) {
                         vc4_store_utile(dst + ((ystart + y) * dst_stride +
-                                               (xstart + x) * 64 / utile_w),
+                                               (xstart + x) * xfactor),
                                         src + (src_stride * y +
                                                x * cpp),
                                         src_stride, cpp);
-- 
2.7.4