[Mesa-dev] [PATCH] vc4: Optimizing vc4_load_utile/vc4_store_utile with sse for x86 build

Eric Anholt eric at anholt.net
Mon Apr 10 17:56:11 UTC 2017


maslov at eltechs.com writes:

> From: Maxim Maslov <maslov at eltechs.com>

The commit message needs to explain why we would want an SSE path at
all (given that the BCM2835 is an ARM SoC), and it should include some
performance data justifying the change.

>
> ---
>  src/gallium/drivers/vc4/vc4_tiling_lt.c | 93 +++++++++++++++++++++++++++++++--
>  1 file changed, 90 insertions(+), 3 deletions(-)
>
> diff --git a/src/gallium/drivers/vc4/vc4_tiling_lt.c b/src/gallium/drivers/vc4/vc4_tiling_lt.c
> index c9cbc65..d291262 100644
> --- a/src/gallium/drivers/vc4/vc4_tiling_lt.c
> +++ b/src/gallium/drivers/vc4/vc4_tiling_lt.c
> @@ -105,6 +105,49 @@ vc4_load_utile(void *cpu, void *gpu, uint32_t cpu_stride, uint32_t cpp)
>                          : "r"(gpu), "r"(cpu), "r"(cpu + 8), "r"(cpu_stride)
>                          : "q0", "q1", "q2", "q3");
>          }
> +#elif defined(USE_SSE_ASM)
> +        if (gpu_stride == 8) {
> +                __asm__ volatile (
> +                        "movdqu 0(%1), %%xmm0;"
> +                        "movdqu 0x10(%1), %%xmm1;"
> +                        "movdqu 0x20(%1), %%xmm2;"
> +                        "movdqu 0x30(%1), %%xmm3;"
> +                        "movlpd %%xmm0, 0(%0);"
> +                        "mov %2, %%ecx;"
> +                        "movhpd %%xmm0, 0(%0,%%ecx,1);"
> +                        "add %2, %%ecx;"
> +                        "movlpd %%xmm1, 0(%0,%%ecx,1);"
> +                        "add %2, %%ecx;"
> +                        "movhpd %%xmm1, 0(%0,%%ecx,1);"
> +                        "add %2, %%ecx;"
> +                        "movlpd %%xmm2, 0(%0,%%ecx,1);"
> +                        "add %2, %%ecx;"
> +                        "movhpd %%xmm2, 0(%0,%%ecx,1);"
> +                        "add %2, %%ecx;"
> +                        "movlpd %%xmm3, 0(%0,%%ecx,1);"
> +                        "add %2, %%ecx;"
> +                        "movhpd %%xmm3, 0(%0,%%ecx,1);"
> +                        :
> +                        : "r"(cpu), "r"(gpu), "r"(cpu_stride)
> +                        : "%xmm0",  "%xmm1",  "%xmm2",  "%xmm3", "%ecx");
> +        } else {
> +                assert(gpu_stride == 16);
> +                __asm__ volatile (
> +                        "movdqu 0(%1), %%xmm0;"
> +                        "movdqu 0x10(%1), %%xmm1;"
> +                        "movdqu 0x20(%1), %%xmm2;"
> +                        "movdqu 0x30(%1), %%xmm3;"
> +                        "movdqu %%xmm0, 0(%0);"
> +                        "mov %2, %%ecx;"
> +                        "movdqu %%xmm1, 0(%0,%%ecx,1);"
> +                        "add %2, %%ecx;"
> +                        "movdqu %%xmm2, 0(%0,%%ecx,1);"
> +                        "add %2, %%ecx;"
> +                        "movdqu %%xmm3, 0(%0,%%ecx,1);"
> +                        :
> +                        : "r"(cpu), "r"(gpu), "r"(cpu_stride)
> +                        : "%xmm0",  "%xmm1",  "%xmm2",  "%xmm3", "%ecx");
> +        }

Using SSE in Mesa requires runtime detection of whether SSE is actually present.


>  #endif
> -
>  }
>  
>  void
> @@ -175,6 +260,7 @@ NEON_TAG(vc4_load_lt_image)(void *dst, uint32_t dst_stride,
>                              int cpp, const struct pipe_box *box)
>  {
>          uint32_t utile_w = vc4_utile_width(cpp);
> +        uint32_t xfactor = 64 / utile_w;
>          uint32_t utile_h = vc4_utile_height(cpp);
>          uint32_t xstart = box->x;
>          uint32_t ystart = box->y;
> @@ -184,7 +270,7 @@ NEON_TAG(vc4_load_lt_image)(void *dst, uint32_t dst_stride,
>                          vc4_load_utile(dst + (dst_stride * y +
>                                                x * cpp),
>                                         src + ((ystart + y) * src_stride +
> -                                              (xstart + x) * 64 / utile_w),
> +                                              (xstart + x) * xfactor),
>                                         dst_stride, cpp);
>                  }
>          }
> @@ -196,6 +282,7 @@ NEON_TAG(vc4_store_lt_image)(void *dst, uint32_t dst_stride,
>                               int cpp, const struct pipe_box *box)
>  {
>          uint32_t utile_w = vc4_utile_width(cpp);
> +        uint32_t xfactor = 64 / utile_w;
>          uint32_t utile_h = vc4_utile_height(cpp);
>          uint32_t xstart = box->x;
>          uint32_t ystart = box->y;
> @@ -203,7 +290,7 @@ NEON_TAG(vc4_store_lt_image)(void *dst, uint32_t dst_stride,
>          for (uint32_t y = 0; y < box->height; y += utile_h) {
>                  for (int x = 0; x < box->width; x += utile_w) {
>                          vc4_store_utile(dst + ((ystart + y) * dst_stride +
> -                                               (xstart + x) * 64 / utile_w),
> +                                               (xstart + x) * xfactor),
>                                          src + (src_stride * y +
>                                                 x * cpp),
>                                          src_stride, cpp);
> -- 
> 2.7.4

Unrelated changes should be a separate commit.

(I would expect that this change doesn't do anything, because the
compiler moves the math out of the loop anyway).
-------------- next part --------------
A non-text attachment was scrubbed...
Name: signature.asc
Type: application/pgp-signature
Size: 832 bytes
Desc: not available
URL: <https://lists.freedesktop.org/archives/mesa-dev/attachments/20170410/6956b66d/attachment.sig>


More information about the mesa-dev mailing list