[Mesa-dev] [PATCH] i965: Tell intel_get_memcpy() which direction the memcpy() is going.
Jason Ekstrand
jason at jlekstrand.net
Wed Mar 4 16:28:11 PST 2015
On Wed, Mar 4, 2015 at 4:19 PM, Matt Turner <mattst88 at gmail.com> wrote:
> The SSSE3 swizzling code was written for fast uploads to the GPU and
> assumed the destination was always 16-byte aligned. When we began using
> this code for fast downloads as well, we didn't do anything to account
> for the fact that the destination pointer given by glReadPixels() or
> glGetTexImage() is not guaranteed to be suitably aligned.
>
> With SSSE3 enabled (at compile time), some applications would crash when
> an SSE aligned-store instruction tried to store to an unaligned
> destination (or an assertion that the destination is aligned would
> trigger).
>
> To remedy this, tell intel_get_memcpy() whether we're uploading or
> downloading so that it can select whether to assume the destination or
> source is aligned, respectively.
>
> Cc: 10.5 <mesa-stable at lists.freedesktop.org>
> Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=89416
> ---
> What benchmarks should I run to test this code?
>
> src/mesa/drivers/dri/i965/intel_pixel_read.c | 3 +-
> src/mesa/drivers/dri/i965/intel_tex_image.c | 3 +-
> src/mesa/drivers/dri/i965/intel_tex_subimage.c | 3 +-
> src/mesa/drivers/dri/i965/intel_tiled_memcpy.c | 156 +++++++++++++++++++------
> src/mesa/drivers/dri/i965/intel_tiled_memcpy.h | 15 ++-
> 5 files changed, 138 insertions(+), 42 deletions(-)
>
> diff --git a/src/mesa/drivers/dri/i965/intel_pixel_read.c b/src/mesa/drivers/dri/i965/intel_pixel_read.c
> index df22a63..0972121 100644
> --- a/src/mesa/drivers/dri/i965/intel_pixel_read.c
> +++ b/src/mesa/drivers/dri/i965/intel_pixel_read.c
> @@ -139,7 +139,8 @@ intel_readpixels_tiled_memcpy(struct gl_context * ctx,
> rb->Format == MESA_FORMAT_R8G8B8X8_UNORM)
> return false;
>
> - if (!intel_get_memcpy(rb->Format, format, type, &mem_copy, &cpp))
> + if (!intel_get_memcpy(rb->Format, format, type, &mem_copy, &cpp,
> + INTEL_DOWNLOAD))
> return false;
>
> if (!irb->mt ||
> diff --git a/src/mesa/drivers/dri/i965/intel_tex_image.c b/src/mesa/drivers/dri/i965/intel_tex_image.c
> index da42fdd..a3312d2 100644
> --- a/src/mesa/drivers/dri/i965/intel_tex_image.c
> +++ b/src/mesa/drivers/dri/i965/intel_tex_image.c
> @@ -408,7 +408,8 @@ intel_gettexsubimage_tiled_memcpy(struct gl_context *ctx,
> texImage->TexFormat == MESA_FORMAT_R8G8B8X8_UNORM)
> return false;
>
> - if (!intel_get_memcpy(texImage->TexFormat, format, type, &mem_copy, &cpp))
> + if (!intel_get_memcpy(texImage->TexFormat, format, type, &mem_copy, &cpp,
> + INTEL_UPLOAD))
>
Should be INTEL_DOWNLOAD; glGetTexImage() is a download from the GPU.
> return false;
>
> /* If this is a nontrivial texture view, let another path handle it instead. */
> diff --git a/src/mesa/drivers/dri/i965/intel_tex_subimage.c b/src/mesa/drivers/dri/i965/intel_tex_subimage.c
> index 4262f71..909ff25 100644
> --- a/src/mesa/drivers/dri/i965/intel_tex_subimage.c
> +++ b/src/mesa/drivers/dri/i965/intel_tex_subimage.c
> @@ -118,7 +118,8 @@ intel_texsubimage_tiled_memcpy(struct gl_context * ctx,
> packing->Invert)
> return false;
>
> - if (!intel_get_memcpy(texImage->TexFormat, format, type, &mem_copy, &cpp))
> + if (!intel_get_memcpy(texImage->TexFormat, format, type, &mem_copy, &cpp,
> + INTEL_UPLOAD))
> return false;
>
> /* If this is a nontrivial texture view, let another path handle it instead. */
> diff --git a/src/mesa/drivers/dri/i965/intel_tiled_memcpy.c b/src/mesa/drivers/dri/i965/intel_tiled_memcpy.c
> index f2b35cb..c43383b 100644
> --- a/src/mesa/drivers/dri/i965/intel_tiled_memcpy.c
> +++ b/src/mesa/drivers/dri/i965/intel_tiled_memcpy.c
> @@ -60,42 +60,79 @@ static const uint32_t ytile_span = 16;
> static const uint8_t rgba8_permutation[16] =
> { 2,1,0,3, 6,5,4,7, 10,9,8,11, 14,13,12,15 };
>
> -/* NOTE: dst must be 16 byte aligned */
> -#define rgba8_copy_16(dst, src) \
> - *(__m128i *)(dst) = _mm_shuffle_epi8( \
> - (__m128i) _mm_loadu_ps((float *)(src)), \
> - *(__m128i *) rgba8_permutation \
> - )
> +/* NOTE: dst must be 16-byte aligned. src may be unaligned. */
> +#define rgba8_copy_16_aligned_dst(dst, src) \
> + _mm_store_si128((__m128i *)(dst), \
> + _mm_shuffle_epi8(_mm_loadu_si128((__m128i *)(src)), \
> + *(__m128i *) rgba8_permutation))
> +
> +/* NOTE: src must be 16-byte aligned. dst may be unaligned. */
> +#define rgba8_copy_16_aligned_src(dst, src) \
> + _mm_storeu_si128((__m128i *)(dst), \
> + _mm_shuffle_epi8(_mm_load_si128((__m128i *)(src)), \
> + *(__m128i *) rgba8_permutation))
> #endif
>
> /**
> - * Copy RGBA to BGRA - swap R and B.
> + * Copy RGBA to BGRA - swap R and B, with the destination 16-byte aligned.
> */
> static inline void *
> -rgba8_copy(void *dst, const void *src, size_t bytes)
> +rgba8_copy_aligned_dst(void *dst, const void *src, size_t bytes)
> {
> uint8_t *d = dst;
> uint8_t const *s = src;
>
> #ifdef __SSSE3__
> - /* Fast copying for tile spans.
> - *
> - * As long as the destination texture is 16 aligned,
> - * any 16 or 64 spans we get here should also be 16 aligned.
> - */
> -
> if (bytes == 16) {
> assert(!(((uintptr_t)dst) & 0xf));
> - rgba8_copy_16(d+ 0, s+ 0);
> + rgba8_copy_16_aligned_dst(d+ 0, s+ 0);
> return dst;
> }
>
> if (bytes == 64) {
> assert(!(((uintptr_t)dst) & 0xf));
> - rgba8_copy_16(d+ 0, s+ 0);
> - rgba8_copy_16(d+16, s+16);
> - rgba8_copy_16(d+32, s+32);
> - rgba8_copy_16(d+48, s+48);
> + rgba8_copy_16_aligned_dst(d+ 0, s+ 0);
> + rgba8_copy_16_aligned_dst(d+16, s+16);
> + rgba8_copy_16_aligned_dst(d+32, s+32);
> + rgba8_copy_16_aligned_dst(d+48, s+48);
> + return dst;
> + }
> +#endif
> +
> + while (bytes >= 4) {
> + d[0] = s[2];
> + d[1] = s[1];
> + d[2] = s[0];
> + d[3] = s[3];
> + d += 4;
> + s += 4;
> + bytes -= 4;
> + }
> + return dst;
> +}
> +
> +/**
> + * Copy RGBA to BGRA - swap R and B, with the source 16-byte aligned.
> + */
> +static inline void *
> +rgba8_copy_aligned_src(void *dst, const void *src, size_t bytes)
> +{
> + uint8_t *d = dst;
> + uint8_t const *s = src;
> +
> +#ifdef __SSSE3__
> + if (bytes == 16) {
> + assert(!(((uintptr_t)src) & 0xf));
> + rgba8_copy_16_aligned_src(d+ 0, s+ 0);
> + return dst;
> + }
> +
> + if (bytes == 64) {
> + assert(!(((uintptr_t)src) & 0xf));
> + rgba8_copy_16_aligned_src(d+ 0, s+ 0);
> + rgba8_copy_16_aligned_src(d+16, s+16);
> + rgba8_copy_16_aligned_src(d+32, s+32);
> + rgba8_copy_16_aligned_src(d+48, s+48);
> return dst;
> }
> #endif
> @@ -357,16 +394,26 @@ linear_to_xtiled_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
> if (mem_copy == memcpy)
> return linear_to_xtiled(0, 0, xtile_width, xtile_width, 0, xtile_height,
> dst, src, src_pitch, swizzle_bit, memcpy);
> - else if (mem_copy == rgba8_copy)
> + else if (mem_copy == rgba8_copy_aligned_dst)
> + return linear_to_xtiled(0, 0, xtile_width, xtile_width, 0, xtile_height,
> + dst, src, src_pitch, swizzle_bit,
> + rgba8_copy_aligned_dst);
> + else if (mem_copy == rgba8_copy_aligned_src)
> return linear_to_xtiled(0, 0, xtile_width, xtile_width, 0, xtile_height,
> - dst, src, src_pitch, swizzle_bit, rgba8_copy);
> + dst, src, src_pitch, swizzle_bit,
> + rgba8_copy_aligned_src);
>
linear_to_xtiled will always be called with an aligned destination, so you
only need the one case.
> } else {
> if (mem_copy == memcpy)
> return linear_to_xtiled(x0, x1, x2, x3, y0, y1,
> dst, src, src_pitch, swizzle_bit, memcpy);
> - else if (mem_copy == rgba8_copy)
> + else if (mem_copy == rgba8_copy_aligned_dst)
> return linear_to_xtiled(x0, x1, x2, x3, y0, y1,
> - dst, src, src_pitch, swizzle_bit, rgba8_copy);
> + dst, src, src_pitch, swizzle_bit,
> + rgba8_copy_aligned_dst);
> + else if (mem_copy == rgba8_copy_aligned_src)
> + return linear_to_xtiled(x0, x1, x2, x3, y0, y1,
> + dst, src, src_pitch, swizzle_bit,
> + rgba8_copy_aligned_src);
> }
> linear_to_xtiled(x0, x1, x2, x3, y0, y1,
> dst, src, src_pitch, swizzle_bit, mem_copy);
> @@ -393,16 +440,26 @@ linear_to_ytiled_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
> if (mem_copy == memcpy)
> return linear_to_ytiled(0, 0, ytile_width, ytile_width, 0, ytile_height,
> dst, src, src_pitch, swizzle_bit, memcpy);
> - else if (mem_copy == rgba8_copy)
> + else if (mem_copy == rgba8_copy_aligned_dst)
> + return linear_to_ytiled(0, 0, ytile_width, ytile_width, 0, ytile_height,
> + dst, src, src_pitch, swizzle_bit,
> + rgba8_copy_aligned_dst);
> + else if (mem_copy == rgba8_copy_aligned_src)
> return linear_to_ytiled(0, 0, ytile_width, ytile_width, 0, ytile_height,
> - dst, src, src_pitch, swizzle_bit, rgba8_copy);
> + dst, src, src_pitch, swizzle_bit,
> + rgba8_copy_aligned_src);
>
Same for linear_to_ytiled.
> } else {
> if (mem_copy == memcpy)
> return linear_to_ytiled(x0, x1, x2, x3, y0, y1,
> dst, src, src_pitch, swizzle_bit, memcpy);
> - else if (mem_copy == rgba8_copy)
> + else if (mem_copy == rgba8_copy_aligned_dst)
> return linear_to_ytiled(x0, x1, x2, x3, y0, y1,
> - dst, src, src_pitch, swizzle_bit, rgba8_copy);
> + dst, src, src_pitch, swizzle_bit,
> + rgba8_copy_aligned_dst);
> + else if (mem_copy == rgba8_copy_aligned_src)
> + return linear_to_ytiled(x0, x1, x2, x3, y0, y1,
> + dst, src, src_pitch, swizzle_bit,
> + rgba8_copy_aligned_src);
> }
> linear_to_ytiled(x0, x1, x2, x3, y0, y1,
> dst, src, src_pitch, swizzle_bit, mem_copy);
> @@ -429,16 +486,26 @@ xtiled_to_linear_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
> if (mem_copy == memcpy)
> return xtiled_to_linear(0, 0, xtile_width, xtile_width, 0, xtile_height,
> dst, src, dst_pitch, swizzle_bit, memcpy);
> - else if (mem_copy == rgba8_copy)
> + else if (mem_copy == rgba8_copy_aligned_dst)
> + return xtiled_to_linear(0, 0, xtile_width, xtile_width, 0, xtile_height,
> + dst, src, dst_pitch, swizzle_bit,
> + rgba8_copy_aligned_dst);
> + else if (mem_copy == rgba8_copy_aligned_src)
> return xtiled_to_linear(0, 0, xtile_width, xtile_width, 0, xtile_height,
> - dst, src, dst_pitch, swizzle_bit, rgba8_copy);
> + dst, src, dst_pitch, swizzle_bit,
> + rgba8_copy_aligned_src);
> } else {
> if (mem_copy == memcpy)
> return xtiled_to_linear(x0, x1, x2, x3, y0, y1,
> dst, src, dst_pitch, swizzle_bit, memcpy);
> - else if (mem_copy == rgba8_copy)
> + else if (mem_copy == rgba8_copy_aligned_dst)
> + return xtiled_to_linear(x0, x1, x2, x3, y0, y1,
> + dst, src, dst_pitch, swizzle_bit,
> + rgba8_copy_aligned_dst);
> + else if (mem_copy == rgba8_copy_aligned_src)
> return xtiled_to_linear(x0, x1, x2, x3, y0, y1,
> - dst, src, dst_pitch, swizzle_bit, rgba8_copy);
> + dst, src, dst_pitch, swizzle_bit,
> + rgba8_copy_aligned_src);
xtiled_to_linear will only ever be called with an aligned source, so the same
reduction applies here.
> }
> xtiled_to_linear(x0, x1, x2, x3, y0, y1,
> dst, src, dst_pitch, swizzle_bit, mem_copy);
> @@ -465,16 +532,26 @@ ytiled_to_linear_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
> if (mem_copy == memcpy)
> return ytiled_to_linear(0, 0, ytile_width, ytile_width, 0, ytile_height,
> dst, src, dst_pitch, swizzle_bit, memcpy);
> - else if (mem_copy == rgba8_copy)
> + else if (mem_copy == rgba8_copy_aligned_dst)
> return ytiled_to_linear(0, 0, ytile_width, ytile_width, 0, ytile_height,
> - dst, src, dst_pitch, swizzle_bit, rgba8_copy);
> + dst, src, dst_pitch, swizzle_bit,
> + rgba8_copy_aligned_dst);
> + else if (mem_copy == rgba8_copy_aligned_src)
> + return ytiled_to_linear(0, 0, ytile_width, ytile_width, 0, ytile_height,
> + dst, src, dst_pitch, swizzle_bit,
> + rgba8_copy_aligned_src);
>
Same for ytiled_to_linear.

Kill off the unneeded cases and fix the typo, and this patch is

Reviewed-by: Jason Ekstrand <jason.ekstrand at intel.com>
> } else {
> if (mem_copy == memcpy)
> return ytiled_to_linear(x0, x1, x2, x3, y0, y1,
> dst, src, dst_pitch, swizzle_bit, memcpy);
> - else if (mem_copy == rgba8_copy)
> + else if (mem_copy == rgba8_copy_aligned_dst)
> + return ytiled_to_linear(x0, x1, x2, x3, y0, y1,
> + dst, src, dst_pitch, swizzle_bit,
> + rgba8_copy_aligned_dst);
> + else if (mem_copy == rgba8_copy_aligned_src)
> return ytiled_to_linear(x0, x1, x2, x3, y0, y1,
> - dst, src, dst_pitch, swizzle_bit, rgba8_copy);
> + dst, src, dst_pitch, swizzle_bit,
> + rgba8_copy_aligned_src);
> }
> ytiled_to_linear(x0, x1, x2, x3, y0, y1,
> dst, src, dst_pitch, swizzle_bit, mem_copy);
> @@ -684,7 +761,8 @@ tiled_to_linear(uint32_t xt1, uint32_t xt2,
> * \return true if the format and type combination are valid
> */
> bool intel_get_memcpy(mesa_format tiledFormat, GLenum format,
> - GLenum type, mem_copy_fn* mem_copy, uint32_t* cpp)
> + GLenum type, mem_copy_fn *mem_copy, uint32_t *cpp,
> + enum intel_memcpy_direction direction)
> {
> if (type == GL_UNSIGNED_INT_8_8_8_8_REV &&
> !(format == GL_RGBA || format == GL_BGRA))
> @@ -700,7 +778,8 @@ bool intel_get_memcpy(mesa_format tiledFormat, GLenum format,
> if (format == GL_BGRA) {
> *mem_copy = memcpy;
> } else if (format == GL_RGBA) {
> - *mem_copy = rgba8_copy;
> + *mem_copy = direction == INTEL_UPLOAD ? rgba8_copy_aligned_dst
> + : rgba8_copy_aligned_src;
> }
> } else if ((tiledFormat == MESA_FORMAT_R8G8B8A8_UNORM) ||
> (tiledFormat == MESA_FORMAT_R8G8B8X8_UNORM)) {
> @@ -709,7 +788,8 @@ bool intel_get_memcpy(mesa_format tiledFormat, GLenum format,
> /* Copying from RGBA to BGRA is the same as BGRA to RGBA so we can
> * use the same function.
> */
> - *mem_copy = rgba8_copy;
> + *mem_copy = direction == INTEL_UPLOAD ? rgba8_copy_aligned_dst
> + : rgba8_copy_aligned_src;
> } else if (format == GL_RGBA) {
> *mem_copy = memcpy;
> }
> diff --git a/src/mesa/drivers/dri/i965/intel_tiled_memcpy.h b/src/mesa/drivers/dri/i965/intel_tiled_memcpy.h
> index 3ff0d71..9dc1088 100644
> --- a/src/mesa/drivers/dri/i965/intel_tiled_memcpy.h
> +++ b/src/mesa/drivers/dri/i965/intel_tiled_memcpy.h
> @@ -55,7 +55,20 @@ tiled_to_linear(uint32_t xt1, uint32_t xt2,
> uint32_t tiling,
> mem_copy_fn mem_copy);
>
> +/* Tells intel_get_memcpy() whether the memcpy() is
> + *
> + * - an upload to the GPU with an aligned destination and a potentially
> + * unaligned source; or
> + * - a download from the GPU with an aligned source and a potentially
> + * unaligned destination.
> + */
> +enum intel_memcpy_direction {
> + INTEL_UPLOAD,
> + INTEL_DOWNLOAD
> +};
> +
> bool intel_get_memcpy(mesa_format tiledFormat, GLenum format,
> - GLenum type, mem_copy_fn* mem_copy, uint32_t* cpp);
> + GLenum type, mem_copy_fn *mem_copy, uint32_t *cpp,
> + enum intel_memcpy_direction direction);
>
> #endif /* INTEL_TILED_MEMCPY */
> --
> 2.0.5
>
>