<div dir="ltr"><br><div class="gmail_extra"><br><div class="gmail_quote">On Wed, Mar 4, 2015 at 4:19 PM, Matt Turner <span dir="ltr"><<a href="mailto:mattst88@gmail.com" target="_blank">mattst88@gmail.com</a>></span> wrote:<br><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">The SSSE3 swizzling code was written for fast uploads to the GPU and<br>
assumed the destination was always 16-byte aligned. When we began using<br>
this code for fast downloads as well we didn't do anything to account<br>
for the fact that the destination pointer given by glReadPixels() or<br>
glGetTexImage() is not guaranteed to be suitably aligned.<br>
<br>
With SSSE3 enabled (at compile-time), some applications would crash when<br>
an SSE aligned-store instruction tried to store to an unaligned<br>
destination (or an assertion that the destination is aligned would<br>
trigger).<br>
<br>
To remedy this, tell intel_get_memcpy() whether we're uploading or<br>
downloading so that it can select whether to assume the destination or<br>
source is aligned, respectively.<br>
<br>
Cc: 10.5 <<a href="mailto:mesa-stable@lists.freedesktop.org">mesa-stable@lists.freedesktop.org</a>><br>
Bugzilla: <a href="https://bugs.freedesktop.org/show_bug.cgi?id=89416" target="_blank">https://bugs.freedesktop.org/show_bug.cgi?id=89416</a><br>
---<br>
I don't know what benchmarks I should run to test this code?<br>
<br>
 src/mesa/drivers/dri/i965/intel_pixel_read.c   |   3 +-<br>
 src/mesa/drivers/dri/i965/intel_tex_image.c    |   3 +-<br>
 src/mesa/drivers/dri/i965/intel_tex_subimage.c |   3 +-<br>
 src/mesa/drivers/dri/i965/intel_tiled_memcpy.c | 156 +++++++++++++++++++------<br>
 src/mesa/drivers/dri/i965/intel_tiled_memcpy.h |  15 ++-<br>
 5 files changed, 138 insertions(+), 42 deletions(-)<br>
<br>
diff --git a/src/mesa/drivers/dri/i965/intel_pixel_read.c b/src/mesa/drivers/dri/i965/intel_pixel_read.c<br>
index df22a63..0972121 100644<br>
--- a/src/mesa/drivers/dri/i965/intel_pixel_read.c<br>
+++ b/src/mesa/drivers/dri/i965/intel_pixel_read.c<br>
@@ -139,7 +139,8 @@ intel_readpixels_tiled_memcpy(struct gl_context * ctx,<br>
        rb->Format == MESA_FORMAT_R8G8B8X8_UNORM)<br>
       return false;<br>
<br>
-   if (!intel_get_memcpy(rb->Format, format, type, &mem_copy, &cpp))<br>
+   if (!intel_get_memcpy(rb->Format, format, type, &mem_copy, &cpp,<br>
+                         INTEL_DOWNLOAD))<br>
       return false;<br>
<br>
    if (!irb->mt ||<br>
diff --git a/src/mesa/drivers/dri/i965/intel_tex_image.c b/src/mesa/drivers/dri/i965/intel_tex_image.c<br>
index da42fdd..a3312d2 100644<br>
--- a/src/mesa/drivers/dri/i965/intel_tex_image.c<br>
+++ b/src/mesa/drivers/dri/i965/intel_tex_image.c<br>
@@ -408,7 +408,8 @@ intel_gettexsubimage_tiled_memcpy(struct gl_context *ctx,<br>
        texImage->TexFormat == MESA_FORMAT_R8G8B8X8_UNORM)<br>
       return false;<br>
<br>
-   if (!intel_get_memcpy(texImage->TexFormat, format, type, &mem_copy, &cpp))<br>
+   if (!intel_get_memcpy(texImage->TexFormat, format, type, &mem_copy, &cpp,<br>
+                         INTEL_UPLOAD))<br></blockquote><div><br></div><div>Should be INTEL_DOWNLOAD<br></div><div> </div><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">
       return false;<br>
<br>
    /* If this is a nontrivial texture view, let another path handle it instead. */<br>
diff --git a/src/mesa/drivers/dri/i965/intel_tex_subimage.c b/src/mesa/drivers/dri/i965/intel_tex_subimage.c<br>
index 4262f71..909ff25 100644<br>
--- a/src/mesa/drivers/dri/i965/intel_tex_subimage.c<br>
+++ b/src/mesa/drivers/dri/i965/intel_tex_subimage.c<br>
@@ -118,7 +118,8 @@ intel_texsubimage_tiled_memcpy(struct gl_context * ctx,<br>
        packing->Invert)<br>
       return false;<br>
<br>
-   if (!intel_get_memcpy(texImage->TexFormat, format, type, &mem_copy, &cpp))<br>
+   if (!intel_get_memcpy(texImage->TexFormat, format, type, &mem_copy, &cpp,<br>
+                         INTEL_UPLOAD))<br>
       return false;<br>
<br>
    /* If this is a nontrivial texture view, let another path handle it instead. */<br>
diff --git a/src/mesa/drivers/dri/i965/intel_tiled_memcpy.c b/src/mesa/drivers/dri/i965/intel_tiled_memcpy.c<br>
index f2b35cb..c43383b 100644<br>
--- a/src/mesa/drivers/dri/i965/intel_tiled_memcpy.c<br>
+++ b/src/mesa/drivers/dri/i965/intel_tiled_memcpy.c<br>
@@ -60,42 +60,79 @@ static const uint32_t ytile_span = 16;<br>
 static const uint8_t rgba8_permutation[16] =<br>
    { 2,1,0,3, 6,5,4,7, 10,9,8,11, 14,13,12,15 };<br>
<br>
-/* NOTE: dst must be 16 byte aligned */<br>
-#define rgba8_copy_16(dst, src)                     \<br>
-   *(__m128i *)(dst) = _mm_shuffle_epi8(            \<br>
-      (__m128i) _mm_loadu_ps((float *)(src)),       \<br>
-      *(__m128i *) rgba8_permutation                \<br>
-   )<br>
+/* NOTE: dst must be 16-byte aligned. src may be unaligned. */<br>
+#define rgba8_copy_16_aligned_dst(dst, src)                            \<br>
+   _mm_store_si128((__m128i *)(dst),                                   \<br>
+                   _mm_shuffle_epi8(_mm_loadu_si128((__m128i *)(src)), \<br>
+                                    *(__m128i *) rgba8_permutation))<br>
+<br>
+/* NOTE: src must be 16-byte aligned. dst may be unaligned. */<br>
+#define rgba8_copy_16_aligned_src(dst, src)                            \<br>
+   _mm_storeu_si128((__m128i *)(dst),                                  \<br>
+                    _mm_shuffle_epi8(_mm_load_si128((__m128i *)(src)), \<br>
+                                     *(__m128i *) rgba8_permutation))<br>
 #endif<br>
<br>
 /**<br>
- * Copy RGBA to BGRA - swap R and B.<br>
+ * Copy RGBA to BGRA - swap R and B, with the destination 16-byte aligned.<br>
  */<br>
 static inline void *<br>
-rgba8_copy(void *dst, const void *src, size_t bytes)<br>
+rgba8_copy_aligned_dst(void *dst, const void *src, size_t bytes)<br>
 {<br>
    uint8_t *d = dst;<br>
    uint8_t const *s = src;<br>
<br>
 #ifdef __SSSE3__<br>
-   /* Fast copying for tile spans.<br>
-    *<br>
-    * As long as the destination texture is 16 aligned,<br>
-    * any 16 or 64 spans we get here should also be 16 aligned.<br>
-    */<br>
-<br>
    if (bytes == 16) {<br>
       assert(!(((uintptr_t)dst) & 0xf));<br>
-      rgba8_copy_16(d+ 0, s+ 0);<br>
+      rgba8_copy_16_aligned_dst(d+ 0, s+ 0);<br>
       return dst;<br>
    }<br>
<br>
    if (bytes == 64) {<br>
       assert(!(((uintptr_t)dst) & 0xf));<br>
-      rgba8_copy_16(d+ 0, s+ 0);<br>
-      rgba8_copy_16(d+16, s+16);<br>
-      rgba8_copy_16(d+32, s+32);<br>
-      rgba8_copy_16(d+48, s+48);<br>
+      rgba8_copy_16_aligned_dst(d+ 0, s+ 0);<br>
+      rgba8_copy_16_aligned_dst(d+16, s+16);<br>
+      rgba8_copy_16_aligned_dst(d+32, s+32);<br>
+      rgba8_copy_16_aligned_dst(d+48, s+48);<br>
+      return dst;<br>
+   }<br>
+#endif<br>
+<br>
+   while (bytes >= 4) {<br>
+      d[0] = s[2];<br>
+      d[1] = s[1];<br>
+      d[2] = s[0];<br>
+      d[3] = s[3];<br>
+      d += 4;<br>
+      s += 4;<br>
+      bytes -= 4;<br>
+   }<br>
+   return dst;<br>
+}<br>
+<br>
+/**<br>
+ * Copy RGBA to BGRA - swap R and B, with the source 16-byte aligned.<br>
+ */<br>
+static inline void *<br>
+rgba8_copy_aligned_src(void *dst, const void *src, size_t bytes)<br>
+{<br>
+   uint8_t *d = dst;<br>
+   uint8_t const *s = src;<br>
+<br>
+#ifdef __SSSE3__<br>
+   if (bytes == 16) {<br>
+      assert(!(((uintptr_t)src) & 0xf));<br>
+      rgba8_copy_16_aligned_src(d+ 0, s+ 0);<br>
+      return dst;<br>
+   }<br>
+<br>
+   if (bytes == 64) {<br>
+      assert(!(((uintptr_t)src) & 0xf));<br>
+      rgba8_copy_16_aligned_src(d+ 0, s+ 0);<br>
+      rgba8_copy_16_aligned_src(d+16, s+16);<br>
+      rgba8_copy_16_aligned_src(d+32, s+32);<br>
+      rgba8_copy_16_aligned_src(d+48, s+48);<br>
       return dst;<br>
    }<br>
 #endif<br>
@@ -357,16 +394,26 @@ linear_to_xtiled_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,<br>
       if (mem_copy == memcpy)<br>
          return linear_to_xtiled(0, 0, xtile_width, xtile_width, 0, xtile_height,<br>
                                  dst, src, src_pitch, swizzle_bit, memcpy);<br>
-      else if (mem_copy == rgba8_copy)<br>
+      else if (mem_copy == rgba8_copy_aligned_dst)<br>
+         return linear_to_xtiled(0, 0, xtile_width, xtile_width, 0, xtile_height,<br>
+                                 dst, src, src_pitch, swizzle_bit,<br>
+                                 rgba8_copy_aligned_dst);<br>
+      else if (mem_copy == rgba8_copy_aligned_src)<br>
          return linear_to_xtiled(0, 0, xtile_width, xtile_width, 0, xtile_height,<br>
-                                 dst, src, src_pitch, swizzle_bit, rgba8_copy);<br>
+                                 dst, src, src_pitch, swizzle_bit,<br>
+                                 rgba8_copy_aligned_src);<br></blockquote><div><br></div><div>linear_to_xtiled will always be called with an aligned destination so you only need the one case.<br></div><div> </div><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">
    } else {<br>
       if (mem_copy == memcpy)<br>
          return linear_to_xtiled(x0, x1, x2, x3, y0, y1,<br>
                                  dst, src, src_pitch, swizzle_bit, memcpy);<br>
-      else if (mem_copy == rgba8_copy)<br>
+      else if (mem_copy == rgba8_copy_aligned_dst)<br>
          return linear_to_xtiled(x0, x1, x2, x3, y0, y1,<br>
-                                 dst, src, src_pitch, swizzle_bit, rgba8_copy);<br>
+                                 dst, src, src_pitch, swizzle_bit,<br>
+                                 rgba8_copy_aligned_dst);<br>
+      else if (mem_copy == rgba8_copy_aligned_src)<br>
+         return linear_to_xtiled(x0, x1, x2, x3, y0, y1,<br>
+                                 dst, src, src_pitch, swizzle_bit,<br>
+                                 rgba8_copy_aligned_src);<br>
    }<br>
    linear_to_xtiled(x0, x1, x2, x3, y0, y1,<br>
                     dst, src, src_pitch, swizzle_bit, mem_copy);<br>
@@ -393,16 +440,26 @@ linear_to_ytiled_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,<br>
       if (mem_copy == memcpy)<br>
          return linear_to_ytiled(0, 0, ytile_width, ytile_width, 0, ytile_height,<br>
                                  dst, src, src_pitch, swizzle_bit, memcpy);<br>
-      else if (mem_copy == rgba8_copy)<br>
+      else if (mem_copy == rgba8_copy_aligned_dst)<br>
+         return linear_to_ytiled(0, 0, ytile_width, ytile_width, 0, ytile_height,<br>
+                                 dst, src, src_pitch, swizzle_bit,<br>
+                                 rgba8_copy_aligned_dst);<br>
+      else if (mem_copy == rgba8_copy_aligned_src)<br>
          return linear_to_ytiled(0, 0, ytile_width, ytile_width, 0, ytile_height,<br>
-                                 dst, src, src_pitch, swizzle_bit, rgba8_copy);<br>
+                                 dst, src, src_pitch, swizzle_bit,<br>
+                                 rgba8_copy_aligned_src);<br></blockquote><div><br></div><div>Same for linear_to_ytiled<br></div><div> </div><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">
    } else {<br>
       if (mem_copy == memcpy)<br>
          return linear_to_ytiled(x0, x1, x2, x3, y0, y1,<br>
                                  dst, src, src_pitch, swizzle_bit, memcpy);<br>
-      else if (mem_copy == rgba8_copy)<br>
+      else if (mem_copy == rgba8_copy_aligned_dst)<br>
          return linear_to_ytiled(x0, x1, x2, x3, y0, y1,<br>
-                                 dst, src, src_pitch, swizzle_bit, rgba8_copy);<br>
+                                 dst, src, src_pitch, swizzle_bit,<br>
+                                 rgba8_copy_aligned_dst);<br>
+      else if (mem_copy == rgba8_copy_aligned_src)<br>
+         return linear_to_ytiled(x0, x1, x2, x3, y0, y1,<br>
+                                 dst, src, src_pitch, swizzle_bit,<br>
+                                 rgba8_copy_aligned_src);<br>
    }<br>
    linear_to_ytiled(x0, x1, x2, x3, y0, y1,<br>
                     dst, src, src_pitch, swizzle_bit, mem_copy);<br>
@@ -429,16 +486,26 @@ xtiled_to_linear_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,<br>
       if (mem_copy == memcpy)<br>
          return xtiled_to_linear(0, 0, xtile_width, xtile_width, 0, xtile_height,<br>
                                  dst, src, dst_pitch, swizzle_bit, memcpy);<br>
-      else if (mem_copy == rgba8_copy)<br>
+      else if (mem_copy == rgba8_copy_aligned_dst)<br>
+         return xtiled_to_linear(0, 0, xtile_width, xtile_width, 0, xtile_height,<br>
+                                 dst, src, dst_pitch, swizzle_bit,<br>
+                                 rgba8_copy_aligned_dst);<br>
+      else if (mem_copy == rgba8_copy_aligned_src)<br>
          return xtiled_to_linear(0, 0, xtile_width, xtile_width, 0, xtile_height,<br>
-                                 dst, src, dst_pitch, swizzle_bit, rgba8_copy);<br>
+                                 dst, src, dst_pitch, swizzle_bit,<br>
+                                 rgba8_copy_aligned_src);<br>
    } else {<br>
       if (mem_copy == memcpy)<br>
          return xtiled_to_linear(x0, x1, x2, x3, y0, y1,<br>
                                  dst, src, dst_pitch, swizzle_bit, memcpy);<br>
-      else if (mem_copy == rgba8_copy)<br>
+      else if (mem_copy == rgba8_copy_aligned_dst)<br>
+         return xtiled_to_linear(x0, x1, x2, x3, y0, y1,<br>
+                                 dst, src, dst_pitch, swizzle_bit,<br>
+                                 rgba8_copy_aligned_dst);<br>
+      else if (mem_copy == rgba8_copy_aligned_src)<br>
          return xtiled_to_linear(x0, x1, x2, x3, y0, y1,<br>
-                                 dst, src, dst_pitch, swizzle_bit, rgba8_copy);<br>
+                                 dst, src, dst_pitch, swizzle_bit,<br>
+                                 rgba8_copy_aligned_src); </blockquote><div><br></div><div>xtiled_to_linear will only ever be called with an aligned source<br></div><div> </div><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">
    }<br>
    xtiled_to_linear(x0, x1, x2, x3, y0, y1,<br>
                     dst, src, dst_pitch, swizzle_bit, mem_copy);<br>
@@ -465,16 +532,26 @@ ytiled_to_linear_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,<br>
       if (mem_copy == memcpy)<br>
          return ytiled_to_linear(0, 0, ytile_width, ytile_width, 0, ytile_height,<br>
                                  dst, src, dst_pitch, swizzle_bit, memcpy);<br>
-      else if (mem_copy == rgba8_copy)<br>
+      else if (mem_copy == rgba8_copy_aligned_dst)<br>
          return ytiled_to_linear(0, 0, ytile_width, ytile_width, 0, ytile_height,<br>
-                                 dst, src, dst_pitch, swizzle_bit, rgba8_copy);<br>
+                                 dst, src, dst_pitch, swizzle_bit,<br>
+                                 rgba8_copy_aligned_dst);<br>
+      else if (mem_copy == rgba8_copy_aligned_src)<br>
+         return ytiled_to_linear(0, 0, ytile_width, ytile_width, 0, ytile_height,<br>
+                                 dst, src, dst_pitch, swizzle_bit,<br>
+                                 rgba8_copy_aligned_src);<br></blockquote><div><br></div><div>same for ytiled_to_linear.<br><br></div><div>Kill off the uneeded cases and fix the typo and this patch is<br><br></div><div>Reviewed-by: Jason Ekstrand <<a href="mailto:jason.ekstrand@intel.com">jason.ekstrand@intel.com</a>><br></div><div> </div><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">
    } else {<br>
       if (mem_copy == memcpy)<br>
          return ytiled_to_linear(x0, x1, x2, x3, y0, y1,<br>
                                  dst, src, dst_pitch, swizzle_bit, memcpy);<br>
-      else if (mem_copy == rgba8_copy)<br>
+      else if (mem_copy == rgba8_copy_aligned_dst)<br>
+         return ytiled_to_linear(x0, x1, x2, x3, y0, y1,<br>
+                                 dst, src, dst_pitch, swizzle_bit,<br>
+                                 rgba8_copy_aligned_dst);<br>
+      else if (mem_copy == rgba8_copy_aligned_src)<br>
          return ytiled_to_linear(x0, x1, x2, x3, y0, y1,<br>
-                                 dst, src, dst_pitch, swizzle_bit, rgba8_copy);<br>
+                                 dst, src, dst_pitch, swizzle_bit,<br>
+                                 rgba8_copy_aligned_src);<br>
    }<br>
    ytiled_to_linear(x0, x1, x2, x3, y0, y1,<br>
                     dst, src, dst_pitch, swizzle_bit, mem_copy);<br>
@@ -684,7 +761,8 @@ tiled_to_linear(uint32_t xt1, uint32_t xt2,<br>
  * \return true if the format and type combination are valid<br>
  */<br>
 bool intel_get_memcpy(mesa_format tiledFormat, GLenum format,<br>
-                      GLenum type, mem_copy_fn* mem_copy, uint32_t* cpp)<br>
+                      GLenum type, mem_copy_fn *mem_copy, uint32_t *cpp,<br>
+                      enum intel_memcpy_direction direction)<br>
 {<br>
    if (type == GL_UNSIGNED_INT_8_8_8_8_REV &&<br>
        !(format == GL_RGBA || format == GL_BGRA))<br>
@@ -700,7 +778,8 @@ bool intel_get_memcpy(mesa_format tiledFormat, GLenum format,<br>
       if (format == GL_BGRA) {<br>
          *mem_copy = memcpy;<br>
       } else if (format == GL_RGBA) {<br>
-         *mem_copy = rgba8_copy;<br>
+         *mem_copy = direction == INTEL_UPLOAD ? rgba8_copy_aligned_dst<br>
+                                               : rgba8_copy_aligned_src;<br>
       }<br>
    } else if ((tiledFormat == MESA_FORMAT_R8G8B8A8_UNORM) ||<br>
               (tiledFormat == MESA_FORMAT_R8G8B8X8_UNORM)) {<br>
@@ -709,7 +788,8 @@ bool intel_get_memcpy(mesa_format tiledFormat, GLenum format,<br>
          /* Copying from RGBA to BGRA is the same as BGRA to RGBA so we can<br>
           * use the same function.<br>
           */<br>
-         *mem_copy = rgba8_copy;<br>
+         *mem_copy = direction == INTEL_UPLOAD ? rgba8_copy_aligned_dst<br>
+                                               : rgba8_copy_aligned_src;<br>
       } else if (format == GL_RGBA) {<br>
          *mem_copy = memcpy;<br>
       }<br>
diff --git a/src/mesa/drivers/dri/i965/intel_tiled_memcpy.h b/src/mesa/drivers/dri/i965/intel_tiled_memcpy.h<br>
index 3ff0d71..9dc1088 100644<br>
--- a/src/mesa/drivers/dri/i965/intel_tiled_memcpy.h<br>
+++ b/src/mesa/drivers/dri/i965/intel_tiled_memcpy.h<br>
@@ -55,7 +55,20 @@ tiled_to_linear(uint32_t xt1, uint32_t xt2,<br>
                 uint32_t tiling,<br>
                 mem_copy_fn mem_copy);<br>
<br>
+/* Tells intel_get_memcpy() whether the memcpy() is<br>
+ *<br>
+ *  - an upload to the GPU with an aligned destination and a potentially<br>
+ *    unaligned source; or<br>
+ *  - a download from the GPU with an aligned source and a potentially<br>
+ *    unaligned destination.<br>
+ */<br>
+enum intel_memcpy_direction {<br>
+   INTEL_UPLOAD,<br>
+   INTEL_DOWNLOAD<br>
+};<br>
+<br>
 bool intel_get_memcpy(mesa_format tiledFormat, GLenum format,<br>
-                      GLenum type, mem_copy_fn* mem_copy, uint32_t* cpp);<br>
+                      GLenum type, mem_copy_fn *mem_copy, uint32_t *cpp,<br>
+                      enum intel_memcpy_direction direction);<br>
<br>
 #endif /* INTEL_TILED_MEMCPY */<br>
<span class="HOEnZb"><font color="#888888">--<br>
2.0.5<br>
<br>
</font></span></blockquote></div><br></div></div>