Mesa (master): intel: Improve teximage perf for Google Chrome paint rects (v3)

Tue Sep 25 18:03:29 UTC 2012

Module: Mesa
Branch: master
Commit: 413c4914129cd26ca87960852d8c0264c0fb29e7
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=413c4914129cd26ca87960852d8c0264c0fb29e7

Author: Chad Versace <chad.versace at linux.intel.com>
Date:   Tue Sep  4 12:15:29 2012 -0700

intel: Improve teximage perf for Google Chrome paint rects (v3)

This patch reduces the time spent in glTexImage and glTexSubImage by
over 5x on Sandybridge for the workload described below.

It adds a new fast path for glTexImage2D and glTexSubImage2D,
intel_texsubimage_tiled_memcpy, which is optimized for Google Chrome's
paint rectangles. The fast path is implemented only for 2D GL_BGRA
textures for chipsets with a LLC.

=== Performance Analysis ===

Workload description:

    Personalize your google.com page with a wallpaper.  Start chromium
with flags "--ignore-gpu-blacklist --enable-accelerated-painting
--force-compositing-mode".  Start recording with chrome://tracing. Visit
google.com and wait for page to finish rendering.  Measure the time spent
by process CrGpuMain in GLES2DecoderImpl::HandleTexImage2D and
HandleTexSubImage2D.

System config:

    cpu: Sandybridge Mobile GT2+ (0x0126)
    kernel 3.4.9 x86_64
    chromium 21.0.1180.89 (154005)

Statistics:

                  | N   Median  Avg   Stddev
    --------------|-------------------------
    before (msec) | 8   472.5  463.75 72.6
    after  (msec) | 8    78.0   79.6   5.7

    Arithmetic difference at 95.0% confidence:
       -384.1  +/- 55.2 msec
        -82.8% +/- 11.9%

    Ratio at 95.0% confidence:
          5.81 +/- 0.119

v2:
    - Replace check for `intel->gen >= 6` with `intel->has_llc`, per
      danvet.
    - Fix typo in comment, s/throuh/through/.
    - Swap 'before' and 'after' rows in stat table.

v3:
    - If the current batch references the bo, then flush batch before mapping
      the bo. Found by Chris.
    - Restrict supported texture images to level 0 of target
      GL_TEXTURE_2D. This avoids an arithmetic bug in calculating image
      offsets within the miptree, found by Paul. This restriction does not
      diminish this patch's benefit to Chrome OS performance.
    - Use fewer instructions for bit6 swizzling, suggested by Paul.
    - Remove erroneous comment about Y-tiling, for Paul.
    - Print perf_debug messages when flushing and stalling.
    - Update stats in commit message; run workload under a release build
      rather than a debug build.

Note: This is a candidate for the 9.0 branch.
Acked-by: Eric Anholt <eric at anholt.net>
CC: Stéphane Marchesin <marcheu at chromium.org>
Signed-off-by: Chad Versace <chad.versace at linux.intel.com>

---

 src/mesa/drivers/dri/intel/intel_tex.h          |   11 ++
 src/mesa/drivers/dri/intel/intel_tex_image.c    |   12 ++
 src/mesa/drivers/dri/intel/intel_tex_subimage.c |  163 +++++++++++++++++++++++
 3 files changed, 186 insertions(+), 0 deletions(-)

diff --git a/src/mesa/drivers/dri/intel/intel_tex.h b/src/mesa/drivers/dri/intel/intel_tex.h
index 88a7d55..777574d 100644
--- a/src/mesa/drivers/dri/intel/intel_tex.h
+++ b/src/mesa/drivers/dri/intel/intel_tex.h
@@ -85,4 +85,15 @@ bool intel_copy_texsubimage(struct intel_context *intel,
                             GLint x, GLint y,
                             GLsizei width, GLsizei height);
 
+bool
+intel_texsubimage_tiled_memcpy(struct gl_context *ctx,
+                               GLuint dims,
+                               struct gl_texture_image *texImage,
+                               GLint xoffset, GLint yoffset, GLint zoffset,
+                               GLsizei width, GLsizei height, GLsizei depth,
+                               GLenum format, GLenum type,
+                               const GLvoid *pixels,
+                               const struct gl_pixelstore_attrib *packing,
+                               bool for_glTexImage);
+
 #endif
diff --git a/src/mesa/drivers/dri/intel/intel_tex_image.c b/src/mesa/drivers/dri/intel/intel_tex_image.c
index a08a5a2..7b9638f 100644
--- a/src/mesa/drivers/dri/intel/intel_tex_image.c
+++ b/src/mesa/drivers/dri/intel/intel_tex_image.c
@@ -206,10 +206,22 @@ intelTexImage(struct gl_context * ctx,
               GLenum format, GLenum type, const void *pixels,
               const struct gl_pixelstore_attrib *unpack)
 {
+   bool ok;
+
    DBG("%s target %s level %d %dx%dx%d\n", __FUNCTION__,
        _mesa_lookup_enum_by_nr(texImage->TexObject->Target),
        texImage->Level, texImage->Width, texImage->Height, texImage->Depth);
 
+   ok = intel_texsubimage_tiled_memcpy(ctx, dims, texImage,
+                                       0, 0, 0, /*x,y,z offsets*/
+                                       texImage->Width,
+                                       texImage->Height,
+                                       texImage->Depth,
+                                       format, type, pixels, unpack,
+                                       true /*for_glTexImage*/);
+   if (ok)
+      return;
+
    /* Attempt to use the blitter for PBO image uploads.
     */
    if (dims <= 2 &&
diff --git a/src/mesa/drivers/dri/intel/intel_tex_subimage.c b/src/mesa/drivers/dri/intel/intel_tex_subimage.c
index ae4b3bc..d3a8736 100644
--- a/src/mesa/drivers/dri/intel/intel_tex_subimage.c
+++ b/src/mesa/drivers/dri/intel/intel_tex_subimage.c
@@ -26,6 +26,7 @@
  * 
  **************************************************************************/
 
+#include "main/macros.h"
 #include "main/mtypes.h"
 #include "main/pbo.h"
 #include "main/texobj.h"
@@ -33,6 +34,7 @@
 #include "main/texcompress.h"
 #include "main/enums.h"
 
+#include "intel_batchbuffer.h"
 #include "intel_context.h"
 #include "intel_tex.h"
 #include "intel_mipmap_tree.h"
@@ -148,6 +150,157 @@ intel_blit_texsubimage(struct gl_context * ctx,
    return true;
 }
 
+/**
+ * \brief A fast path for glTexImage and glTexSubImage.
+ * \param for_glTexImage True for glTexImage, false for glTexSubImage.
+ * \return True if the pixels were copied here; false if nothing was copied.
+ *
+ * This fast path is taken when the hardware natively supports the texture
+ * format (such as GL_BGRA) and when the texture memory is X-tiled. It uploads
+ * the texture data by mapping the texture memory without a GTT fence, thus
+ * acquiring a tiled view of the memory, and then memcpy'ing successive
+ * subspans within each tile.
+ *
+ * This is a performance win over the conventional texture upload path because
+ * it avoids the performance penalty of writing through the write-combine
+ * buffer. In the conventional texture upload path,
+ * texstore.c:store_texsubimage(), the texture memory is mapped through a GTT
+ * fence, thus acquiring a linear view of the memory, then each row in the
+ * image is memcpy'd. In this fast path, we replace each row's memcpy with
+ * a sequence of memcpy's over each bit6 swizzle span in the row.
+ *
+ * This fast path's use case is Google Chrome's paint rectangles.  Chrome (as
+ * of version 21) renders each page as a tiling of 256x256 GL_BGRA textures.
+ * Each page's content is initially uploaded with glTexImage2D and damaged
+ * regions are updated with glTexSubImage2D. On some workloads, the
+ * performance gain of this fastpath on Sandybridge is over 5x.
+ */
+bool
+intel_texsubimage_tiled_memcpy(struct gl_context * ctx,
+                               GLuint dims,
+                               struct gl_texture_image *texImage,
+                               GLint xoffset, GLint yoffset, GLint zoffset,
+                               GLsizei width, GLsizei height, GLsizei depth,
+                               GLenum format, GLenum type,
+                               const GLvoid *pixels,
+                               const struct gl_pixelstore_attrib *packing,
+                               bool for_glTexImage)
+{
+   struct intel_context *intel = intel_context(ctx);
+   struct intel_texture_image *image = intel_texture_image(texImage);
+
+   /* The miptree's buffer. */
+   drm_intel_bo *bo;
+
+   int error = 0;
+
+   /* This fastpath is restricted to level 0 of a 2D BGRA texture; it could be
+    * generalized by varying the arithmetic loop below. NOTE(review): Alignment
+    * is the only unpack field checked, yet the loop reads 'pixels' as tightly
+    * packed rows -- confirm other unpack state (and PBOs) cannot reach here.
+    */
+   if (!intel->has_llc ||
+       format != GL_BGRA ||
+       type != GL_UNSIGNED_BYTE ||
+       texImage->TexObject->Target != GL_TEXTURE_2D ||
+       texImage->Level != 0 ||
+       pixels == NULL ||
+       packing->Alignment > 4)
+      return false;
+
+   if (for_glTexImage)
+      ctx->Driver.AllocTextureImageBuffer(ctx, texImage);
+
+   if (!image->mt ||
+       image->mt->region->tiling != I915_TILING_X) {
+      /* The algorithm below is written only for X-tiled memory. */
+      return false;
+   }
+
+   bo = image->mt->region->bo;
+
+   if (drm_intel_bo_references(intel->batch.bo, bo)) {
+      perf_debug("Flushing before mapping a referenced bo.\n");
+      intel_batchbuffer_flush(intel);
+   }
+
+   if (unlikely(INTEL_DEBUG & DEBUG_PERF)) {
+      if (drm_intel_bo_busy(bo)) {
+         perf_debug("Mapping a busy BO, causing a stall on the GPU.\n");
+      }
+   }
+
+   error = drm_intel_bo_map(bo, true /*write_enable*/);
+   if (error || bo->virtual == NULL) {
+      DBG("%s: failed to map bo\n", __FUNCTION__);
+      return false;
+   }
+
+   /* We postponed printing this message until having committed to executing
+    * the function.
+    */
+   DBG("%s: level=%d offset=(%d,%d) (w,h)=(%d,%d)\n",
+       __FUNCTION__, texImage->Level, xoffset, yoffset, width, height);
+
+   /* In the tiling algorithm below, some variables are in units of pixels,
+    * others are in units of bytes, and others (such as height) are unitless.
+    * Each variable name is suffixed with its units.
+    */
+
+   const uint32_t x_max_pixels = xoffset + width;
+   const uint32_t y_max_pixels = yoffset + height;
+
+   const uint32_t tile_size_bytes = 4096;
+
+   const uint32_t tile_width_bytes = 512;
+   const uint32_t tile_width_pixels = 128;
+
+   const uint32_t tile_height = 8; /* rows per tile: 4096 / 512 */
+
+   const uint32_t cpp = 4; /* chars per pixel of GL_BGRA */
+   const uint32_t swizzle_width_pixels = 16; /* 16 pixels * cpp = 64 bytes */
+
+   const uint32_t stride_bytes = image->mt->region->pitch * cpp; /* presumably pitch is in pixels -- confirm */
+   const uint32_t width_tiles = stride_bytes / tile_width_bytes;
+
+   for (uint32_t y_pixels = yoffset; y_pixels < y_max_pixels; ++y_pixels) {
+      const uint32_t y_offset_bytes = (y_pixels / tile_height) * width_tiles * tile_size_bytes
+                                    + (y_pixels % tile_height) * tile_width_bytes;
+
+      for (uint32_t x_pixels = xoffset; x_pixels < x_max_pixels; x_pixels += swizzle_width_pixels) {
+         const uint32_t x_offset_bytes = (x_pixels / tile_width_pixels) * tile_size_bytes
+                                       + (x_pixels % tile_width_pixels) * cpp;
+
+         intptr_t offset_bytes = y_offset_bytes + x_offset_bytes;
+         if (intel->has_swizzling) {
+#if 0
+            /* Clear, unoptimized version. */
+            bool bit6 = (offset_bytes >> 6) & 1;
+            bool bit9 = (offset_bytes >> 9) & 1;
+            bool bit10 = (offset_bytes >> 10) & 1;
+
+            if (bit9 ^ bit10)
+               offset_bytes ^= (1 << 6);
+#else
+            /* Optimized, obfuscated version: bit6 ^= (bit9 ^ bit10). */
+            offset_bytes ^= ((offset_bytes >> 3) ^ (offset_bytes >> 4))
+                          & (1 << 6);
+#endif
+         }
+
+         const uint32_t swizzle_bound_pixels = ALIGN(x_pixels + 1, swizzle_width_pixels);
+         const uint32_t memcpy_bound_pixels = MIN2(x_max_pixels, swizzle_bound_pixels);
+         const uint32_t copy_size = cpp * (memcpy_bound_pixels - x_pixels);
+
+         memcpy(bo->virtual + offset_bytes, pixels, copy_size);
+         pixels += copy_size;
+         x_pixels -= (x_pixels % swizzle_width_pixels); /* realign so += lands on the next span */
+      }
+   }
+
+   drm_intel_bo_unmap(bo);
+   return true;
+}
+
 static void
 intelTexSubImage(struct gl_context * ctx,
                  GLuint dims,
@@ -158,6 +311,16 @@ intelTexSubImage(struct gl_context * ctx,
                  const GLvoid * pixels,
                  const struct gl_pixelstore_attrib *packing)
 {
+   bool ok;
+
+   ok = intel_texsubimage_tiled_memcpy(ctx, dims, texImage,
+                                       xoffset, yoffset, zoffset,
+                                       width, height, depth,
+                                       format, type, pixels, packing,
+                                       false /*for_glTexImage*/);
+   if (ok)
+     return;
+
    /* The intel_blit_texsubimage() function only handles 2D images */
    if (dims != 2 || !intel_blit_texsubimage(ctx, texImage,
 			       xoffset, yoffset,