[Mesa-dev] [PATCH 3/5] i965: Use blorp+userptr for GPU readback

Chris Wilson chris at chris-wilson.co.uk
Sun Jun 10 12:00:32 UTC 2018


The primary benefit of this is that we get format conversion for
"free", along with detiling and cache flushing (most relevant for !llc).
Using the GPU does impose a bandwidth cost that is presumably better
used for rendering, hence we limit the use to readback into client
memory (not pbo) where we would need to stall on the GPU anyway.
(Uploads remain direct/staged to avoid the synchronisation cost.)
And we only use the GPU path if a direct read into client memory from
video memory is unavailable.

The ultimate user of this is Xorg/glamor! On byt, bsw, bxt (and
presumably ilk, though that was not measured), x11perf -shmget500 improves
about 5-fold, but that is only equivalent to enabling movntdqa, which is the
preferred path. Though conversely the overhead of executing and waiting
upon an additional blorp batch is shown by x11perf -shmget10 being
reduced by a factor of 2. I think it is fair to presume that large copies
will dominate (and that the overhead of a single batch is something that
we can iteratively reduce, for the benefit of all). llc machines continue
to use direct access where there are no format changes (which one hopes is
the typical use case).

Opens:
- Is blorp missing a resolve?
glx/glx-copy-sub-buffer samples=16: pass fail
glx/glx-copy-sub-buffer samples=2: pass fail
glx/glx-copy-sub-buffer samples=4: pass fail
glx/glx-copy-sub-buffer samples=6: pass fail
glx/glx-copy-sub-buffer samples=8: pass fail
spec/!opengl 1.1/read-front clear-front-first samples=16: pass fail
spec/!opengl 1.1/read-front clear-front-first samples=2: pass fail
spec/!opengl 1.1/read-front clear-front-first samples=4: pass fail
spec/!opengl 1.1/read-front clear-front-first samples=6: pass fail
spec/!opengl 1.1/read-front clear-front-first samples=8: pass fail
spec/!opengl 1.1/read-front samples=16: pass fail
spec/!opengl 1.1/read-front samples=2: pass fail
spec/!opengl 1.1/read-front samples=4: pass fail
spec/!opengl 1.1/read-front samples=6: pass fail
spec/!opengl 1.1/read-front samples=8: pass fail

Cc: Jason Ekstrand <jason.ekstrand at intel.com>
Cc: Topi Pohjolainen <topi.pohjolainen at intel.com>
Cc: Kenneth Graunke <kenneth at whitecape.org>
---
 src/mesa/drivers/dri/i965/brw_blorp.c        | 74 ++++++++++++++------
 src/mesa/drivers/dri/i965/intel_pixel_read.c | 21 +++---
 src/mesa/drivers/dri/i965/intel_tex_image.c  | 27 ++++---
 3 files changed, 73 insertions(+), 49 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_blorp.c b/src/mesa/drivers/dri/i965/brw_blorp.c
index 8c6d77e1b7d..1e65217247b 100644
--- a/src/mesa/drivers/dri/i965/brw_blorp.c
+++ b/src/mesa/drivers/dri/i965/brw_blorp.c
@@ -36,6 +36,7 @@
 #include "brw_defines.h"
 #include "brw_meta_util.h"
 #include "brw_state.h"
+#include "intel_batchbuffer.h"
 #include "intel_buffer_objects.h"
 #include "intel_fbo.h"
 #include "common/gen_debug.h"
@@ -769,8 +770,11 @@ blorp_get_client_bo(struct brw_context *brw,
                     GLenum target, GLenum format, GLenum type,
                     const void *pixels,
                     const struct gl_pixelstore_attrib *packing,
-                    uint32_t *offset_out, uint32_t *row_stride_out,
-                    uint32_t *image_stride_out, bool read_only)
+                    bool read_only,
+                    uint32_t *offset_out,
+                    uint32_t *row_stride_out,
+                    uint32_t *image_stride_out,
+                    bool *sync_out)
 {
    /* Account for SKIP_PIXELS, SKIP_ROWS, ALIGNMENT, and SKIP_IMAGES */
    const GLuint dims = _mesa_get_texture_dimensions(target);
@@ -785,14 +789,15 @@ blorp_get_client_bo(struct brw_context *brw,
 
    *row_stride_out = stride;
    *image_stride_out = _mesa_image_image_stride(packing, w, h, format, type);
+   *sync_out = false;
 
-   if (_mesa_is_bufferobj(packing->BufferObj)) {
-      const uint32_t offset = first_pixel + (intptr_t)pixels;
-      if (!read_only && ((offset % cpp) || (stride % cpp))) {
-         perf_debug("Bad PBO alignment; fallback to CPU mapping\n");
-         return NULL;
-      }
+   const uintptr_t offset = first_pixel + (uintptr_t)pixels;
+   if (!read_only && (offset % cpp || stride % cpp)) {
+      perf_debug("Bad PBO alignment; fallback to CPU mapping\n");
+      return NULL;
+   }
 
+   if (_mesa_is_bufferobj(packing->BufferObj)) {
       /* This is a user-provided PBO. We just need to get the BO out */
       struct intel_buffer_object *intel_pbo =
          intel_buffer_object(packing->BufferObj);
@@ -807,13 +812,10 @@ blorp_get_client_bo(struct brw_context *brw,
 
       *offset_out = offset;
       return bo;
-   } else {
+   } else if (read_only) {
       /* Someone should have already checked that there is data to upload. */
       assert(pixels);
 
-      /* Creating a temp buffer currently only works for upload */
-      assert(read_only);
-
       /* This is not a user-provided PBO.  Instead, pixels is a pointer to CPU
        * data which we need to copy into a BO.
        */
@@ -826,7 +828,7 @@ blorp_get_client_bo(struct brw_context *brw,
          return NULL;
       }
 
-      if (brw_bo_subdata(bo, 0, size, pixels + first_pixel)) {
+      if (brw_bo_subdata(bo, 0, size, (void *)offset)) {
          perf_debug("intel_texsubimage: temp bo upload failed\n");
          brw_bo_unreference(bo);
          return NULL;
@@ -834,6 +836,23 @@ blorp_get_client_bo(struct brw_context *brw,
 
       *offset_out = 0;
       return bo;
+   } else if (brw->screen->kernel_features & KERNEL_ALLOWS_USERPTR) {
+      uintptr_t first_page = offset & -4096;
+      uintptr_t last_page = ALIGN((uintptr_t)(pixels + last_pixel), 4096);
+
+      struct brw_bo *bo =
+	 brw_bo_alloc_userptr(brw->bufmgr, "tex_subimage_userptr",
+			      (void *)first_page, last_page - first_page);
+      if (bo == NULL) {
+         perf_debug("intel_texsubimage: userptr mapping failed\n");
+         return NULL;
+      }
+
+      *offset_out = offset - first_page;
+      *sync_out = true;
+      return bo;
+   } else {
+      return NULL;
    }
 }
 
@@ -910,12 +929,15 @@ brw_blorp_upload_miptree(struct brw_context *brw,
    if (need_signed_unsigned_int_conversion(src_format, dst_format))
       return false;
 
+   bool src_sync;
    uint32_t src_offset, src_row_stride, src_image_stride;
    struct brw_bo *src_bo =
       blorp_get_client_bo(brw, width, height, depth,
-                          target, format, type, pixels, packing,
-                          &src_offset, &src_row_stride,
-                          &src_image_stride, true);
+                          target, format, type, pixels, packing, true,
+                          &src_offset,
+                          &src_row_stride,
+                          &src_image_stride,
+                          &src_sync);
    if (src_bo == NULL)
       return false;
 
@@ -977,6 +999,10 @@ brw_blorp_upload_miptree(struct brw_context *brw,
    result = true;
 
 err:
+   if (src_sync) {
+      intel_batchbuffer_flush(brw);
+      brw_bo_wait_rendering(src_bo);
+   }
    brw_bo_unreference(src_bo);
 
    return result;
@@ -1022,15 +1048,15 @@ brw_blorp_download_miptree(struct brw_context *brw,
       break;
    }
 
-   /* This pass only works for PBOs */
-   assert(_mesa_is_bufferobj(packing->BufferObj));
-
+   bool dst_sync;
    uint32_t dst_offset, dst_row_stride, dst_image_stride;
    struct brw_bo *dst_bo =
       blorp_get_client_bo(brw, width, height, depth,
-                          target, format, type, pixels, packing,
-                          &dst_offset, &dst_row_stride,
-                          &dst_image_stride, false);
+                          target, format, type, pixels, packing, false,
+                          &dst_offset,
+                          &dst_row_stride,
+                          &dst_image_stride,
+                          &dst_sync);
    if (dst_bo == NULL)
       return false;
 
@@ -1121,6 +1147,10 @@ brw_blorp_download_miptree(struct brw_context *brw,
    brw_emit_mi_flush(brw);
 
 err:
+   if (dst_sync) {
+      intel_batchbuffer_flush(brw);
+      brw_bo_wait_rendering(dst_bo);
+   }
    brw_bo_unreference(dst_bo);
 
    return result;
diff --git a/src/mesa/drivers/dri/i965/intel_pixel_read.c b/src/mesa/drivers/dri/i965/intel_pixel_read.c
index 6ed7895bc76..3dbb02d97fb 100644
--- a/src/mesa/drivers/dri/i965/intel_pixel_read.c
+++ b/src/mesa/drivers/dri/i965/intel_pixel_read.c
@@ -258,8 +258,6 @@ intelReadPixels(struct gl_context * ctx,
                 GLenum format, GLenum type,
                 const struct gl_pixelstore_attrib *pack, GLvoid * pixels)
 {
-   bool ok;
-
    struct brw_context *brw = brw_context(ctx);
    bool dirty;
 
@@ -272,19 +270,18 @@ intelReadPixels(struct gl_context * ctx,
    intel_prepare_render(brw);
    brw->front_buffer_dirty = dirty;
 
-   if (_mesa_is_bufferobj(pack->BufferObj)) {
-      if (intel_readpixels_blorp(ctx, x, y, width, height,
-                                 format, type, pixels, pack))
-         return;
-
-      perf_debug("%s: fallback to CPU mapping in PBO case\n", __func__);
-   }
+   if (!_mesa_is_bufferobj(pack->BufferObj) &&
+       intel_readpixels_tiled_memcpy(ctx, x, y, width, height,
+                                     format, type, pixels, pack))
+      return;
 
-   ok = intel_readpixels_tiled_memcpy(ctx, x, y, width, height,
-                                      format, type, pixels, pack);
-   if(ok)
+   if (intel_readpixels_blorp(ctx, x, y, width, height,
+                              format, type, pixels, pack))
       return;
 
+   perf_debug("%s: fallback to CPU mapping for %s\n", __func__,
+              _mesa_is_bufferobj(pack->BufferObj) ? "PBO" : "memory");
+
    /* Update Mesa state before calling _mesa_readpixels().
     * XXX this may not be needed since ReadPixels no longer uses the
     * span code.
diff --git a/src/mesa/drivers/dri/i965/intel_tex_image.c b/src/mesa/drivers/dri/i965/intel_tex_image.c
index fae179214dd..bfc9132caa5 100644
--- a/src/mesa/drivers/dri/i965/intel_tex_image.c
+++ b/src/mesa/drivers/dri/i965/intel_tex_image.c
@@ -818,27 +818,24 @@ intel_get_tex_sub_image(struct gl_context *ctx,
                         struct gl_texture_image *texImage)
 {
    struct brw_context *brw = brw_context(ctx);
-   bool ok;
 
    DBG("%s\n", __func__);
 
-   if (_mesa_is_bufferobj(ctx->Pack.BufferObj)) {
-      if (intel_gettexsubimage_blorp(brw, texImage,
-                                     xoffset, yoffset, zoffset,
-                                     width, height, depth, format, type,
-                                     pixels, &ctx->Pack))
-         return;
-
-      perf_debug("%s: fallback to CPU mapping in PBO case\n", __func__);
-   }
-
-   ok = intel_gettexsubimage_tiled_memcpy(ctx, texImage, xoffset, yoffset,
-                                          width, height,
-                                          format, type, pixels, &ctx->Pack);
+   if (!_mesa_is_bufferobj(ctx->Pack.BufferObj) &&
+       intel_gettexsubimage_tiled_memcpy(ctx, texImage, xoffset, yoffset,
+                                         width, height,
+                                         format, type, pixels, &ctx->Pack))
+      return;
 
-   if(ok)
+   if (intel_gettexsubimage_blorp(brw, texImage,
+                                  xoffset, yoffset, zoffset,
+                                  width, height, depth, format, type,
+                                  pixels, &ctx->Pack))
       return;
 
+   perf_debug("%s: fallback to CPU mapping for %s\n", __func__,
+              _mesa_is_bufferobj(ctx->Pack.BufferObj) ? "PBO" : "memory");
+
    _mesa_meta_GetTexSubImage(ctx, xoffset, yoffset, zoffset,
                              width, height, depth,
                              format, type, pixels, texImage);
-- 
2.17.1



More information about the mesa-dev mailing list