[Mesa-dev] [PATCH 09/18] i965: Enable GPU snooping of CPU caches for select buffers

Chris Wilson chris at chris-wilson.co.uk
Mon Jul 6 03:33:14 PDT 2015


On LLC, all buffers are normally cache coherent between the CPU and the
GPU, giving both parties fast access to shared data.

However, older architectures, such as Atom, do not implement LLC between the
CPU and GPU. Instead they utilise a snooping architecture where the GPU
can snoop the CPU cache when told. The snooping has much higher overhead
(i.e. slower) than regular memory fetches so its use should be reserved
to instances where the data likely resides in the CPU cache and the GPU
need only use it once (e.g. for streaming textures from the CPU). The
other major benefit is that it can also push data to the CPU in a cache
coherent fashion, and so CPU access to a snooped buffer is very fast,
making it preferable whenever we need to read back data from the GPU.

For demonstration we set the snoop flag for reading back via a blit from
a miptree, and for reading back transform feedback.
---
 src/mesa/drivers/dri/i965/brw_batch.c         | 31 +++++++++++++++++++++++++++
 src/mesa/drivers/dri/i965/brw_batch.h         |  9 ++++++++
 src/mesa/drivers/dri/i965/brw_context.h       |  1 +
 src/mesa/drivers/dri/i965/gen6_sol.c          |  3 ++-
 src/mesa/drivers/dri/i965/gen7_sol_state.c    |  2 +-
 src/mesa/drivers/dri/i965/intel_mipmap_tree.c |  1 +
 6 files changed, 45 insertions(+), 2 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_batch.c b/src/mesa/drivers/dri/i965/brw_batch.c
index bfff9fe..e01a0c4 100644
--- a/src/mesa/drivers/dri/i965/brw_batch.c
+++ b/src/mesa/drivers/dri/i965/brw_batch.c
@@ -1323,6 +1323,37 @@ struct brw_bo *brw_bo_create(struct brw_batch *batch,
    return bo;
 }
 
+static bool __brw_bo_set_caching(struct brw_bo *bo, int caching)
+{
+   struct drm_i915_gem_caching arg;
+
+   memset(&arg, 0, sizeof(arg));
+   arg.handle = bo->handle;
+   arg.caching = caching;
+   return drmIoctl(bo->batch->fd, DRM_IOCTL_I915_GEM_SET_CACHING, &arg) == 0;
+}
+
+void brw_bo_enable_snoop(struct brw_bo *bo)
+{
+   assert(bo->reusable);
+
+   if (bo->cache_coherent)
+      return;
+
+   if (bo->tiling)
+      return; /* XXX abort? */
+
+   if (!__brw_bo_set_caching(bo, I915_CACHING_CACHED))
+      return;
+
+   drm_intel_bo_disable_reuse(bo->base);
+   if (bo->reusable)
+      list_move(&bo->link, &bo->batch->inactive);
+
+   bo->reusable = false;
+   bo->cache_coherent = true;
+}
+
 static uint64_t brw_surface_size(int cpp,
 				 uint32_t width,
 				 uint32_t height,
diff --git a/src/mesa/drivers/dri/i965/brw_batch.h b/src/mesa/drivers/dri/i965/brw_batch.h
index 5b56e82..3628b03 100644
--- a/src/mesa/drivers/dri/i965/brw_batch.h
+++ b/src/mesa/drivers/dri/i965/brw_batch.h
@@ -220,6 +220,15 @@ struct brw_bo *brw_bo_create_from_name(struct brw_batch *batch,
 				       const char *name,
 				       uint32_t global_name);
 
+/* Enable CPU cache coherency for the buffer. On LLC, normally all buffers
+ * are cache coherent, but on non-LLC architectures we can tell the GPU
+ * to snoop from and to flush into the CPU cache. Performing the snoop
+ * is slower for the GPU, but eliminates the uncached penalty from the CPU,
+ * so it is only useful for streaming data (read once) to the GPU or when
+ * we need to read anything back from the GPU.
+ */
+void brw_bo_enable_snoop(struct brw_bo *bo);
+
 void brw_bo_mark_dirty(struct brw_batch *batch, struct brw_bo *bo);
 
 inline static int brw_bo_madvise(struct brw_bo *bo, int state)
diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h
index 63f4f87..3aa003c 100644
--- a/src/mesa/drivers/dri/i965/brw_context.h
+++ b/src/mesa/drivers/dri/i965/brw_context.h
@@ -872,6 +872,7 @@ struct brw_transform_feedback_object {
     * Count of primitives generated during this transform feedback operation.
     *  @{
     */
+#define BRW_XFB_BO_SIZE (16<<10)
    uint64_t prims_generated[BRW_MAX_XFB_STREAMS];
    struct brw_bo *prim_count_bo;
    unsigned prim_count_buffer_index; /**< in number of uint64_t units */
diff --git a/src/mesa/drivers/dri/i965/gen6_sol.c b/src/mesa/drivers/dri/i965/gen6_sol.c
index 39bf8e9..037aad0 100644
--- a/src/mesa/drivers/dri/i965/gen6_sol.c
+++ b/src/mesa/drivers/dri/i965/gen6_sol.c
@@ -206,7 +206,8 @@ brw_new_transform_feedback(struct gl_context *ctx, GLuint name)
    brw_obj->offset_bo =
       brw_bo_create(&brw->batch, "transform feedback offsets", 16, 64, 0);
    brw_obj->prim_count_bo =
-      brw_bo_create(&brw->batch, "xfb primitive counts", 4096, 64, 0);
+      brw_bo_create(&brw->batch, "xfb primitive counts", BRW_XFB_BO_SIZE, 64,0);
+   brw_bo_enable_snoop(brw_obj->prim_count_bo);
 
    return &brw_obj->base;
 }
diff --git a/src/mesa/drivers/dri/i965/gen7_sol_state.c b/src/mesa/drivers/dri/i965/gen7_sol_state.c
index ff6678e..857ebe5 100644
--- a/src/mesa/drivers/dri/i965/gen7_sol_state.c
+++ b/src/mesa/drivers/dri/i965/gen7_sol_state.c
@@ -350,7 +350,7 @@ gen7_save_primitives_written_counters(struct brw_context *brw,
 
    /* Check if there's enough space for a new pair of four values. */
    if (obj->prim_count_bo != NULL &&
-       obj->prim_count_buffer_index + 2 * streams >= 4096 / sizeof(uint64_t)) {
+       obj->prim_count_buffer_index + 2 * streams >= BRW_XFB_BO_SIZE / sizeof(uint64_t)) {
       /* Gather up the results so far and release the BO. */
       gen7_tally_prims_generated(brw, obj);
    }
diff --git a/src/mesa/drivers/dri/i965/intel_mipmap_tree.c b/src/mesa/drivers/dri/i965/intel_mipmap_tree.c
index f9dc74f..d3a9152 100644
--- a/src/mesa/drivers/dri/i965/intel_mipmap_tree.c
+++ b/src/mesa/drivers/dri/i965/intel_mipmap_tree.c
@@ -2139,6 +2139,7 @@ intel_miptree_map_blit(struct brw_context *brw,
       goto fail;
    }
    map->stride = map->mt->pitch;
+   brw_bo_enable_snoop(map->mt->bo);
 
    /* One of either READ_BIT or WRITE_BIT or both is set.  READ_BIT implies no
     * INVALIDATE_RANGE_BIT.  WRITE_BIT needs the original values read in unless
-- 
2.1.4



More information about the mesa-dev mailing list