[Mesa-dev] [PATCH 3/4] i965: Use write-combine mappings where available

Chris Wilson chris at chris-wilson.co.uk
Wed Jul 12 09:36:32 UTC 2017


Quoting Kenneth Graunke (2017-07-12 08:22:24)
> From: Matt Turner <mattst88 at gmail.com>
> 
> Write-combine mappings give much better performance on writes than
> uncached access through the GTT.
> 
> Improves performance of GFXBench 4's gl_driver2 benchmark at 1024x768
> on Apollolake by 3.6086% +/- 0.674193% (n=15).
> 
> v2: (by Ken) Rebase on lockless mappings, map_count deletion, valgrind
>     updates, potential for CPU/WC maps failing, and other changes.
> 
> Reviewed-by: Kenneth Graunke <kenneth at whitecape.org>
> ---
>  src/mesa/drivers/dri/i965/brw_bufmgr.c | 61 ++++++++++++++++++++++++++++++++--
>  1 file changed, 59 insertions(+), 2 deletions(-)
> 
> diff --git a/src/mesa/drivers/dri/i965/brw_bufmgr.c b/src/mesa/drivers/dri/i965/brw_bufmgr.c
> index 62a818594f1..85ff3aeb0dd 100644
> --- a/src/mesa/drivers/dri/i965/brw_bufmgr.c
> +++ b/src/mesa/drivers/dri/i965/brw_bufmgr.c
> @@ -121,6 +121,7 @@ struct brw_bufmgr {
>     struct hash_table *handle_table;
>  
>     bool has_llc:1;
> +   bool has_mmap_wc:1;
>     bool bo_reuse:1;
>  };
>  
> @@ -742,8 +743,48 @@ brw_bo_map_cpu(struct brw_context *brw, struct brw_bo *bo, unsigned flags)
>  static void *
>  brw_bo_map_wc(struct brw_context *brw, struct brw_bo *bo, unsigned flags)
>  {
> -   /* Intentionally left blank */
> -   return NULL;
> +   struct brw_bufmgr *bufmgr = bo->bufmgr;
> +
> +   if (!bufmgr->has_mmap_wc)
> +      return NULL;
> +
> +   if (!bo->map_wc) {
> +      struct drm_i915_gem_mmap mmap_arg;
> +      void *map;
> +
> +      DBG("brw_bo_map_wc: %d (%s)\n", bo->gem_handle, bo->name);
> +
> +      memclear(mmap_arg);
> +      mmap_arg.handle = bo->gem_handle;
> +      mmap_arg.size = bo->size;
> +      mmap_arg.flags = I915_MMAP_WC;
> +      int ret = drmIoctl(bufmgr->fd, DRM_IOCTL_I915_GEM_MMAP, &mmap_arg);
> +      if (ret != 0) {
> +         ret = -errno;
> +         DBG("%s:%d: Error mapping buffer %d (%s): %s .\n",
> +             __FILE__, __LINE__, bo->gem_handle, bo->name, strerror(errno));
> +         return NULL;
> +      }
> +
> +      map = (void *) (uintptr_t) mmap_arg.addr_ptr;
> +      VG_DEFINED(map, bo->size);
> +
> +      if (p_atomic_cmpxchg(&bo->map_wc, NULL, map)) {
> +         VG_NOACCESS(map, bo->size);
> +         drm_munmap(map, bo->size);
> +      }
> +   }
> +   assert(bo->map_wc);
> +
> +   DBG("brw_bo_map_wc: %d (%s) -> %p\n", bo->gem_handle, bo->name, bo->map_wc);
> +   print_flags(flags);
> +
> +   if (!(flags & MAP_ASYNC)) {
> +      set_domain(brw, "WC mapping", bo, I915_GEM_DOMAIN_GTT,
> +                 flags & MAP_WRITE ? I915_GEM_DOMAIN_GTT : 0);

Ah, a read via WC of a write via GTT (so mmap_gtt, pwrite) may result in
stale data. Given the construction of brw_bo_map() I don't think that
combination is possible. Hmm, but what about GTT fenced uploads versus
direct WC detiling? (And oh, we haven't killed pwrite yet.)

/* Note that WC/GTT are not simply interchangeable as GTT is not quite
 * as coherent as it seems. Writes via the GTT are internally buffered
 * and delayed by the chipset, such that a direct read to the backing
 * storage by WC or by WB pathways can overtake the write and return
 * the old data. (Most noticeable on modern Atoms.) The canonical fix for
 * this in the kernel is to use an uncached mmio write as that provides
 * the necessary ordering, as clearing the Write-Combining Buffer using
 * mfence is insufficient.
 */

Hmm, it should also be sufficient to do a GTT read:

	if (bo->map_gtt)
		READ_ONCE(*(uint32_t *)bo->map_gtt);

but for the moment I strongly recommend using I915_GEM_DOMAIN_WC...
Except we have the ASYNC conundrum :)

>  static void *
> @@ -1208,6 +1249,21 @@ brw_reg_read(struct brw_bufmgr *bufmgr, uint32_t offset, uint64_t *result)
>     return ret;
>  }
>  
> +static int
> +gem_param(int fd, int name)
> +{
> +   drm_i915_getparam_t gp;
> +   int v = -1; /* No param uses (yet) the sign bit, reserve it for errors */
> +
> +   memset(&gp, 0, sizeof(gp));
> +   gp.param = name;
> +   gp.value = &v;
> +   if (drmIoctl(fd, DRM_IOCTL_I915_GETPARAM, &gp))
> +      return -1;
> +
> +   return v;
> +}
> +
>  /**
>   * Initializes the GEM buffer manager, which uses the kernel to allocate, map,
>   * and manage map buffer objections.
> @@ -1240,6 +1296,7 @@ brw_bufmgr_init(struct gen_device_info *devinfo, int fd, int batch_size)
>     }
>  
>     bufmgr->has_llc = devinfo->has_llc;
> +   bufmgr->has_mmap_wc = gem_param(fd, I915_PARAM_MMAP_VERSION) > 0;

Hah, I mucked this up by adding the WC flag to mmap version and the WC
domain to mmap_gtt version.

has_mmap_wc = gem_param(fd, I915_PARAM_MMAP_VERSION) > 0;
has_domain_wc = gem_param(fd, i915_PARAM_MMAP_GTT_VERSION) >= 2;
bufmgr->use_mmap_wc = has_mmap_wc && has_domain_wc;

After removing the set-domain (with the manual coherency dance) this can
be reduced back down to bufmgr->use_mmap_wc = has_map_wc;
-Chris


More information about the mesa-dev mailing list