[Mesa-dev] [PATCH 10/16] i965: Add and use brw_bo_map()

Tue May 30 19:48:39 UTC 2017

On Fri, May 26, 2017 at 01:55:47AM -0700, Kenneth Graunke wrote:
> On Wednesday, May 24, 2017 1:04:52 PM PDT Matt Turner wrote:
> > We can encapsulate the logic for choosing the mapping type. This will
> > also help when we add WC mappings.
> 
> It might be worth mentioning that this patch changes a few mappings from
> CPU to GTT on non-LLC systems.  They may even be bug fixes...

I think performance fixes/changes at most, cpu mmaps on non-llc just means
you'll ask the kernel to clflush the entire thing for you. Twice if you
ask for write access. Otoh read access to the gtt (without special
instructions at least) is going to be roughly 3 orders of magnitude slower
than cpu mmap access + clflush. You might hit a surprise here.

Probably the fastest path for reading would be to cpu mmap, but not tell
the kernel, and instead do the clflushing in userspace in-line with the
reads. That both avoids a bunch of thrashing, and if you only do partial
reads/writes on a range is much faster on top. Not sure how much you care
about read throughput though.
-Daniel

> 
> I'll call them out below...
> 
> > ---
> >  src/mesa/drivers/dri/i965/brw_bufmgr.c            | 30 +++++++++++++++++++++--
> >  src/mesa/drivers/dri/i965/brw_bufmgr.h            |  5 ++--
> >  src/mesa/drivers/dri/i965/brw_performance_query.c |  6 ++---
> >  src/mesa/drivers/dri/i965/brw_program.c           |  2 +-
> >  src/mesa/drivers/dri/i965/brw_program_cache.c     |  6 ++---
> >  src/mesa/drivers/dri/i965/brw_queryobj.c          |  2 +-
> >  src/mesa/drivers/dri/i965/gen6_queryobj.c         |  2 +-
> >  src/mesa/drivers/dri/i965/gen6_sol.c              |  2 +-
> >  src/mesa/drivers/dri/i965/intel_batchbuffer.c     |  4 +--
> >  src/mesa/drivers/dri/i965/intel_buffer_objects.c  | 13 ++--------
> >  src/mesa/drivers/dri/i965/intel_mipmap_tree.c     | 19 ++------------
> >  src/mesa/drivers/dri/i965/intel_pixel_read.c      |  2 +-
> >  src/mesa/drivers/dri/i965/intel_screen.c          |  4 +--
> >  src/mesa/drivers/dri/i965/intel_tex_image.c       |  2 +-
> >  src/mesa/drivers/dri/i965/intel_tex_subimage.c    |  2 +-
> >  src/mesa/drivers/dri/i965/intel_upload.c          |  5 +---
> >  16 files changed, 52 insertions(+), 54 deletions(-)
> > 
> > diff --git a/src/mesa/drivers/dri/i965/brw_bufmgr.c b/src/mesa/drivers/dri/i965/brw_bufmgr.c
> > index b79f566..ec9611f 100644
> > --- a/src/mesa/drivers/dri/i965/brw_bufmgr.c
> > +++ b/src/mesa/drivers/dri/i965/brw_bufmgr.c
> > @@ -658,7 +658,7 @@ set_domain(struct brw_context *brw, const char *action,
> >     }
> >  }
> >  
> > -void *
> > +static void *
> >  brw_bo_map_cpu(struct brw_context *brw, struct brw_bo *bo, unsigned flags)
> >  {
> >     struct brw_bufmgr *bufmgr = bo->bufmgr;
> > @@ -740,7 +740,7 @@ map_gtt(struct brw_bo *bo)
> >     return bo->map_gtt;
> >  }
> >  
> > -void *
> > +static void *
> >  brw_bo_map_gtt(struct brw_context *brw, struct brw_bo *bo, unsigned flags)
> >  {
> >     struct brw_bufmgr *bufmgr = bo->bufmgr;
> > @@ -814,6 +814,32 @@ brw_bo_map_unsynchronized(struct brw_context *brw, struct brw_bo *bo)
> >     return map;
> >  }
> >  
> > +static bool
> > +can_map_cpu(struct brw_bo *bo, unsigned flags)
> > +{
> > +   if (bo->cache_coherent)
> > +      return true;
> > +
> > +   if (flags & MAP_PERSISTENT)
> > +      return false;
> > +
> > +   if (flags & MAP_COHERENT)
> > +      return false;
> > +
> > +   return !(flags & MAP_WRITE);
> > +}
> > +
> > +void *
> > +brw_bo_map(struct brw_context *brw, struct brw_bo *bo, unsigned flags)
> > +{
> > +   if (bo->tiling_mode != I915_TILING_NONE && !(flags & MAP_RAW))
> > +      return brw_bo_map_gtt(brw, bo, flags);
> > +   else if (can_map_cpu(bo, flags))
> > +      return brw_bo_map_cpu(brw, bo, flags);
> > +   else
> > +      return brw_bo_map_gtt(brw, bo, flags);
> > +}
> > +
> >  int
> >  brw_bo_unmap(struct brw_bo *bo)
> >  {
> > diff --git a/src/mesa/drivers/dri/i965/brw_bufmgr.h b/src/mesa/drivers/dri/i965/brw_bufmgr.h
> > index 831da69..099afcf 100644
> > --- a/src/mesa/drivers/dri/i965/brw_bufmgr.h
> > +++ b/src/mesa/drivers/dri/i965/brw_bufmgr.h
> > @@ -137,7 +137,7 @@ struct brw_bo {
> >   *
> >   * Buffer objects are not necessarily initially mapped into CPU virtual
> >   * address space or graphics device aperture.  They must be mapped
> > - * using brw_bo_map_cpu() or brw_bo_map_gtt() to be used by the CPU.
> > + * using brw_bo_map() to be used by the CPU.
> >   */
> >  struct brw_bo *brw_bo_alloc(struct brw_bufmgr *bufmgr, const char *name,
> >                              uint64_t size, uint64_t alignment);
> > @@ -189,7 +189,7 @@ void brw_bo_unreference(struct brw_bo *bo);
> >   * This function will block waiting for any existing execution on the
> >   * buffer to complete, first.  The resulting mapping is returned.
> >   */
> > -MUST_CHECK void *brw_bo_map_cpu(struct brw_context *brw, struct brw_bo *bo, unsigned flags);
> > +MUST_CHECK void *brw_bo_map(struct brw_context *brw, struct brw_bo *bo, unsigned flags);
> >  
> >  /**
> >   * Reduces the refcount on the userspace mapping of the buffer
> > @@ -263,7 +263,6 @@ struct brw_bo *brw_bo_gem_create_from_name(struct brw_bufmgr *bufmgr,
> >                                             unsigned int handle);
> >  void brw_bufmgr_enable_reuse(struct brw_bufmgr *bufmgr);
> >  MUST_CHECK void *brw_bo_map_unsynchronized(struct brw_context *brw, struct brw_bo *bo);
> > -MUST_CHECK void *brw_bo_map_gtt(struct brw_context *brw, struct brw_bo *bo, unsigned flags);
> >  
> >  int brw_bo_wait(struct brw_bo *bo, int64_t timeout_ns);
> >  
> > diff --git a/src/mesa/drivers/dri/i965/brw_performance_query.c b/src/mesa/drivers/dri/i965/brw_performance_query.c
> > index 2ec070b..1c9ddf5 100644
> > --- a/src/mesa/drivers/dri/i965/brw_performance_query.c
> > +++ b/src/mesa/drivers/dri/i965/brw_performance_query.c
> > @@ -713,7 +713,7 @@ accumulate_oa_reports(struct brw_context *brw,
> >     if (!read_oa_samples(brw))
> >        goto error;
> >  
> > -   query_buffer = brw_bo_map_cpu(brw, obj->oa.bo, MAP_READ);
> > +   query_buffer = brw_bo_map(brw, obj->oa.bo, MAP_READ);
> >  
> >     start = last = query_buffer;
> >     end = query_buffer + (MI_RPC_BO_END_OFFSET_BYTES / sizeof(uint32_t));
> > @@ -992,7 +992,7 @@ brw_begin_perf_query(struct gl_context *ctx,
> >                        MI_RPC_BO_SIZE, 64);
> >  #ifdef DEBUG
> >        /* Pre-filling the BO helps debug whether writes landed. */
> > -      void *map = brw_bo_map_cpu(brw, obj->oa.bo, MAP_WRITE);
> > +      void *map = brw_bo_map(brw, obj->oa.bo, MAP_WRITE);
> >        memset(map, 0x80, MI_RPC_BO_SIZE);
> >        brw_bo_unmap(obj->oa.bo);
> >  #endif
> 
> This will get changed from a CPU mapping to a GTT mapping on non-LLC
> systems.  I doubt this will realistically have any impact.
> 
> > @@ -1214,7 +1214,7 @@ get_pipeline_stats_data(struct brw_context *brw,
> >     int n_counters = obj->query->n_counters;
> >     uint8_t *p = data;
> >  
> > -   uint64_t *start = brw_bo_map_cpu(brw, obj->pipeline_stats.bo, MAP_READ);
> > +   uint64_t *start = brw_bo_map(brw, obj->pipeline_stats.bo, MAP_READ);
> >     uint64_t *end = start + (STATS_BO_END_OFFSET_BYTES / sizeof(uint64_t));
> >  
> >     for (int i = 0; i < n_counters; i++) {
> > diff --git a/src/mesa/drivers/dri/i965/brw_program.c b/src/mesa/drivers/dri/i965/brw_program.c
> > index 7f87e73..bff3475 100644
> > --- a/src/mesa/drivers/dri/i965/brw_program.c
> > +++ b/src/mesa/drivers/dri/i965/brw_program.c
> > @@ -578,7 +578,7 @@ brw_collect_shader_time(struct brw_context *brw)
> >      * delaying reading the reports, but it doesn't look like it's a big
> >      * overhead compared to the cost of tracking the time in the first place.
> >      */
> > -   void *bo_map = brw_bo_map_cpu(brw, brw->shader_time.bo, MAP_READ | MAP_WRITE);
> > +   void *bo_map = brw_bo_map(brw, brw->shader_time.bo, MAP_READ | MAP_WRITE);
> 
> This will change from a CPU map to a GTT map on non-LLC due to
> MAP_WRITE.  This may be a bug fix for INTEL_DEBUG=shader_time.
> 
> >  
> >     for (int i = 0; i < brw->shader_time.num_entries; i++) {
> >        uint32_t *times = bo_map + i * 3 * BRW_SHADER_TIME_STRIDE;
> > diff --git a/src/mesa/drivers/dri/i965/brw_program_cache.c b/src/mesa/drivers/dri/i965/brw_program_cache.c
> > index 079e2ae..ab03969 100644
> > --- a/src/mesa/drivers/dri/i965/brw_program_cache.c
> > +++ b/src/mesa/drivers/dri/i965/brw_program_cache.c
> > @@ -227,7 +227,7 @@ brw_cache_new_bo(struct brw_cache *cache, uint32_t new_size)
> >        if (brw->has_llc) {
> >           memcpy(llc_map, cache->map, cache->next_offset);
> >        } else {
> > -         void *map = brw_bo_map_cpu(brw, cache->bo, MAP_READ);
> > +         void *map = brw_bo_map(brw, cache->bo, MAP_READ);
> >           brw_bo_subdata(new_bo, 0, cache->next_offset, map);
> >           brw_bo_unmap(cache->bo);
> >        }
> > @@ -268,7 +268,7 @@ brw_lookup_prog(const struct brw_cache *cache,
> >  
> >           void *map;
> >           if (!brw->has_llc)
> > -            map = brw_bo_map_cpu(brw, cache->bo, MAP_READ);
> > +            map = brw_bo_map(brw, cache->bo, MAP_READ);
> >           else
> >              map = cache->map;
> >  
> > @@ -550,7 +550,7 @@ brw_print_program_cache(struct brw_context *brw)
> >     void *map;
> >  
> >     if (!brw->has_llc)
> > -      map = brw_bo_map_cpu(brw, cache->bo, MAP_READ);
> > +      map = brw_bo_map(brw, cache->bo, MAP_READ);
> >     else
> >        map = cache->map;
> >  
> > diff --git a/src/mesa/drivers/dri/i965/brw_queryobj.c b/src/mesa/drivers/dri/i965/brw_queryobj.c
> > index 05e23cd..a7b8962 100644
> > --- a/src/mesa/drivers/dri/i965/brw_queryobj.c
> > +++ b/src/mesa/drivers/dri/i965/brw_queryobj.c
> > @@ -146,7 +146,7 @@ brw_queryobj_get_results(struct gl_context *ctx,
> >        }
> >     }
> >  
> > -   results = brw_bo_map_cpu(brw, query->bo, MAP_READ);
> > +   results = brw_bo_map(brw, query->bo, MAP_READ);
> >     switch (query->Base.Target) {
> >     case GL_TIME_ELAPSED_EXT:
> >        /* The query BO contains the starting and ending timestamps.
> > diff --git a/src/mesa/drivers/dri/i965/gen6_queryobj.c b/src/mesa/drivers/dri/i965/gen6_queryobj.c
> > index 160182a..d27b0d1 100644
> > --- a/src/mesa/drivers/dri/i965/gen6_queryobj.c
> > +++ b/src/mesa/drivers/dri/i965/gen6_queryobj.c
> > @@ -212,7 +212,7 @@ gen6_queryobj_get_results(struct gl_context *ctx,
> >     if (query->bo == NULL)
> >        return;
> >  
> > -   uint64_t *results = brw_bo_map_cpu(brw, query->bo, MAP_READ);
> > +   uint64_t *results = brw_bo_map(brw, query->bo, MAP_READ);
> >     switch (query->Base.Target) {
> >     case GL_TIME_ELAPSED:
> >        /* The query BO contains the starting and ending timestamps.
> > diff --git a/src/mesa/drivers/dri/i965/gen6_sol.c b/src/mesa/drivers/dri/i965/gen6_sol.c
> > index 5873afd..b4824b6 100644
> > --- a/src/mesa/drivers/dri/i965/gen6_sol.c
> > +++ b/src/mesa/drivers/dri/i965/gen6_sol.c
> > @@ -247,7 +247,7 @@ tally_prims_generated(struct brw_context *brw,
> >     if (unlikely(brw->perf_debug && brw_bo_busy(obj->prim_count_bo)))
> >        perf_debug("Stalling for # of transform feedback primitives written.\n");
> >  
> > -   uint64_t *prim_counts = brw_bo_map_cpu(brw, obj->prim_count_bo, MAP_READ);
> > +   uint64_t *prim_counts = brw_bo_map(brw, obj->prim_count_bo, MAP_READ);
> >  
> >     assert(obj->prim_count_buffer_index % (2 * streams) == 0);
> >     int pairs = obj->prim_count_buffer_index / (2 * streams);
> > diff --git a/src/mesa/drivers/dri/i965/intel_batchbuffer.c b/src/mesa/drivers/dri/i965/intel_batchbuffer.c
> > index de93aeb..62d2fe8 100644
> > --- a/src/mesa/drivers/dri/i965/intel_batchbuffer.c
> > +++ b/src/mesa/drivers/dri/i965/intel_batchbuffer.c
> > @@ -100,7 +100,7 @@ intel_batchbuffer_reset(struct intel_batchbuffer *batch,
> >  
> >     batch->bo = brw_bo_alloc(bufmgr, "batchbuffer", BATCH_SZ, 4096);
> >     if (has_llc) {
> > -      batch->map = brw_bo_map_cpu(NULL, batch->bo, MAP_READ | MAP_WRITE);
> > +      batch->map = brw_bo_map(NULL, batch->bo, MAP_READ | MAP_WRITE);
> >     }
> >     batch->map_next = batch->map;
> >  
> > @@ -239,7 +239,7 @@ do_batch_dump(struct brw_context *brw)
> >     if (batch->ring != RENDER_RING)
> >        return;
> >  
> > -   void *map = brw_bo_map_cpu(brw, batch->bo, MAP_READ);
> > +   void *map = brw_bo_map(brw, batch->bo, MAP_READ);
> >     if (map == NULL) {
> >        fprintf(stderr,
> >  	      "WARNING: failed to map batchbuffer, "
> > diff --git a/src/mesa/drivers/dri/i965/intel_buffer_objects.c b/src/mesa/drivers/dri/i965/intel_buffer_objects.c
> > index cf6382d..5813989 100644
> > --- a/src/mesa/drivers/dri/i965/intel_buffer_objects.c
> > +++ b/src/mesa/drivers/dri/i965/intel_buffer_objects.c
> > @@ -395,12 +395,7 @@ brw_map_buffer_range(struct gl_context *ctx,
> >                                                            length +
> >                                                            intel_obj->map_extra[index],
> >                                                            alignment);
> > -      void *map;
> > -      if (brw->has_llc) {
> > -         map = brw_bo_map_cpu(brw, intel_obj->range_map_bo[index], access);
> > -      } else {
> > -         map = brw_bo_map_gtt(brw, intel_obj->range_map_bo[index], access);
> > -      }
> > +      void *map = brw_bo_map(brw, intel_obj->range_map_bo[index], access);
> >        obj->Mappings[index].Pointer = map + intel_obj->map_extra[index];
> >        return obj->Mappings[index].Pointer;
> >     }
> > @@ -412,12 +407,8 @@ brw_map_buffer_range(struct gl_context *ctx,
> >           perf_debug("MapBufferRange with GL_MAP_UNSYNCHRONIZED_BIT stalling (it's actually synchronized on non-LLC platforms)\n");
> >        }
> >        map = brw_bo_map_unsynchronized(brw, intel_obj->buffer);
> > -   } else if (!brw->has_llc && (!(access & GL_MAP_READ_BIT) ||
> > -                              (access & GL_MAP_PERSISTENT_BIT))) {
> > -      map = brw_bo_map_gtt(brw, intel_obj->buffer, access);
> > -      mark_buffer_inactive(intel_obj);
> >     } else {
> > -      map = brw_bo_map_cpu(brw, intel_obj->buffer, access);
> > +      map = brw_bo_map(brw, intel_obj->buffer, access);
> >        mark_buffer_inactive(intel_obj);
> >     }
> 
> It looks like access = (GL_MAP_READ_BIT | GL_MAP_WRITE_BIT) would have
> failed the original !brw->has_llc && ... case, and fall through to the
> brw_bo_map_cpu case.  So...RW mappings would have been CPU before, and
> would now be GTT.  This may be a significant bug fix...

> _______________________________________________
> mesa-dev mailing list
> mesa-dev at lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/mesa-dev

-- 
Daniel Vetter
Software Engineer, Intel Corporation
http://blog.ffwll.ch