[Mesa-dev] [PATCH 04/18] i965: Introduce a context-local batch manager
Abdiel Janulgue
abdiel.janulgue at linux.intel.com
Tue Jul 7 03:14:53 PDT 2015
On 07/06/2015 01:33 PM, Chris Wilson wrote:
> +/*
> + * Add a relocation entry for the target buffer into the current batch.
> + *
> + * This is the heart of performing fast relocations, both here and in
> + * the corresponding kernel relocation routines.
> + *
> + * - Instead of passing in handles for the kernel to convert back into
> + *   buffers for every relocation, we tell the kernel which execobject
> + *   slot corresponds to the relocation. The kernel is able to use a
> + *   simple LUT, constructed as it first looks up each buffer for the
> + *   batch, rather than searching a small, overfull hashtable. As both
> + *   the number of relocations and buffers in a batch grow, the simple
> + *   LUT is much more efficient (though the LUT itself is less cache
> + *   friendly).
> + *   However, as the batch buffer is by definition the last object in
> + *   the execbuffer array, we have to perform a pass to relabel the
> + *   target of all relocations pointing to the batch. (Except when the
> + *   kernel supports batch-first, in which case we can do the relocation
> + *   target processing for the batch inline.)
> + *
> + * - If the kernel has not moved the buffer, it will still be in the
> + *   same location as the last time we used it. If we tell the kernel
> + *   that the presumed offset in every relocation entry matches the
> + *   offset we declare for the buffer, then the kernel need only check
> + *   that each buffer is still in the same location and can skip
> + *   performing the relocations entirely. A huge win.
> + *
> + * - As a consequence of telling the kernel to skip processing the
> + *   relocations, we need to tell the kernel about the read/write
> + *   domains and special needs of the buffers.
> + *
> + * - Alternatively, we can request that the kernel place the buffer
> + *   exactly where we want it and forgo all relocations to that buffer
> + *   entirely. The buffer is effectively pinned for its lifetime (if
> + *   the kernel does have to move it, for example to swap it out to
> + *   recover memory, it will return the buffer to our requested
> + *   location at the start of the next batch). This of course imposes
> + *   a lot of constraints on where we can say the buffers are: they
> + *   must meet all the alignment constraints and must not overlap.
> + *
> + * - Essential to all of these techniques is that we always use the
> + *   same presumed_offset for the relocations as for submitting the
> + *   execobject. That value must be written into the batch and it must
> + *   match the value we tell the kernel. (This breaks down when using
> + *   relocation trees shared between multiple contexts, hence the need
> + *   for context-local batch management.)
> + *
> + * In contrast to libdrm, we can build the execbuffer array along with
> + * the batch by forgoing the ability to handle general relocation trees.
> + * This avoids having multiple passes to build the execbuffer parameter,
> + * and also gives us a means to cheaply track when a buffer has been
> + * referenced by the batch.
> + */
> +uint64_t __brw_batch_reloc(struct brw_batch *batch,
> +                           uint32_t batch_offset,
> +                           struct brw_bo *target_bo,
> +                           uint64_t target_offset,
> +                           unsigned read_domains,
> +                           unsigned write_domain)
> +{
> +   assert(target_bo->refcnt);
> +   if (unlikely(target_bo->batch != batch)) {
> +      /* XXX legal sharing between contexts/threads? */
> +      target_bo = brw_bo_import(batch, target_bo->base, true);
> +      if (unlikely(target_bo == NULL))
> +         longjmp(batch->jmpbuf, -ENOMEM);
> +      target_bo->refcnt--; /* kept alive by the implicit active reference */
> +   }
> +   assert(target_bo->batch == batch);
> +
> +   if (target_bo->exec == NULL) {
> +      int n;
> +
> +      /* reserve one exec entry for the batch */
> +      if (unlikely(batch->emit.nexec + 1 == batch->exec_size))
> +         __brw_batch_grow_exec(batch);
> +
> +      n = batch->emit.nexec++;
> +      target_bo->target_handle = has_lut(batch) ? n : target_bo->handle;
> +      target_bo->exec = memset(batch->exec + n, 0, sizeof(*target_bo->exec));
> +      target_bo->exec->handle = target_bo->handle;
> +      target_bo->exec->alignment = target_bo->alignment;
> +      target_bo->exec->offset = target_bo->offset;
> +      if (target_bo->pinned)
> +         target_bo->exec->flags = EXEC_OBJECT_PINNED;
> +
> +      /* Track the total amount of memory in use by all active requests */
> +      if (target_bo->read.rq == NULL) {
> +         batch->rss += target_bo->size;
> +         if (batch->rss > batch->peak_rss)
> +            batch->peak_rss = batch->rss;
> +      }
> +      target_bo->read.rq = batch->next_request;
> +      list_move_tail(&target_bo->read.link, &batch->next_request->read);
> +
> +      batch->aperture += target_bo->size;
> +   }
> +
> +   if (!target_bo->pinned) {
> +      int n;
> +
> +      if (unlikely(batch->emit.nreloc == batch->reloc_size))
> +         __brw_batch_grow_reloc(batch);
> +
> +      n = batch->emit.nreloc++;
> +      batch->reloc[n].offset = batch_offset;
> +      batch->reloc[n].delta = target_offset;
> +      batch->reloc[n].target_handle = target_bo->target_handle;
> +      batch->reloc[n].presumed_offset = target_bo->offset;
> +      batch->reloc[n].read_domains = read_domains;
> +      batch->reloc[n].write_domain = write_domain;
> +
> +      /* If we haven't added the batch to the execobject array yet, we
> +       * will have to process all the relocations pointing to the
> +       * batch when finalizing the request for submission.
> +       */
> +      if (target_bo->target_handle == -1) {
> +         int m = batch->emit.nself++;
> +         if (m < 256)
> +            batch->self_reloc[m] = n;
> +      }
> +   }
> +
> +   if (write_domain && !target_bo->dirty) {
> +      assert(target_bo != batch->bo);
> +      target_bo->write.rq = batch->next_request;
> +      list_move(&target_bo->write.link, &batch->next_request->write);
> +      assert(target_bo->write.rq == target_bo->read.rq);
> +      target_bo->dirty = true;
> +      target_bo->domain = DOMAIN_GPU;
> +      if (has_lut(batch)) {
> +         target_bo->exec->flags |= EXEC_OBJECT_WRITE;
> +         if (write_domain == I915_GEM_DOMAIN_INSTRUCTION &&
> +             batch->needs_pipecontrol_ggtt_wa)
> +            target_bo->exec->flags |= EXEC_OBJECT_NEEDS_GTT;
> +      }
> +   }
> +
> +   return target_bo->offset + target_offset;
> +}
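The comment block above is a very nice summary of the fast-reloc
contract. Just to check my reading of it: the kernel-side win is that
when every execobject's presumed offset still matches where the object
is actually bound, the whole relocation walk can be skipped. Roughly
(a sketch of my reading only, not the actual i915 code; the
current_offset() helper is made up to stand in for the kernel's lookup
of the object's real binding):

   #include <stdbool.h>
   #include <drm/i915_drm.h>

   /* Placeholder for the kernel's lookup of where the object is
    * actually bound; made up for this sketch.
    */
   extern uint64_t current_offset(const struct drm_i915_gem_exec_object2 *obj);

   static bool
   execbuf_can_skip_relocs(const struct drm_i915_gem_exec_object2 *exec,
                           unsigned count)
   {
      for (unsigned i = 0; i < count; i++) {
         /* exec[i].offset is the presumed_offset userspace wrote both
          * here and into its relocation entries.
          */
         if (exec[i].offset != current_offset(&exec[i]))
            return false; /* object moved: its relocs must be rewritten */
      }

      return true; /* nothing moved: skip relocation processing */
   }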
I looked at your Surface State setup changes in:
brw_wm_surface_state.c
gen6_blorp.cpp
gen6_surface_state.c
gen7_blorp.cpp
gen7_wm_surface_state.c
gen8_surface_state.c
and I especially like the removal of the additional call to
drm_intel_bo_emit_reloc() just to mark those surfaces for relocation
after specifying every Surface Base Address. The new setup certainly
looks more intuitive! (I have sketched the before/after pattern at the
end of this mail.) I see no issues in those sections, except:
>
>  /* Creates a new WM constant buffer reflecting the current fragment program's
> @@ -560,7 +553,7 @@ brw_emit_null_surface_state(struct brw_context *brw,
>      *   - Surface Format must be R8G8B8A8_UNORM.
>      */
>     unsigned surface_type = BRW_SURFACE_NULL;
> -   drm_intel_bo *bo = NULL;
> +   struct brw_bo *bo = NULL;
>     unsigned pitch_minus_1 = 0;
>     uint32_t multisampling_state = 0;
>     uint32_t *surf = brw_state_batch(brw, AUB_TRACE_SURFACE_STATE, 6 * 4, 32,
> @@ -600,7 +593,10 @@ brw_emit_null_surface_state(struct brw_context *brw,
>                    1 << BRW_SURFACE_WRITEDISABLE_B_SHIFT |
>                    1 << BRW_SURFACE_WRITEDISABLE_A_SHIFT);
>     }
> -   surf[1] = bo ? bo->offset64 : 0;
> +   surf[1] = brw_batch_reloc(&brw->batch, *out_offset + 4,
> +                             bo, 0,
> +                             I915_GEM_DOMAIN_RENDER,
> +                             I915_GEM_DOMAIN_RENDER);
null check for bo? bo is always NULL here for the null surface, so this
relies on brw_batch_reloc() tolerating a NULL target.
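If the intent is for the wrapper to absorb that, something along these
lines would do (purely a sketch; I have not checked how
brw_batch_reloc() is actually declared in this series):

   static inline uint64_t
   brw_batch_reloc(struct brw_batch *batch, uint32_t batch_offset,
                   struct brw_bo *target_bo, uint64_t target_offset,
                   unsigned read_domains, unsigned write_domain)
   {
      if (target_bo == NULL)
         return 0; /* no target bo: write a zero address, emit no reloc */

      return __brw_batch_reloc(batch, batch_offset,
                               target_bo, target_offset,
                               read_domains, write_domain);
   }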
>     surf[2] = ((width - 1) << BRW_SURFACE_WIDTH_SHIFT |
>                (height - 1) << BRW_SURFACE_HEIGHT_SHIFT);
>
> @@ -613,13 +609,6 @@ brw_emit_null_surface_state(struct brw_context *brw,
>                pitch_minus_1 << BRW_SURFACE_PITCH_SHIFT);
>     surf[4] = multisampling_state;
>     surf[5] = 0;
> -
> -   if (bo) {
> -      drm_intel_bo_emit_reloc(brw->batch.bo,
> -                              *out_offset + 4,
> -                              bo, 0,
> -                              I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER);
> -   }
>  }
>
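To summarize the pattern change for anyone following along, here is the
condensed before/after, taking the null-surface hunks above as the
example (the "before" half is the code this series deletes):

   /* Before: write the presumed address, then separately mark the
    * location for relocation.
    */
   surf[1] = bo ? bo->offset64 : 0;
   if (bo) {
      drm_intel_bo_emit_reloc(brw->batch.bo, *out_offset + 4,
                              bo, 0,
                              I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER);
   }

   /* After: a single call records the relocation (when bo is non-NULL)
    * and returns the presumed address to write into the surface state.
    */
   surf[1] = brw_batch_reloc(&brw->batch, *out_offset + 4,
                             bo, 0,
                             I915_GEM_DOMAIN_RENDER,
                             I915_GEM_DOMAIN_RENDER);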