[Intel-gfx] [PATCH 1/3] drm/i915: Rely on accurate request tracking for finding hung batches
Ben Widawsky
ben at bwidawsk.net
Wed Feb 12 02:57:19 CET 2014
On Mon, Feb 10, 2014 at 04:30:49PM +0200, Mika Kuoppala wrote:
> From: Chris Wilson <chris at chris-wilson.co.uk>
>
> In the past, it was possible to have multiple batches per request due to
> a stray signal or ENOMEM. As a result we had to scan each active object
> (filtered by those having the COMMAND domain) for the one that contained
> the ACTHD pointer. This was then made more complicated by the
> introduction of ppgtt, whereby ACTHD then pointed into the address space
> of the context and so also needed to be taken into account.
>
> This is a fairly robust approach (though the implementation is a little
> fragile and depends upon the per-generation setup, registers and
> parameters). However, due to the requirements for hangstats, we needed a
> robust method for associating batches with a particular request and
> having that we can rely upon it for finding the associated batch object
> for error capture.
>
> If the batch buffer tracking is not robust enough, that should become
> apparent quite quickly through an erroneous error capture. That should
> also help to make sure that the runtime reporting to userspace is
> robust. It also means that we then report the oldest incomplete batch on
> each ring, which can be useful for determining the state of userspace at
> the time of a hang.
>
> v2: Use i915_gem_find_active_request (Mika)
>
> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk> (v1)
> Signed-off-by: Mika Kuoppala <mika.kuoppala at intel.com> (v2)
> Cc: Ben Widawsky <benjamin.widawsky at intel.com>
> ---
> drivers/gpu/drm/i915/i915_drv.h | 3 ++
> drivers/gpu/drm/i915/i915_gem.c | 13 +++++--
> drivers/gpu/drm/i915/i915_gpu_error.c | 66 ++++-----------------------------
> 3 files changed, 20 insertions(+), 62 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index b1e91c3..eb260bc 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -2147,6 +2147,9 @@ i915_gem_object_unpin_fence(struct drm_i915_gem_object *obj)
> }
> }
>
> +struct drm_i915_gem_request *
> +i915_gem_find_active_request(struct intel_ring_buffer *ring);
> +
> bool i915_gem_retire_requests(struct drm_device *dev);
> void i915_gem_retire_requests_ring(struct intel_ring_buffer *ring);
> int __must_check i915_gem_check_wedge(struct i915_gpu_error *error,
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index b0a244a..20e55ef 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -2298,11 +2298,16 @@ static void i915_gem_free_request(struct drm_i915_gem_request *request)
> kfree(request);
> }
>
> -static struct drm_i915_gem_request *
> -i915_gem_find_first_non_complete(struct intel_ring_buffer *ring)
> +struct drm_i915_gem_request *
> +i915_gem_find_active_request(struct intel_ring_buffer *ring)
> {
> struct drm_i915_gem_request *request;
> - const u32 completed_seqno = ring->get_seqno(ring, false);
> + u32 completed_seqno;
> +
> + if (WARN_ON(!ring->get_seqno))
> + return NULL;
ring->get_seqno(ring, false) ?
This looks like it's currently wrong below as well.
> +
> + completed_seqno = ring->get_seqno(ring, false);
>
> list_for_each_entry(request, &ring->request_list, list) {
> if (i915_seqno_passed(completed_seqno, request->seqno))
> @@ -2320,7 +2325,7 @@ static void i915_gem_reset_ring_status(struct drm_i915_private *dev_priv,
> struct drm_i915_gem_request *request;
> bool ring_hung;
>
> - request = i915_gem_find_first_non_complete(ring);
> + request = i915_gem_find_active_request(ring);
>
> if (request == NULL)
> return;
> diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
> index dc47bb9..5bd075a 100644
> --- a/drivers/gpu/drm/i915/i915_gpu_error.c
> +++ b/drivers/gpu/drm/i915/i915_gpu_error.c
> @@ -713,46 +713,14 @@ static void i915_gem_record_fences(struct drm_device *dev,
> }
> }
>
> -/* This assumes all batchbuffers are executed from the PPGTT. It might have to
> - * change in the future. */
> -static bool is_active_vm(struct i915_address_space *vm,
> - struct intel_ring_buffer *ring)
> -{
> - struct drm_device *dev = vm->dev;
> - struct drm_i915_private *dev_priv = dev->dev_private;
> - struct i915_hw_ppgtt *ppgtt;
> -
> - if (INTEL_INFO(dev)->gen < 7)
> - return i915_is_ggtt(vm);
> -
> - /* FIXME: This ignores that the global gtt vm is also on this list. */
> - ppgtt = container_of(vm, struct i915_hw_ppgtt, base);
> -
> - if (INTEL_INFO(dev)->gen >= 8) {
> - u64 pdp0 = (u64)I915_READ(GEN8_RING_PDP_UDW(ring, 0)) << 32;
> - pdp0 |= I915_READ(GEN8_RING_PDP_LDW(ring, 0));
> - return pdp0 == ppgtt->pd_dma_addr[0];
> - } else {
> - u32 pp_db;
> - pp_db = I915_READ(RING_PP_DIR_BASE(ring));
> - return (pp_db >> 10) == ppgtt->pd_offset;
> - }
> -}
> -
> static struct drm_i915_error_object *
> i915_error_first_batchbuffer(struct drm_i915_private *dev_priv,
> struct intel_ring_buffer *ring)
> {
> - struct i915_address_space *vm;
> - struct i915_vma *vma;
> - struct drm_i915_gem_object *obj;
> - bool found_active = false;
> - u32 seqno;
> -
> - if (!ring->get_seqno)
> - return NULL;
> + struct drm_i915_gem_request *request;
>
> if (HAS_BROKEN_CS_TLB(dev_priv->dev)) {
> + struct drm_i915_gem_object *obj;
> u32 acthd = I915_READ(ACTHD);
>
> if (WARN_ON(ring->id != RCS))
> @@ -765,32 +733,14 @@ i915_error_first_batchbuffer(struct drm_i915_private *dev_priv,
> return i915_error_ggtt_object_create(dev_priv, obj);
> }
>
> - seqno = ring->get_seqno(ring, false);
> - list_for_each_entry(vm, &dev_priv->vm_list, global_link) {
> - if (!is_active_vm(vm, ring))
> - continue;
> -
> - found_active = true;
> -
> - list_for_each_entry(vma, &vm->active_list, mm_list) {
> - obj = vma->obj;
> - if (obj->ring != ring)
> - continue;
> -
> - if (i915_seqno_passed(seqno, obj->last_read_seqno))
> - continue;
> -
> - if ((obj->base.read_domains & I915_GEM_DOMAIN_COMMAND) == 0)
> - continue;
> -
> - /* We need to copy these to an anonymous buffer as the simplest
> - * method to avoid being overwritten by userspace.
> - */
> - return i915_error_object_create(dev_priv, obj, vm);
> - }
> + request = i915_gem_find_active_request(ring);
> + if (request) {
> + /* We need to copy these to an anonymous buffer as the simplest
> + * method to avoid being overwritten by userspace.
> + */
> + return i915_error_object_create(dev_priv, request->batch_obj, request->ctx->vm);
Wrap this one. It's too long even for my undiscerning eye.
> }
>
> - WARN_ON(!found_active);
> return NULL;
> }
>
I still would have preferred to keep both methods for a while until we
felt really comfortable with this... the commit message's statement that
getting bad error state is inevitably a good thing is total bunk, I
think.
However, I'll act as a Daniel proxy here who seems in favor of this
path. The code is definitely simpler, I am just a chicken.
With the above fixed:
Reviewed-by: Ben Widawsky <ben at bwidawsk.net>
--
Ben Widawsky, Intel Open Source Technology Center
More information about the Intel-gfx
mailing list