[Intel-gfx] [PATCH 2/2] drm/i915: Print captured bo for all VM in error state

Ben Widawsky ben at bwidawsk.net
Fri Jan 10 20:59:10 CET 2014


On Thu, Jan 09, 2014 at 10:57:23PM +0000, Chris Wilson wrote:
> The current error state harks back to the era of just a single VM. For
> full-ppgtt, we capture every bo on every VM. It behoves us to then print
> every bo for every VM, which we currently fail to do and so miss vital
> information in the error state.
> 
> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>

I'm somewhat disappointed with myself since I know I had a lot of this
at one point in time. Thanks for finding/fixing it.

> ---
>  drivers/gpu/drm/i915/i915_drv.h       |  4 +++
>  drivers/gpu/drm/i915/i915_gpu_error.c | 66 ++++++++++++++++++++++++++---------
>  2 files changed, 54 insertions(+), 16 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index 8f0ef8645ea9..673aaee527eb 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -331,6 +331,7 @@ struct drm_i915_error_state {
>  	u32 faddr[I915_NUM_RINGS];
>  	u64 fence[I915_MAX_NUM_FENCES];
>  	struct timeval time;
> +
>  	struct drm_i915_error_ring {
>  		struct drm_i915_error_object {
>  			int page_count;
> @@ -344,6 +345,7 @@ struct drm_i915_error_state {
>  		} *requests;
>  		int num_requests;
>  	} ring[I915_NUM_RINGS];
> +

Not sure if Daniel cares about the whitespace insertions. I do not.

>  	struct drm_i915_error_buffer {
>  		u32 size;
>  		u32 name;
> @@ -361,6 +363,8 @@ struct drm_i915_error_state {
>  		u32 cache_level:3;
>  	} **active_bo, **pinned_bo;
>  	u32 *active_bo_count, *pinned_bo_count;
> +	u32 vm_count;
> +

One patch which I never submitted held vm_count in dev_priv. I found it
useful at one time, but I apparently forgot about it.

>  	struct intel_overlay_error_state *overlay;
>  	struct intel_display_error_state *display;
>  	int hangcheck_score[I915_NUM_RINGS];
> diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
> index cce9f559e3d7..b7ead8ce6628 100644
> --- a/drivers/gpu/drm/i915/i915_gpu_error.c
> +++ b/drivers/gpu/drm/i915/i915_gpu_error.c
> @@ -188,10 +188,10 @@ static void print_error_buffers(struct drm_i915_error_state_buf *m,
>  				struct drm_i915_error_buffer *err,
>  				int count)
>  {
> -	err_printf(m, "%s [%d]:\n", name, count);
> +	err_printf(m, "  %s [%d]:\n", name, count);
>  
>  	while (count--) {
> -		err_printf(m, "  %08x %8u %02x %02x %x %x",
> +		err_printf(m, "    %08x %8u %02x %02x %x %x",
>  			   err->gtt_offset,
>  			   err->size,
>  			   err->read_domains,
> @@ -332,15 +332,17 @@ int i915_error_state_to_str(struct drm_i915_error_state_buf *m,
>  	for_each_ring(ring, dev_priv, i)
>  		i915_ring_error_state(m, dev, error, i);
>  
> -	if (error->active_bo)
> +	for (i = 0; i < error->vm_count; i++) {
> +		err_printf(m, "vm[%d]\n", i);
> +
>  		print_error_buffers(m, "Active",
> -				    error->active_bo[0],
> -				    error->active_bo_count[0]);
> +				    error->active_bo[i],
> +				    error->active_bo_count[i]);
>  
> -	if (error->pinned_bo)
>  		print_error_buffers(m, "Pinned",
> -				    error->pinned_bo[0],
> -				    error->pinned_bo_count[0]);
> +				    error->pinned_bo[i],
> +				    error->pinned_bo_count[i]);
> +	}
>  
>  	for (i = 0; i < ARRAY_SIZE(error->ring); i++) {
>  		struct drm_i915_error_object *obj;
> @@ -606,13 +608,23 @@ static u32 capture_active_bo(struct drm_i915_error_buffer *err,
>  }
>  
>  static u32 capture_pinned_bo(struct drm_i915_error_buffer *err,
> -			     int count, struct list_head *head)
> +			     int count, struct list_head *head,
> +			     struct i915_address_space *vm)
>  {
>  	struct drm_i915_gem_object *obj;
>  	int i = 0;
>  
>  	list_for_each_entry(obj, head, global_list) {
> -		if (!i915_gem_obj_is_pinned(obj))
> +		struct i915_vma *vma;
> +		bool bound = false;
> +
> +		list_for_each_entry(vma, &obj->vma_list, vma_link)
> +			if (vma->vm == vm && vma->pin_count > 0) {
> +				bound = true;
> +				break;
> +			}
> +
> +		if (!bound)
>  			continue;
>  
>  		capture_bo(err++, obj);
> @@ -876,9 +888,14 @@ static void i915_gem_capture_vm(struct drm_i915_private *dev_priv,
>  	list_for_each_entry(vma, &vm->active_list, mm_list)
>  		i++;
>  	error->active_bo_count[ndx] = i;
> -	list_for_each_entry(obj, &dev_priv->mm.bound_list, global_list)
> -		if (i915_gem_obj_is_pinned(obj))
> -			i++;
> +
> +	list_for_each_entry(obj, &dev_priv->mm.bound_list, global_list) {
> +		list_for_each_entry(vma, &obj->vma_list, vma_link)
> +			if (vma->vm == vm && vma->pin_count > 0) {
> +				i++;
> +				break;
> +			}
> +	}

FWIW, the old code predated per-VMA pin counts.

>  	error->pinned_bo_count[ndx] = i - error->active_bo_count[ndx];
>  
>  	if (i) {
> @@ -897,7 +914,7 @@ static void i915_gem_capture_vm(struct drm_i915_private *dev_priv,
>  		error->pinned_bo_count[ndx] =
>  			capture_pinned_bo(pinned_bo,
>  					  error->pinned_bo_count[ndx],
> -					  &dev_priv->mm.bound_list);
> +					  &dev_priv->mm.bound_list, vm);

With my original patches, which I obviously didn't submit (see top), I
made capture_pinned_vma() and capture_active_vma(). I think that approach
fits better with Daniel's ideal of killing <obj, vm>.

>  	error->active_bo[ndx] = active_bo;
>  	error->pinned_bo[ndx] = pinned_bo;
>  }
> @@ -918,8 +935,25 @@ static void i915_gem_capture_buffers(struct drm_i915_private *dev_priv,
>  	error->pinned_bo_count = kcalloc(cnt, sizeof(*error->pinned_bo_count),
>  					 GFP_ATOMIC);
>  
> -	list_for_each_entry(vm, &dev_priv->vm_list, global_link)
> -		i915_gem_capture_vm(dev_priv, error, vm, i++);
> +	if (error->active_bo == NULL ||
> +	    error->pinned_bo == NULL ||
> +	    error->active_bo_count == NULL ||
> +	    error->pinned_bo_count == NULL) {
> +		kfree(error->active_bo);
> +		kfree(error->active_bo_count);
> +		kfree(error->pinned_bo);
> +		kfree(error->pinned_bo_count);
> +
> +		error->active_bo = NULL;
> +		error->active_bo_count = NULL;
> +		error->pinned_bo = NULL;
> +		error->pinned_bo_count = NULL;

This is really a separate fix/patch.

> +	} else {
> +		list_for_each_entry(vm, &dev_priv->vm_list, global_link)
> +			i915_gem_capture_vm(dev_priv, error, vm, i++);
> +
> +		error->vm_count = cnt;
> +	}

One other change which I apparently never pushed was to try to identify
the guilty context/VM (instead of the global one, as you do in patch 1 of
the series). That way foo_bo[0] was always the interesting one.

>  }
>  
>  /**
> -- 
> 1.8.5.2

Both patches are 
Reviewed-by: Ben Widawsky <ben at bwidawsk.net>

I will gladly re-review if you make any of my suggested changes.

-- 
Ben Widawsky, Intel Open Source Technology Center



More information about the Intel-gfx mailing list