[Intel-gfx] [PATCH 44/51] drm/i915/selftest: Better error reporting from hangcheck selftest

Matthew Brost matthew.brost at intel.com
Fri Jul 16 20:13:00 UTC 2021


On Fri, Jul 16, 2021 at 01:17:17PM -0700, Matthew Brost wrote:
> From: John Harrison <John.C.Harrison at Intel.com>
> 
> There are many ways in which the hangcheck selftest can fail, but very
> few of them actually print an error message saying what happened. Fill
> in the missing messages so that each failure path reports its cause.
> 
> Signed-off-by: John Harrison <John.C.Harrison at Intel.com>
> Signed-off-by: Matthew Brost <matthew.brost at intel.com>
> Cc: Daniele Ceraolo Spurio <daniele.ceraolospurio at intel.com>

Reviewed-by: Matthew Brost <matthew.brost at intel.com>

> ---
>  drivers/gpu/drm/i915/gt/selftest_hangcheck.c | 89 ++++++++++++++++----
>  1 file changed, 72 insertions(+), 17 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/gt/selftest_hangcheck.c b/drivers/gpu/drm/i915/gt/selftest_hangcheck.c
> index 7aea10aa1fb4..0ed87cc4d063 100644
> --- a/drivers/gpu/drm/i915/gt/selftest_hangcheck.c
> +++ b/drivers/gpu/drm/i915/gt/selftest_hangcheck.c
> @@ -378,6 +378,7 @@ static int igt_reset_nop(void *arg)
>  			ce = intel_context_create(engine);
>  			if (IS_ERR(ce)) {
>  				err = PTR_ERR(ce);
> +				pr_err("[%s] Create context failed: %d!\n", engine->name, err);
>  				break;
>  			}
>  
> @@ -387,6 +388,7 @@ static int igt_reset_nop(void *arg)
>  				rq = intel_context_create_request(ce);
>  				if (IS_ERR(rq)) {
>  					err = PTR_ERR(rq);
> +					pr_err("[%s] Create request failed: %d!\n", engine->name, err);
>  					break;
>  				}
>  
> @@ -401,24 +403,31 @@ static int igt_reset_nop(void *arg)
>  		igt_global_reset_unlock(gt);
>  
>  		if (intel_gt_is_wedged(gt)) {
> +			pr_err("[%s] GT is wedged!\n", engine->name);
>  			err = -EIO;
>  			break;
>  		}
>  
>  		if (i915_reset_count(global) != reset_count + ++count) {
> -			pr_err("Full GPU reset not recorded!\n");
> +			pr_err("[%s] Reset not recorded: %d vs %d + %d!\n",
> +			       engine->name, i915_reset_count(global), reset_count, count);
>  			err = -EINVAL;
>  			break;
>  		}
>  
>  		err = igt_flush_test(gt->i915);
> -		if (err)
> +		if (err) {
> +			pr_err("[%s] Flush failed: %d!\n", engine->name, err);
>  			break;
> +		}
>  	} while (time_before(jiffies, end_time));
>  	pr_info("%s: %d resets\n", __func__, count);
>  
> -	if (igt_flush_test(gt->i915))
> +	if (igt_flush_test(gt->i915)) {
> +		pr_err("Post flush failed: %d!\n", err);
>  		err = -EIO;
> +	}
> +
>  	return err;
>  }
>  
> @@ -441,8 +450,10 @@ static int igt_reset_nop_engine(void *arg)
>  		int err;
>  
>  		ce = intel_context_create(engine);
> -		if (IS_ERR(ce))
> +		if (IS_ERR(ce)) {
> +			pr_err("[%s] Create context failed: %d!\n", engine->name, err);
>  			return PTR_ERR(ce);
> +		}
>  
>  		reset_count = i915_reset_count(global);
>  		reset_engine_count = i915_reset_engine_count(global, engine);
> @@ -550,8 +561,10 @@ static int igt_reset_fail_engine(void *arg)
>  		int err;
>  
>  		ce = intel_context_create(engine);
> -		if (IS_ERR(ce))
> +		if (IS_ERR(ce)) {
> +			pr_err("[%s] Create context failed: %d!\n", engine->name, err);
>  			return PTR_ERR(ce);
> +		}
>  
>  		st_engine_heartbeat_disable(engine);
>  		set_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
> @@ -711,6 +724,7 @@ static int __igt_reset_engine(struct intel_gt *gt, bool active)
>  				rq = hang_create_request(&h, engine);
>  				if (IS_ERR(rq)) {
>  					err = PTR_ERR(rq);
> +					pr_err("[%s] Create hang request failed: %d!\n", engine->name, err);
>  					break;
>  				}
>  
> @@ -765,12 +779,16 @@ static int __igt_reset_engine(struct intel_gt *gt, bool active)
>  			break;
>  
>  		err = igt_flush_test(gt->i915);
> -		if (err)
> +		if (err) {
> +			pr_err("[%s] Flush failed: %d!\n", engine->name, err);
>  			break;
> +		}
>  	}
>  
> -	if (intel_gt_is_wedged(gt))
> +	if (intel_gt_is_wedged(gt)) {
> +		pr_err("GT is wedged!\n");
>  		err = -EIO;
> +	}
>  
>  	if (active)
>  		hang_fini(&h);
> @@ -837,6 +855,7 @@ static int active_engine(void *data)
>  		ce[count] = intel_context_create(engine);
>  		if (IS_ERR(ce[count])) {
>  			err = PTR_ERR(ce[count]);
> +			pr_err("[%s] Create context #%ld failed: %d!\n", engine->name, count, err);
>  			while (--count)
>  				intel_context_put(ce[count]);
>  			return err;
> @@ -852,6 +871,7 @@ static int active_engine(void *data)
>  		new = intel_context_create_request(ce[idx]);
>  		if (IS_ERR(new)) {
>  			err = PTR_ERR(new);
> +			pr_err("[%s] Create request #%d failed: %d!\n", engine->name, idx, err);
>  			break;
>  		}
>  
> @@ -867,8 +887,10 @@ static int active_engine(void *data)
>  		}
>  
>  		err = active_request_put(old);
> -		if (err)
> +		if (err) {
> +			pr_err("[%s] Request put failed: %d!\n", engine->name, err);
>  			break;
> +		}
>  
>  		cond_resched();
>  	}
> @@ -876,6 +898,9 @@ static int active_engine(void *data)
>  	for (count = 0; count < ARRAY_SIZE(rq); count++) {
>  		int err__ = active_request_put(rq[count]);
>  
> +		if (err)
> +			pr_err("[%s] Request put #%ld failed: %d!\n", engine->name, count, err);
> +
>  		/* Keep the first error */
>  		if (!err)
>  			err = err__;
> @@ -949,6 +974,7 @@ static int __igt_reset_engines(struct intel_gt *gt,
>  					  "igt/%s", other->name);
>  			if (IS_ERR(tsk)) {
>  				err = PTR_ERR(tsk);
> +				pr_err("[%s] Thread spawn failed: %d!\n", engine->name, err);
>  				goto unwind;
>  			}
>  
> @@ -967,6 +993,7 @@ static int __igt_reset_engines(struct intel_gt *gt,
>  				rq = hang_create_request(&h, engine);
>  				if (IS_ERR(rq)) {
>  					err = PTR_ERR(rq);
> +					pr_err("[%s] Create hang request failed: %d!\n", engine->name, err);
>  					break;
>  				}
>  
> @@ -999,10 +1026,10 @@ static int __igt_reset_engines(struct intel_gt *gt,
>  			if (rq) {
>  				if (rq->fence.error != -EIO) {
>  					pr_err("i915_reset_engine(%s:%s):"
> -					       " failed to reset request %llx:%lld\n",
> +					       " failed to reset request %lld:%lld [0x%04X]\n",
>  					       engine->name, test_name,
>  					       rq->fence.context,
> -					       rq->fence.seqno);
> +					       rq->fence.seqno, rq->context->guc_id);
>  					i915_request_put(rq);
>  
>  					GEM_TRACE_DUMP();
> @@ -1101,8 +1128,10 @@ static int __igt_reset_engines(struct intel_gt *gt,
>  			break;
>  
>  		err = igt_flush_test(gt->i915);
> -		if (err)
> +		if (err) {
> +			pr_err("[%s] Flush failed: %d!\n", engine->name, err);
>  			break;
> +		}
>  	}
>  
>  	if (intel_gt_is_wedged(gt))
> @@ -1180,12 +1209,15 @@ static int igt_reset_wait(void *arg)
>  	igt_global_reset_lock(gt);
>  
>  	err = hang_init(&h, gt);
> -	if (err)
> +	if (err) {
> +		pr_err("[%s] Hang init failed: %d!\n", engine->name, err);
>  		goto unlock;
> +	}
>  
>  	rq = hang_create_request(&h, engine);
>  	if (IS_ERR(rq)) {
>  		err = PTR_ERR(rq);
> +		pr_err("[%s] Create hang request failed: %d!\n", engine->name, err);
>  		goto fini;
>  	}
>  
> @@ -1310,12 +1342,15 @@ static int __igt_reset_evict_vma(struct intel_gt *gt,
>  	/* Check that we can recover an unbind stuck on a hanging request */
>  
>  	err = hang_init(&h, gt);
> -	if (err)
> +	if (err) {
> +		pr_err("[%s] Hang init failed: %d!\n", engine->name, err);
>  		return err;
> +	}
>  
>  	obj = i915_gem_object_create_internal(gt->i915, SZ_1M);
>  	if (IS_ERR(obj)) {
>  		err = PTR_ERR(obj);
> +		pr_err("[%s] Create object failed: %d!\n", engine->name, err);
>  		goto fini;
>  	}
>  
> @@ -1330,12 +1365,14 @@ static int __igt_reset_evict_vma(struct intel_gt *gt,
>  	arg.vma = i915_vma_instance(obj, vm, NULL);
>  	if (IS_ERR(arg.vma)) {
>  		err = PTR_ERR(arg.vma);
> +		pr_err("[%s] VMA instance failed: %d!\n", engine->name, err);
>  		goto out_obj;
>  	}
>  
>  	rq = hang_create_request(&h, engine);
>  	if (IS_ERR(rq)) {
>  		err = PTR_ERR(rq);
> +		pr_err("[%s] Create hang request failed: %d!\n", engine->name, err);
>  		goto out_obj;
>  	}
>  
> @@ -1347,6 +1384,7 @@ static int __igt_reset_evict_vma(struct intel_gt *gt,
>  	err = i915_vma_pin(arg.vma, 0, 0, pin_flags);
>  	if (err) {
>  		i915_request_add(rq);
> +		pr_err("[%s] VMA pin failed: %d!\n", engine->name, err);
>  		goto out_obj;
>  	}
>  
> @@ -1363,8 +1401,14 @@ static int __igt_reset_evict_vma(struct intel_gt *gt,
>  	i915_vma_lock(arg.vma);
>  	err = i915_request_await_object(rq, arg.vma->obj,
>  					flags & EXEC_OBJECT_WRITE);
> -	if (err == 0)
> +	if (err == 0) {
>  		err = i915_vma_move_to_active(arg.vma, rq, flags);
> +		if (err)
> +			pr_err("[%s] Move to active failed: %d!\n", engine->name, err);
> +	} else {
> +		pr_err("[%s] Request await failed: %d!\n", engine->name, err);
> +	}
> +
>  	i915_vma_unlock(arg.vma);
>  
>  	if (flags & EXEC_OBJECT_NEEDS_FENCE)
> @@ -1392,6 +1436,7 @@ static int __igt_reset_evict_vma(struct intel_gt *gt,
>  	tsk = kthread_run(fn, &arg, "igt/evict_vma");
>  	if (IS_ERR(tsk)) {
>  		err = PTR_ERR(tsk);
> +		pr_err("[%s] Thread spawn failed: %d!\n", engine->name, err);
>  		tsk = NULL;
>  		goto out_reset;
>  	}
> @@ -1518,6 +1563,7 @@ static int igt_reset_queue(void *arg)
>  		prev = hang_create_request(&h, engine);
>  		if (IS_ERR(prev)) {
>  			err = PTR_ERR(prev);
> +			pr_err("[%s] Create 'prev' hang request failed: %d!\n", engine->name, err);
>  			goto fini;
>  		}
>  
> @@ -1532,6 +1578,7 @@ static int igt_reset_queue(void *arg)
>  			rq = hang_create_request(&h, engine);
>  			if (IS_ERR(rq)) {
>  				err = PTR_ERR(rq);
> +				pr_err("[%s] Create hang request failed: %d!\n", engine->name, err);
>  				goto fini;
>  			}
>  
> @@ -1619,8 +1666,10 @@ static int igt_reset_queue(void *arg)
>  		i915_request_put(prev);
>  
>  		err = igt_flush_test(gt->i915);
> -		if (err)
> +		if (err) {
> +			pr_err("[%s] Flush failed: %d!\n", engine->name, err);
>  			break;
> +		}
>  	}
>  
>  fini:
> @@ -1653,12 +1702,15 @@ static int igt_handle_error(void *arg)
>  		return 0;
>  
>  	err = hang_init(&h, gt);
> -	if (err)
> +	if (err) {
> +		pr_err("[%s] Hang init failed: %d!\n", engine->name, err);
>  		return err;
> +	}
>  
>  	rq = hang_create_request(&h, engine);
>  	if (IS_ERR(rq)) {
>  		err = PTR_ERR(rq);
> +		pr_err("[%s] Create hang request failed: %d!\n", engine->name, err);
>  		goto err_fini;
>  	}
>  
> @@ -1743,12 +1795,15 @@ static int igt_atomic_reset_engine(struct intel_engine_cs *engine,
>  		return err;
>  
>  	err = hang_init(&h, engine->gt);
> -	if (err)
> +	if (err) {
> +		pr_err("[%s] Hang init failed: %d!\n", engine->name, err);
>  		return err;
> +	}
>  
>  	rq = hang_create_request(&h, engine);
>  	if (IS_ERR(rq)) {
>  		err = PTR_ERR(rq);
> +		pr_err("[%s] Create hang request failed: %d!\n", engine->name, err);
>  		goto out;
>  	}
>  
> -- 
> 2.28.0
> 
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx at lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/intel-gfx


More information about the Intel-gfx mailing list