[Intel-gfx] [PATCH 2/8] drm/i915: Complete the fences as they are cancelled due to wedging

Tvrtko Ursulin tvrtko.ursulin at linux.intel.com
Mon Dec 3 17:11:59 UTC 2018


On 03/12/2018 11:36, Chris Wilson wrote:
> We inspect the requests under the assumption that they will be marked as
> completed when they are removed from the queue. Currently however, in the
> process of wedging the requests will be removed from the queue before they
> are completed, so rearrange the code to complete the fences before the
> locks are dropped.
> 
> <1>[  354.473346] BUG: unable to handle kernel NULL pointer dereference at 0000000000000250
> <6>[  354.473363] PGD 0 P4D 0
> <4>[  354.473370] Oops: 0000 [#1] PREEMPT SMP PTI
> <4>[  354.473380] CPU: 0 PID: 4470 Comm: gem_eio Tainted: G     U            4.20.0-rc4-CI-CI_DRM_5216+ #1
> <4>[  354.473393] Hardware name: Intel Corporation NUC7CJYH/NUC7JYB, BIOS JYGLKCPX.86A.0027.2018.0125.1347 01/25/2018
> <4>[  354.473480] RIP: 0010:__i915_schedule+0x311/0x5e0 [i915]
> <4>[  354.473490] Code: 49 89 44 24 20 4d 89 4c 24 28 4d 89 29 44 39 b3 a0 04 00 00 7d 3a 41 8b 44 24 78 85 c0 74 13 48 8b 93 78 04 00 00 48 83 e2 fc <39> 82 50 02 00 00 79 1e 44 89 b3 a0 04 00 00 48 8d bb d0 03 00 00

This confuses me, isn't the code segment usually at the end? And then 
you have another after the call trace which doesn't match 
__i915_schedule... anyway, _this_ code seems to be this part:

                 if (node_to_request(node)->global_seqno &&
  90d:   8b 43 78                mov    eax,DWORD PTR [rbx+0x78]
  910:   85 c0                   test   eax,eax
  912:   74 13                   je     927 <__i915_schedule+0x317>
 
i915_seqno_passed(port_request(engine->execlists.port)->global_seqno,
  914:   49 8b 97 c0 04 00 00    mov    rdx,QWORD PTR [r15+0x4c0]
  91b:   48 83 e2 fc             and    rdx,0xfffffffffffffffc
                 if (node_to_request(node)->global_seqno &&
  91f:   39 82 50 02 00 00       cmp    DWORD PTR [rdx+0x250],eax
  925:   79 1e                   jns    945 <__i915_schedule+0x335>

> <4>[  354.473515] RSP: 0018:ffffc900001bba90 EFLAGS: 00010046
> <4>[  354.473524] RAX: 0000000000000003 RBX: ffff8882624c8008 RCX: f34a737800000000
> <4>[  354.473535] RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffff8882624c8048
> <4>[  354.473545] RBP: ffffc900001bbab0 R08: 000000005963f1f1 R09: 0000000000000000
> <4>[  354.473556] R10: ffffc900001bba10 R11: ffff8882624c8060 R12: ffff88824fdd7b98
> <4>[  354.473567] R13: ffff88824fdd7bb8 R14: 0000000000000001 R15: ffff88824fdd7750
> <4>[  354.473578] FS:  00007f44b4b5b980(0000) GS:ffff888277e00000(0000) knlGS:0000000000000000
> <4>[  354.473590] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> <4>[  354.473599] CR2: 0000000000000250 CR3: 000000026976e000 CR4: 0000000000340ef0

Given the registers above, I think it means this - eax is the global_seqno 
of the node rq. rdx is port_request, so NULL and bang. No request in 
port, but why would there always be one at the point we are scheduling 
in a new request to the runnable queue?

Regards,

Tvrtko

> <4>[  354.473611] Call Trace:
> <4>[  354.473622]  ? lock_acquire+0xa6/0x1c0
> <4>[  354.473677]  ? i915_schedule_bump_priority+0x57/0xd0 [i915]
> <4>[  354.473736]  i915_schedule_bump_priority+0x72/0xd0 [i915]
> <4>[  354.473792]  i915_request_wait+0x4db/0x840 [i915]
> <4>[  354.473804]  ? get_pwq.isra.4+0x2c/0x50
> <4>[  354.473813]  ? ___preempt_schedule+0x16/0x18
> <4>[  354.473824]  ? wake_up_q+0x70/0x70
> <4>[  354.473831]  ? wake_up_q+0x70/0x70
> <4>[  354.473882]  ? gen6_rps_boost+0x118/0x120 [i915]
> <4>[  354.473936]  i915_gem_object_wait_fence+0x8a/0x110 [i915]
> <4>[  354.473991]  i915_gem_object_wait+0x113/0x500 [i915]
> <4>[  354.474047]  i915_gem_wait_ioctl+0x11c/0x2f0 [i915]
> <4>[  354.474101]  ? i915_gem_unset_wedged+0x210/0x210 [i915]
> <4>[  354.474113]  drm_ioctl_kernel+0x81/0xf0
> <4>[  354.474123]  drm_ioctl+0x2de/0x390
> <4>[  354.474175]  ? i915_gem_unset_wedged+0x210/0x210 [i915]
> <4>[  354.474187]  ? finish_task_switch+0x95/0x260
> <4>[  354.474197]  ? lock_acquire+0xa6/0x1c0
> <4>[  354.474207]  do_vfs_ioctl+0xa0/0x6e0
> <4>[  354.474217]  ? __fget+0xfc/0x1e0
> <4>[  354.474225]  ksys_ioctl+0x35/0x60
> <4>[  354.474233]  __x64_sys_ioctl+0x11/0x20
> <4>[  354.474241]  do_syscall_64+0x55/0x190
> <4>[  354.474251]  entry_SYSCALL_64_after_hwframe+0x49/0xbe
> <4>[  354.474260] RIP: 0033:0x7f44b3de65d7
> <4>[  354.474267] Code: b3 66 90 48 8b 05 b1 48 2d 00 64 c7 00 26 00 00 00 48 c7 c0 ff ff ff ff c3 66 2e 0f 1f 84 00 00 00 00 00 b8 10 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 81 48 2d 00 f7 d8 64 89 01 48
> <4>[  354.474293] RSP: 002b:00007fff974948e8 EFLAGS: 00000246 ORIG_RAX: 0000000000000010
> <4>[  354.474305] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f44b3de65d7
> <4>[  354.474316] RDX: 00007fff97494940 RSI: 00000000c010646c RDI: 0000000000000007
> <4>[  354.474327] RBP: 00007fff97494940 R08: 0000000000000000 R09: 00007f44b40bbc40
> <4>[  354.474337] R10: 0000000000000000 R11: 0000000000000246 R12: 00000000c010646c
> <4>[  354.474348] R13: 0000000000000007 R14: 0000000000000000 R15: 0000000000000000
> 
> v2: Avoid floating requests.
> v3: Can't call dma_fence_signal() under the timeline lock!
> v4: Can't call dma_fence_signal() from inside another fence either.
> 
> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
> ---
>   drivers/gpu/drm/i915/i915_gem.c         | 54 +++++--------------------
>   drivers/gpu/drm/i915/intel_lrc.c        | 13 ++++--
>   drivers/gpu/drm/i915/intel_ringbuffer.c | 13 +++++-
>   3 files changed, 31 insertions(+), 49 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index c55b1f75c980..834240a9b262 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -3308,16 +3308,6 @@ void i915_gem_reset_finish(struct drm_i915_private *dev_priv)
>   }
>   
>   static void nop_submit_request(struct i915_request *request)
> -{
> -	GEM_TRACE("%s fence %llx:%d -> -EIO\n",
> -		  request->engine->name,
> -		  request->fence.context, request->fence.seqno);
> -	dma_fence_set_error(&request->fence, -EIO);
> -
> -	i915_request_submit(request);
> -}
> -
> -static void nop_complete_submit_request(struct i915_request *request)
>   {
>   	unsigned long flags;
>   
> @@ -3354,57 +3344,33 @@ void i915_gem_set_wedged(struct drm_i915_private *i915)
>   	 * rolling the global seqno forward (since this would complete requests
>   	 * for which we haven't set the fence error to EIO yet).
>   	 */
> -	for_each_engine(engine, i915, id) {
> +	for_each_engine(engine, i915, id)
>   		i915_gem_reset_prepare_engine(engine);
>   
> -		engine->submit_request = nop_submit_request;
> -		engine->schedule = NULL;
> -	}
> -	i915->caps.scheduler = 0;
> -
>   	/* Even if the GPU reset fails, it should still stop the engines */
>   	if (INTEL_GEN(i915) >= 5)
>   		intel_gpu_reset(i915, ALL_ENGINES);
>   
> -	/*
> -	 * Make sure no one is running the old callback before we proceed with
> -	 * cancelling requests and resetting the completion tracking. Otherwise
> -	 * we might submit a request to the hardware which never completes.
> -	 */
> -	synchronize_rcu();
> -
>   	for_each_engine(engine, i915, id) {
> -		/* Mark all executing requests as skipped */
> -		engine->cancel_requests(engine);
> -
> -		/*
> -		 * Only once we've force-cancelled all in-flight requests can we
> -		 * start to complete all requests.
> -		 */
> -		engine->submit_request = nop_complete_submit_request;
> +		engine->submit_request = nop_submit_request;
> +		engine->schedule = NULL;
>   	}
> +	i915->caps.scheduler = 0;
>   
>   	/*
>   	 * Make sure no request can slip through without getting completed by
>   	 * either this call here to intel_engine_init_global_seqno, or the one
> -	 * in nop_complete_submit_request.
> +	 * in nop_submit_request.
>   	 */
>   	synchronize_rcu();
>   
> -	for_each_engine(engine, i915, id) {
> -		unsigned long flags;
> -
> -		/*
> -		 * Mark all pending requests as complete so that any concurrent
> -		 * (lockless) lookup doesn't try and wait upon the request as we
> -		 * reset it.
> -		 */
> -		spin_lock_irqsave(&engine->timeline.lock, flags);
> -		intel_engine_init_global_seqno(engine,
> -					       intel_engine_last_submit(engine));
> -		spin_unlock_irqrestore(&engine->timeline.lock, flags);
> +	/* Mark all executing requests as skipped */
> +	for_each_engine(engine, i915, id)
> +		engine->cancel_requests(engine);
>   
> +	for_each_engine(engine, i915, id) {
>   		i915_gem_reset_finish_engine(engine);
> +		intel_engine_wakeup(engine);
>   	}
>   
>   out:
> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> index 11f4e6148557..b5511a054f30 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/intel_lrc.c
> @@ -818,8 +818,11 @@ static void execlists_cancel_requests(struct intel_engine_cs *engine)
>   	/* Mark all executing requests as skipped. */
>   	list_for_each_entry(rq, &engine->timeline.requests, link) {
>   		GEM_BUG_ON(!rq->global_seqno);
> -		if (!i915_request_completed(rq))
> -			dma_fence_set_error(&rq->fence, -EIO);
> +
> +		if (test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &rq->fence.flags))
> +			continue;
> +
> +		dma_fence_set_error(&rq->fence, -EIO);
>   	}
>   
>   	/* Flush the queued requests to the timeline list (for retiring). */
> @@ -830,8 +833,8 @@ static void execlists_cancel_requests(struct intel_engine_cs *engine)
>   		priolist_for_each_request_consume(rq, rn, p, i) {
>   			list_del_init(&rq->sched.link);
>   
> -			dma_fence_set_error(&rq->fence, -EIO);
>   			__i915_request_submit(rq);
> +			dma_fence_set_error(&rq->fence, -EIO);
>   		}
>   
>   		rb_erase_cached(&p->node, &execlists->queue);
> @@ -839,6 +842,10 @@ static void execlists_cancel_requests(struct intel_engine_cs *engine)
>   			kmem_cache_free(engine->i915->priorities, p);
>   	}
>   
> +	intel_write_status_page(engine,
> +				I915_GEM_HWS_INDEX,
> +				intel_engine_last_submit(engine));
> +
>   	/* Remaining _unready_ requests will be nop'ed when submitted */
>   
>   	execlists->queue_priority = INT_MIN;
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
> index 28ae1e436ea6..992889f9e0ff 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.c
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
> @@ -748,9 +748,18 @@ static void cancel_requests(struct intel_engine_cs *engine)
>   	/* Mark all submitted requests as skipped. */
>   	list_for_each_entry(request, &engine->timeline.requests, link) {
>   		GEM_BUG_ON(!request->global_seqno);
> -		if (!i915_request_completed(request))
> -			dma_fence_set_error(&request->fence, -EIO);
> +
> +		if (test_bit(DMA_FENCE_FLAG_SIGNALED_BIT,
> +			     &request->fence.flags))
> +			continue;
> +
> +		dma_fence_set_error(&request->fence, -EIO);
>   	}
> +
> +	intel_write_status_page(engine,
> +				I915_GEM_HWS_INDEX,
> +				intel_engine_last_submit(engine));
> +
>   	/* Remaining _unready_ requests will be nop'ed when submitted */
>   
>   	spin_unlock_irqrestore(&engine->timeline.lock, flags);
> 


More information about the Intel-gfx mailing list