[Intel-gfx] [PATCH 2/8] drm/i915: Complete the fences as they are cancelled due to wedging
Tvrtko Ursulin
tvrtko.ursulin at linux.intel.com
Mon Dec 3 17:11:59 UTC 2018
On 03/12/2018 11:36, Chris Wilson wrote:
> We inspect the requests under the assumption that they will be marked as
> completed when they are removed from the queue. Currently however, in the
> process of wedging the requests will be removed from the queue before they
> are completed, so rearrange the code to complete the fences before the
> locks are dropped.
>
> <1>[ 354.473346] BUG: unable to handle kernel NULL pointer dereference at 0000000000000250
> <6>[ 354.473363] PGD 0 P4D 0
> <4>[ 354.473370] Oops: 0000 [#1] PREEMPT SMP PTI
> <4>[ 354.473380] CPU: 0 PID: 4470 Comm: gem_eio Tainted: G U 4.20.0-rc4-CI-CI_DRM_5216+ #1
> <4>[ 354.473393] Hardware name: Intel Corporation NUC7CJYH/NUC7JYB, BIOS JYGLKCPX.86A.0027.2018.0125.1347 01/25/2018
> <4>[ 354.473480] RIP: 0010:__i915_schedule+0x311/0x5e0 [i915]
> <4>[ 354.473490] Code: 49 89 44 24 20 4d 89 4c 24 28 4d 89 29 44 39 b3 a0 04 00 00 7d 3a 41 8b 44 24 78 85 c0 74 13 48 8b 93 78 04 00 00 48 83 e2 fc <39> 82 50 02 00 00 79 1e 44 89 b3 a0 04 00 00 48 8d bb d0 03 00 00
This confuses me, isn't the code segment usually at the end? And then
you have another after the call trace which doesn't match
__i915_scheduel.. anyways, _this_ code seems to be this part:
if (node_to_request(node)->global_seqno &&
90d: 8b 43 78 mov eax,DWORD PTR [rbx+0x78]
910: 85 c0 test eax,eax
912: 74 13 je 927 <__i915_schedule+0x317>
i915_seqno_passed(port_request(engine->execlists.port)->global_seqno,
914: 49 8b 97 c0 04 00 00 mov rdx,QWORD PTR [r15+0x4c0]
91b: 48 83 e2 fc and rdx,0xfffffffffffffffc
if (node_to_request(node)->global_seqno &&
91f: 39 82 50 02 00 00 cmp DWORD PTR [rdx+0x250],eax
925: 79 1e jns 945 <__i915_schedule+0x335>
> <4>[ 354.473515] RSP: 0018:ffffc900001bba90 EFLAGS: 00010046
> <4>[ 354.473524] RAX: 0000000000000003 RBX: ffff8882624c8008 RCX: f34a737800000000
> <4>[ 354.473535] RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffff8882624c8048
> <4>[ 354.473545] RBP: ffffc900001bbab0 R08: 000000005963f1f1 R09: 0000000000000000
> <4>[ 354.473556] R10: ffffc900001bba10 R11: ffff8882624c8060 R12: ffff88824fdd7b98
> <4>[ 354.473567] R13: ffff88824fdd7bb8 R14: 0000000000000001 R15: ffff88824fdd7750
> <4>[ 354.473578] FS: 00007f44b4b5b980(0000) GS:ffff888277e00000(0000) knlGS:0000000000000000
> <4>[ 354.473590] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> <4>[ 354.473599] CR2: 0000000000000250 CR3: 000000026976e000 CR4: 0000000000340ef0
Given the registers above, I think it means this - eax is global_seqno
of the node rq. rdx is is port_request so NULL and bang. No request in
port, but why would there always be one at the point we are scheduling
in a new request to the runnable queue?
Regards,
Tvrtko
> <4>[ 354.473611] Call Trace:
> <4>[ 354.473622] ? lock_acquire+0xa6/0x1c0
> <4>[ 354.473677] ? i915_schedule_bump_priority+0x57/0xd0 [i915]
> <4>[ 354.473736] i915_schedule_bump_priority+0x72/0xd0 [i915]
> <4>[ 354.473792] i915_request_wait+0x4db/0x840 [i915]
> <4>[ 354.473804] ? get_pwq.isra.4+0x2c/0x50
> <4>[ 354.473813] ? ___preempt_schedule+0x16/0x18
> <4>[ 354.473824] ? wake_up_q+0x70/0x70
> <4>[ 354.473831] ? wake_up_q+0x70/0x70
> <4>[ 354.473882] ? gen6_rps_boost+0x118/0x120 [i915]
> <4>[ 354.473936] i915_gem_object_wait_fence+0x8a/0x110 [i915]
> <4>[ 354.473991] i915_gem_object_wait+0x113/0x500 [i915]
> <4>[ 354.474047] i915_gem_wait_ioctl+0x11c/0x2f0 [i915]
> <4>[ 354.474101] ? i915_gem_unset_wedged+0x210/0x210 [i915]
> <4>[ 354.474113] drm_ioctl_kernel+0x81/0xf0
> <4>[ 354.474123] drm_ioctl+0x2de/0x390
> <4>[ 354.474175] ? i915_gem_unset_wedged+0x210/0x210 [i915]
> <4>[ 354.474187] ? finish_task_switch+0x95/0x260
> <4>[ 354.474197] ? lock_acquire+0xa6/0x1c0
> <4>[ 354.474207] do_vfs_ioctl+0xa0/0x6e0
> <4>[ 354.474217] ? __fget+0xfc/0x1e0
> <4>[ 354.474225] ksys_ioctl+0x35/0x60
> <4>[ 354.474233] __x64_sys_ioctl+0x11/0x20
> <4>[ 354.474241] do_syscall_64+0x55/0x190
> <4>[ 354.474251] entry_SYSCALL_64_after_hwframe+0x49/0xbe
> <4>[ 354.474260] RIP: 0033:0x7f44b3de65d7
> <4>[ 354.474267] Code: b3 66 90 48 8b 05 b1 48 2d 00 64 c7 00 26 00 00 00 48 c7 c0 ff ff ff ff c3 66 2e 0f 1f 84 00 00 00 00 00 b8 10 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 81 48 2d 00 f7 d8 64 89 01 48
> <4>[ 354.474293] RSP: 002b:00007fff974948e8 EFLAGS: 00000246 ORIG_RAX: 0000000000000010
> <4>[ 354.474305] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f44b3de65d7
> <4>[ 354.474316] RDX: 00007fff97494940 RSI: 00000000c010646c RDI: 0000000000000007
> <4>[ 354.474327] RBP: 00007fff97494940 R08: 0000000000000000 R09: 00007f44b40bbc40
> <4>[ 354.474337] R10: 0000000000000000 R11: 0000000000000246 R12: 00000000c010646c
> <4>[ 354.474348] R13: 0000000000000007 R14: 0000000000000000 R15: 0000000000000000
>
> v2: Avoid floating requests.
> v3: Can't call dma_fence_signal() under the timeline lock!
> v4: Can't call dma_fence_signal() from inside another fence either.
>
> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
> ---
> drivers/gpu/drm/i915/i915_gem.c | 54 +++++--------------------
> drivers/gpu/drm/i915/intel_lrc.c | 13 ++++--
> drivers/gpu/drm/i915/intel_ringbuffer.c | 13 +++++-
> 3 files changed, 31 insertions(+), 49 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index c55b1f75c980..834240a9b262 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -3308,16 +3308,6 @@ void i915_gem_reset_finish(struct drm_i915_private *dev_priv)
> }
>
> static void nop_submit_request(struct i915_request *request)
> -{
> - GEM_TRACE("%s fence %llx:%d -> -EIO\n",
> - request->engine->name,
> - request->fence.context, request->fence.seqno);
> - dma_fence_set_error(&request->fence, -EIO);
> -
> - i915_request_submit(request);
> -}
> -
> -static void nop_complete_submit_request(struct i915_request *request)
> {
> unsigned long flags;
>
> @@ -3354,57 +3344,33 @@ void i915_gem_set_wedged(struct drm_i915_private *i915)
> * rolling the global seqno forward (since this would complete requests
> * for which we haven't set the fence error to EIO yet).
> */
> - for_each_engine(engine, i915, id) {
> + for_each_engine(engine, i915, id)
> i915_gem_reset_prepare_engine(engine);
>
> - engine->submit_request = nop_submit_request;
> - engine->schedule = NULL;
> - }
> - i915->caps.scheduler = 0;
> -
> /* Even if the GPU reset fails, it should still stop the engines */
> if (INTEL_GEN(i915) >= 5)
> intel_gpu_reset(i915, ALL_ENGINES);
>
> - /*
> - * Make sure no one is running the old callback before we proceed with
> - * cancelling requests and resetting the completion tracking. Otherwise
> - * we might submit a request to the hardware which never completes.
> - */
> - synchronize_rcu();
> -
> for_each_engine(engine, i915, id) {
> - /* Mark all executing requests as skipped */
> - engine->cancel_requests(engine);
> -
> - /*
> - * Only once we've force-cancelled all in-flight requests can we
> - * start to complete all requests.
> - */
> - engine->submit_request = nop_complete_submit_request;
> + engine->submit_request = nop_submit_request;
> + engine->schedule = NULL;
> }
> + i915->caps.scheduler = 0;
>
> /*
> * Make sure no request can slip through without getting completed by
> * either this call here to intel_engine_init_global_seqno, or the one
> - * in nop_complete_submit_request.
> + * in nop_submit_request.
> */
> synchronize_rcu();
>
> - for_each_engine(engine, i915, id) {
> - unsigned long flags;
> -
> - /*
> - * Mark all pending requests as complete so that any concurrent
> - * (lockless) lookup doesn't try and wait upon the request as we
> - * reset it.
> - */
> - spin_lock_irqsave(&engine->timeline.lock, flags);
> - intel_engine_init_global_seqno(engine,
> - intel_engine_last_submit(engine));
> - spin_unlock_irqrestore(&engine->timeline.lock, flags);
> + /* Mark all executing requests as skipped */
> + for_each_engine(engine, i915, id)
> + engine->cancel_requests(engine);
>
> + for_each_engine(engine, i915, id) {
> i915_gem_reset_finish_engine(engine);
> + intel_engine_wakeup(engine);
> }
>
> out:
> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> index 11f4e6148557..b5511a054f30 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/intel_lrc.c
> @@ -818,8 +818,11 @@ static void execlists_cancel_requests(struct intel_engine_cs *engine)
> /* Mark all executing requests as skipped. */
> list_for_each_entry(rq, &engine->timeline.requests, link) {
> GEM_BUG_ON(!rq->global_seqno);
> - if (!i915_request_completed(rq))
> - dma_fence_set_error(&rq->fence, -EIO);
> +
> + if (test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &rq->fence.flags))
> + continue;
> +
> + dma_fence_set_error(&rq->fence, -EIO);
> }
>
> /* Flush the queued requests to the timeline list (for retiring). */
> @@ -830,8 +833,8 @@ static void execlists_cancel_requests(struct intel_engine_cs *engine)
> priolist_for_each_request_consume(rq, rn, p, i) {
> list_del_init(&rq->sched.link);
>
> - dma_fence_set_error(&rq->fence, -EIO);
> __i915_request_submit(rq);
> + dma_fence_set_error(&rq->fence, -EIO);
> }
>
> rb_erase_cached(&p->node, &execlists->queue);
> @@ -839,6 +842,10 @@ static void execlists_cancel_requests(struct intel_engine_cs *engine)
> kmem_cache_free(engine->i915->priorities, p);
> }
>
> + intel_write_status_page(engine,
> + I915_GEM_HWS_INDEX,
> + intel_engine_last_submit(engine));
> +
> /* Remaining _unready_ requests will be nop'ed when submitted */
>
> execlists->queue_priority = INT_MIN;
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
> index 28ae1e436ea6..992889f9e0ff 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.c
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
> @@ -748,9 +748,18 @@ static void cancel_requests(struct intel_engine_cs *engine)
> /* Mark all submitted requests as skipped. */
> list_for_each_entry(request, &engine->timeline.requests, link) {
> GEM_BUG_ON(!request->global_seqno);
> - if (!i915_request_completed(request))
> - dma_fence_set_error(&request->fence, -EIO);
> +
> + if (test_bit(DMA_FENCE_FLAG_SIGNALED_BIT,
> + &request->fence.flags))
> + continue;
> +
> + dma_fence_set_error(&request->fence, -EIO);
> }
> +
> + intel_write_status_page(engine,
> + I915_GEM_HWS_INDEX,
> + intel_engine_last_submit(engine));
> +
> /* Remaining _unready_ requests will be nop'ed when submitted */
>
> spin_unlock_irqrestore(&engine->timeline.lock, flags);
>
More information about the Intel-gfx
mailing list