[Intel-gfx] [CI 10/21] drm/i915: Mark up all locked waiters

Zanoni, Paulo R paulo.r.zanoni at intel.com
Mon Sep 12 19:17:42 UTC 2016


Hi

Em Sex, 2016-09-09 às 14:11 +0100, Chris Wilson escreveu:
> In the next patch we want to handle reset directly by a locked waiter
> in
> order to avoid issues with returning before the reset is handled. To
> handle the reset, we must first know whether we hold the
> struct_mutex.
> If we do not hold the struct_mtuex we can not perform the reset, but
> we do
> not block the reset worker either (and so we can just continue to
> wait for
> request completion) - otherwise we must relinquish the mutex.
> 
> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
> Reviewed-by: Mika Kuoppala <mika.kuoppala at intel.com>
> ---
>  drivers/gpu/drm/i915/i915_debugfs.c      |  4 +++-
>  drivers/gpu/drm/i915/i915_gem.c          |  7 +++++--
>  drivers/gpu/drm/i915/i915_gem_evict.c    |  8 ++++++--
>  drivers/gpu/drm/i915/i915_gem_gtt.c      |  2 +-
>  drivers/gpu/drm/i915/i915_gem_request.c  | 15 ++++++++++++---
>  drivers/gpu/drm/i915/i915_gem_request.h  | 11 ++++++++---
>  drivers/gpu/drm/i915/i915_gem_shrinker.c |  2 +-
>  drivers/gpu/drm/i915/intel_ringbuffer.c  |  3 ++-
>  8 files changed, 38 insertions(+), 14 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_debugfs.c
> b/drivers/gpu/drm/i915/i915_debugfs.c
> index 620e7daa133b..64702cc68e3a 100644
> --- a/drivers/gpu/drm/i915/i915_debugfs.c
> +++ b/drivers/gpu/drm/i915/i915_debugfs.c
> @@ -4803,7 +4803,9 @@ i915_drop_caches_set(void *data, u64 val)
>  		return ret;
>  
>  	if (val & DROP_ACTIVE) {
> -		ret = i915_gem_wait_for_idle(dev_priv,
> I915_WAIT_INTERRUPTIBLE);
> +		ret = i915_gem_wait_for_idle(dev_priv,
> +					     I915_WAIT_INTERRUPTIBLE
> |
> +					     I915_WAIT_LOCKED);
>  		if (ret)
>  			goto unlock;
>  	}
> diff --git a/drivers/gpu/drm/i915/i915_gem.c
> b/drivers/gpu/drm/i915/i915_gem.c
> index 4617250c3000..23069a2d2850 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -2802,7 +2802,8 @@ __i915_gem_object_sync(struct
> drm_i915_gem_request *to,
>  
>  	if (!i915.semaphores) {
>  		ret = i915_wait_request(from,
> -					from->i915-
> >mm.interruptible,
> +					from->i915->mm.interruptible 
> |
> +					I915_WAIT_LOCKED,
>  					NULL,
>  					NO_WAITBOOST);
>  		if (ret)
> @@ -4304,7 +4305,9 @@ int i915_gem_suspend(struct drm_device *dev)
>  	if (ret)
>  		goto err;
>  
> -	ret = i915_gem_wait_for_idle(dev_priv,
> I915_WAIT_INTERRUPTIBLE);
> +	ret = i915_gem_wait_for_idle(dev_priv,
> +				     I915_WAIT_INTERRUPTIBLE |
> +				     I915_WAIT_LOCKED);
>  	if (ret)
>  		goto err;
>  
> diff --git a/drivers/gpu/drm/i915/i915_gem_evict.c
> b/drivers/gpu/drm/i915/i915_gem_evict.c
> index 103085246975..5b6f81c1dbca 100644
> --- a/drivers/gpu/drm/i915/i915_gem_evict.c
> +++ b/drivers/gpu/drm/i915/i915_gem_evict.c
> @@ -170,7 +170,9 @@ search_again:
>  	if (ret)
>  		return ret;
>  
> -	ret = i915_gem_wait_for_idle(dev_priv,
> I915_WAIT_INTERRUPTIBLE);
> +	ret = i915_gem_wait_for_idle(dev_priv,
> +				     I915_WAIT_INTERRUPTIBLE |
> +				     I915_WAIT_LOCKED);
>  	if (ret)
>  		return ret;
>  
> @@ -275,7 +277,9 @@ int i915_gem_evict_vm(struct i915_address_space
> *vm, bool do_idle)
>  				return ret;
>  		}
>  
> -		ret = i915_gem_wait_for_idle(dev_priv,
> I915_WAIT_INTERRUPTIBLE);
> +		ret = i915_gem_wait_for_idle(dev_priv,
> +					     I915_WAIT_INTERRUPTIBLE
> |
> +					     I915_WAIT_LOCKED);
>  		if (ret)
>  			return ret;
>  
> diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c
> b/drivers/gpu/drm/i915/i915_gem_gtt.c
> index 9bcac52b8268..f3c6876da521 100644
> --- a/drivers/gpu/drm/i915/i915_gem_gtt.c
> +++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
> @@ -2683,7 +2683,7 @@ void i915_gem_gtt_finish_object(struct
> drm_i915_gem_object *obj)
>  	struct i915_ggtt *ggtt = &dev_priv->ggtt;
>  
>  	if (unlikely(ggtt->do_idle_maps)) {
> -		if (i915_gem_wait_for_idle(dev_priv, 0)) {
> +		if (i915_gem_wait_for_idle(dev_priv,
> I915_WAIT_LOCKED)) {
>  			DRM_ERROR("Failed to wait for idle; VT'd may
> hang.\n");
>  			/* Wait a bit, in hopes it avoids the hang
> */
>  			udelay(10);
> diff --git a/drivers/gpu/drm/i915/i915_gem_request.c
> b/drivers/gpu/drm/i915/i915_gem_request.c
> index f4c15f319d08..5f89801e6a16 100644
> --- a/drivers/gpu/drm/i915/i915_gem_request.c
> +++ b/drivers/gpu/drm/i915/i915_gem_request.c
> @@ -260,7 +260,9 @@ static int i915_gem_init_seqno(struct
> drm_i915_private *dev_priv, u32 seqno)
>  
>  	/* Carefully retire all requests without writing to the
> rings */
>  	for_each_engine(engine, dev_priv) {
> -		ret = intel_engine_idle(engine,
> I915_WAIT_INTERRUPTIBLE);
> +		ret = intel_engine_idle(engine,
> +					I915_WAIT_INTERRUPTIBLE |
> +					I915_WAIT_LOCKED);
>  		if (ret)
>  			return ret;
>  	}
> @@ -625,6 +627,10 @@ int i915_wait_request(struct
> drm_i915_gem_request *req,
>  	int ret = 0;
>  
>  	might_sleep();
> +#if IS_ENABLED(CONFIG_LOCKDEP)
> +	GEM_BUG_ON(!!lockdep_is_held(&req->i915->drm.struct_mutex)
> !=
> +		   !!(flags & I915_WAIT_LOCKED));
> +#endif

I'm hitting this on my SKL. It usually happens right after the login,
when I click the terminal icon on the toolbar in order to open it (on
Cinnamon). When it doesn't happen at that time, I just open a browser
and browse for like 30 seconds until it happens.

I did manual reverts of this series up to this patch and obviously the
problem goes away (no more GEM_BUG_ON to hit). I didn't try to do a
real bisect to check if this is the first bad commit or if it's
something later on the series. It could even be an older problem that
just got exposed by the new BUG(). I'm available to test patches, just
send them to me.

Here's the backtrace:

[   71.341272] ------------[ cut here ]------------
[   71.341305] kernel BUG at
drivers/gpu/drm/i915/i915_gem_request.c:763!
[   71.341331] invalid opcode: 0000 [#1] SMP
[   71.341348] Modules linked in: snd_hda_codec_hdmi
snd_hda_codec_realtek snd_hda_codec_generic snd_soc_skl snd_soc_skl_ipc
snd_soc_sst_match snd_hda_ext_core snd_soc_sst_dsp snd_soc_sst_ipc
nls_utf8 nls_cp437 vfat fat efi_pstore efivars iwlmvm sg snd_hda_intel
i915 iwlwifi snd_hda_codec snd_hwdep snd_hda_core drm_kms_helper idma64
virt_dma syscopyarea sysfillrect sysimgblt fb_sys_fops drm
intel_lpss_pci intel_pch_thermal wmi intel_lpss_acpi intel_lpss
intel_hid acpi_pad sparse_keymap sdhci_pci pinctrl_sunrisepoint
pinctrl_intel
[   71.341608] CPU: 3 PID: 3025 Comm: Xorg Tainted:
G     U          4.8.0-rc5pz+ #52
[   71.341637] Hardware name: Intel Corporation Skylake Client
platform/Skylake Y LPDDR3 RVP3, BIOS SKLSE2R1.86C.B074.R00.1502240607
02/24/2015
[   71.341684] task: ffff880165006140 task.stack: ffff880164394000
[   71.341708] RIP: 0010:[<ffffffffa01d0377>]  [<ffffffffa01d0377>]
i915_wait_request+0x417/0x8e0 [i915]
[   71.341775] RSP: 0018:ffff880164397928  EFLAGS: 00010202
[   71.341795] RAX: 0000000000000000 RBX: 0000000000000000 RCX:
ffffffffffffffff
[   71.341821] RDX: 0000000000000401 RSI: ffff8801658900d0 RDI:
ffff8801650068d8
[   71.341849] RBP: ffff8801643979d8 R08: 0000000000000003 R09:
0000000000000000
[   71.341875] R10: 0000000000000001 R11: 0000000000000000 R12:
0000000000000001
[   71.341913] R13: 0000000000000000 R14: 0000000000000001 R15:
ffff880159b125c0
[   71.341940] FS:  00007fa4922eca40(0000) GS:ffff88016d4c0000(0000)
knlGS:0000000000000000
[   71.341970] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[   71.341992] CR2: 0000560b99a07000 CR3: 00000001641eb000 CR4:
00000000003406e0
[   71.342019] DR0: 0000000000000000 DR1: 0000000000000000 DR2:
0000000000000000
[   71.342046] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7:
0000000000000400
[   71.342073] Stack:
[   71.342083]  0000000000000046 0000000000000004 ffff880100000020
ffffffff827b40e0
[   71.342122]  ffffffffffffffff ffff880165006888 0000000000000001
0000000000000000
[   71.342156]  ffff880165006140 ffffffff810d5560 ffff880164397978
ffff880164397978
[   71.342190] Call Trace:
[   71.342205]  [<ffffffff810d5560>] ? prepare_to_wait_event+0xf0/0xf0
[   71.342267]  [<ffffffffa01d0898>] i915_fence_wait+0x58/0xb0 [i915]
[   71.342294]  [<ffffffff8155e6cf>] fence_wait_timeout+0x8f/0x2a0
[   71.342317]  [<ffffffff815601cc>]
reservation_object_wait_timeout_rcu+0x1bc/0x570
[   71.342346]  [<ffffffff815600af>] ?
reservation_object_wait_timeout_rcu+0x9f/0x570
[   71.342399]  [<ffffffffa0218fca>] intel_prepare_plane_fb+0xca/0x2f0
[i915]
[   71.342426]  [<ffffffff81121247>] ? __module_address+0x27/0xf0
[   71.342455]  [<ffffffffa00e3521>]
drm_atomic_helper_prepare_planes+0x61/0xf0 [drm_kms_helper]
[   71.342510]  [<ffffffffa02115ae>] intel_atomic_commit+0x25e/0x4e0
[i915]
[   71.342549]  [<ffffffffa00929d2>] drm_atomic_commit+0x52/0x60 [drm]
[   71.342592]  [<ffffffffa00e4487>]
drm_atomic_helper_update_plane+0xe7/0x130 [drm_kms_helper]
[   71.342633]  [<ffffffffa0084e19>] __setplane_internal+0x1b9/0x2a0
[drm]
[   71.342667]  [<ffffffffa0085017>]
drm_mode_cursor_universal+0x117/0x200 [drm]
[   71.342703]  [<ffffffffa0085184>] drm_mode_cursor_common+0x84/0x180
[drm]
[   71.342738]  [<ffffffffa0086a99>] drm_mode_cursor2_ioctl+0x9/0x10
[drm]
[   71.342793]  [<ffffffffa007be49>] drm_ioctl+0x389/0x4e0 [drm]
[   71.342823]  [<ffffffffa0086a90>] ? drm_mode_cursor_ioctl+0x40/0x40
[drm]
[   71.342850]  [<ffffffff8124ea49>] ? mntput_no_expire+0x89/0x3c0
[   71.342873]  [<ffffffff8124e9c0>] ? mnt_get_count+0x60/0x60
[   71.342896]  [<ffffffff8123e81f>] do_vfs_ioctl+0x8f/0x690
[   71.342926]  [<ffffffff810ad784>] ? task_work_run+0x94/0xb0
[   71.342948]  [<ffffffff810df24d>] ?
trace_hardirqs_on_caller+0xed/0x1b0
[   71.342974]  [<ffffffff8123ee94>] SyS_ioctl+0x74/0x80
[   71.342995]  [<ffffffff8193f8e5>]
entry_SYSCALL_64_fastpath+0x18/0xa8
[   71.343019] Code: f0 e0 48 85 db 74 18 48 8b 03 48 8b 7b 08 48 83 c3
18 4c 89 fe ff d0 48 8b 03 48 85 c0 75 eb 65 ff 0d 5e c1 e3 5f e9 3a fe
ff ff <0f> 0b 65 48 8b 04 25 40 c5 00 00 48 c7 80 a8 11 00 00 79 03 1d 
[   71.343195] RIP  [<ffffffffa01d0377>] i915_wait_request+0x417/0x8e0
[i915]
[   71.343246]  RSP <ffff880164397928>
[   71.366786] ---[ end trace 715dbae717d2187f ]---

Remember that this makes the desktop 100% unusable.

Thanks,
Paulo

>  
>  	if (i915_gem_request_completed(req))
>  		return 0;
> @@ -667,7 +673,8 @@ int i915_wait_request(struct drm_i915_gem_request
> *req,
>  		goto complete;
>  
>  	set_current_state(state);
> -	add_wait_queue(&req->i915->gpu_error.wait_queue, &reset);
> +	if (flags & I915_WAIT_LOCKED)
> +		add_wait_queue(&req->i915->gpu_error.wait_queue,
> &reset);
>  
>  	intel_wait_init(&wait, req->fence.seqno);
>  	if (intel_engine_add_wait(req->engine, &wait))
> @@ -707,10 +714,12 @@ wakeup:
>  		if (i915_spin_request(req, state, 2))
>  			break;
>  	}
> -	remove_wait_queue(&req->i915->gpu_error.wait_queue, &reset);
>  
>  	intel_engine_remove_wait(req->engine, &wait);
> +	if (flags & I915_WAIT_LOCKED)
> +		remove_wait_queue(&req->i915->gpu_error.wait_queue,
> &reset);
>  	__set_current_state(TASK_RUNNING);
> +
>  complete:
>  	trace_i915_gem_request_wait_end(req);
>  
> diff --git a/drivers/gpu/drm/i915/i915_gem_request.h
> b/drivers/gpu/drm/i915/i915_gem_request.h
> index 479896ef791e..def35721e9ed 100644
> --- a/drivers/gpu/drm/i915/i915_gem_request.h
> +++ b/drivers/gpu/drm/i915/i915_gem_request.h
> @@ -222,7 +222,8 @@ int i915_wait_request(struct drm_i915_gem_request
> *req,
>  		      s64 *timeout,
>  		      struct intel_rps_client *rps)
>  	__attribute__((nonnull(1)));
> -#define I915_WAIT_INTERRUPTIBLE BIT(0)
> +#define I915_WAIT_INTERRUPTIBLE	BIT(0)
> +#define I915_WAIT_LOCKED	BIT(1) /* struct_mutex held, handle
> GPU reset */
>  
>  static inline u32 intel_engine_get_seqno(struct intel_engine_cs
> *engine);
>  
> @@ -576,7 +577,9 @@ i915_gem_active_wait(const struct i915_gem_active
> *active, struct mutex *mutex)
>  	if (!request)
>  		return 0;
>  
> -	return i915_wait_request(request, I915_WAIT_INTERRUPTIBLE,
> NULL, NULL);
> +	return i915_wait_request(request,
> +				 I915_WAIT_INTERRUPTIBLE |
> I915_WAIT_LOCKED,
> +				 NULL, NULL);
>  }
>  
>  /**
> @@ -639,7 +642,9 @@ i915_gem_active_retire(struct i915_gem_active
> *active,
>  	if (!request)
>  		return 0;
>  
> -	ret = i915_wait_request(request, I915_WAIT_INTERRUPTIBLE,
> NULL, NULL);
> +	ret = i915_wait_request(request,
> +				I915_WAIT_INTERRUPTIBLE |
> I915_WAIT_LOCKED,
> +				NULL, NULL);
>  	if (ret)
>  		return ret;
>  
> diff --git a/drivers/gpu/drm/i915/i915_gem_shrinker.c
> b/drivers/gpu/drm/i915/i915_gem_shrinker.c
> index 35a05f4c51c1..1c237d02f30b 100644
> --- a/drivers/gpu/drm/i915/i915_gem_shrinker.c
> +++ b/drivers/gpu/drm/i915/i915_gem_shrinker.c
> @@ -414,7 +414,7 @@ i915_gem_shrinker_vmap(struct notifier_block *nb,
> unsigned long event, void *ptr
>  		return NOTIFY_DONE;
>  
>  	/* Force everything onto the inactive lists */
> -	ret = i915_gem_wait_for_idle(dev_priv, 0);
> +	ret = i915_gem_wait_for_idle(dev_priv, I915_WAIT_LOCKED);
>  	if (ret)
>  		goto out;
>  
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c
> b/drivers/gpu/drm/i915/intel_ringbuffer.c
> index a5bf18877068..8687a84a7ff3 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.c
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
> @@ -2223,7 +2223,8 @@ static int wait_for_space(struct
> drm_i915_gem_request *req, int bytes)
>  	if (WARN_ON(&target->ring_link == &ring->request_list))
>  		return -ENOSPC;
>  
> -	ret = i915_wait_request(target, I915_WAIT_INTERRUPTIBLE,
> +	ret = i915_wait_request(target,
> +				I915_WAIT_INTERRUPTIBLE |
> I915_WAIT_LOCKED,
>  				NULL, NO_WAITBOOST);
>  	if (ret)
>  		return ret;


More information about the Intel-gfx mailing list