[Intel-gfx] [CI 10/21] drm/i915: Mark up all locked waiters

Mika Kuoppala mika.kuoppala at linux.intel.com
Tue Sep 13 12:21:48 UTC 2016


Mika Kuoppala <mika.kuoppala at linux.intel.com> writes:

> "Zanoni, Paulo R" <paulo.r.zanoni at intel.com> writes:
>
>> Hi
>>
>> Em Sex, 2016-09-09 às 14:11 +0100, Chris Wilson escreveu:
>>> In the next patch we want to handle reset directly by a locked waiter
>>> in
>>> order to avoid issues with returning before the reset is handled. To
>>> handle the reset, we must first know whether we hold the
>>> struct_mutex.
>>> If we do not hold the struct_mtuex we can not perform the reset, but
>>> we do
>>> not block the reset worker either (and so we can just continue to
>>> wait for
>>> request completion) - otherwise we must relinquish the mutex.
>>> 
>>> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
>>> Reviewed-by: Mika Kuoppala <mika.kuoppala at intel.com>
>>> ---
>>>  drivers/gpu/drm/i915/i915_debugfs.c      |  4 +++-
>>>  drivers/gpu/drm/i915/i915_gem.c          |  7 +++++--
>>>  drivers/gpu/drm/i915/i915_gem_evict.c    |  8 ++++++--
>>>  drivers/gpu/drm/i915/i915_gem_gtt.c      |  2 +-
>>>  drivers/gpu/drm/i915/i915_gem_request.c  | 15 ++++++++++++---
>>>  drivers/gpu/drm/i915/i915_gem_request.h  | 11 ++++++++---
>>>  drivers/gpu/drm/i915/i915_gem_shrinker.c |  2 +-
>>>  drivers/gpu/drm/i915/intel_ringbuffer.c  |  3 ++-
>>>  8 files changed, 38 insertions(+), 14 deletions(-)
>>> 
>>> diff --git a/drivers/gpu/drm/i915/i915_debugfs.c
>>> b/drivers/gpu/drm/i915/i915_debugfs.c
>>> index 620e7daa133b..64702cc68e3a 100644
>>> --- a/drivers/gpu/drm/i915/i915_debugfs.c
>>> +++ b/drivers/gpu/drm/i915/i915_debugfs.c
>>> @@ -4803,7 +4803,9 @@ i915_drop_caches_set(void *data, u64 val)
>>>  		return ret;
>>>  
>>>  	if (val & DROP_ACTIVE) {
>>> -		ret = i915_gem_wait_for_idle(dev_priv,
>>> I915_WAIT_INTERRUPTIBLE);
>>> +		ret = i915_gem_wait_for_idle(dev_priv,
>>> +					     I915_WAIT_INTERRUPTIBLE
>>> |
>>> +					     I915_WAIT_LOCKED);
>>>  		if (ret)
>>>  			goto unlock;
>>>  	}
>>> diff --git a/drivers/gpu/drm/i915/i915_gem.c
>>> b/drivers/gpu/drm/i915/i915_gem.c
>>> index 4617250c3000..23069a2d2850 100644
>>> --- a/drivers/gpu/drm/i915/i915_gem.c
>>> +++ b/drivers/gpu/drm/i915/i915_gem.c
>>> @@ -2802,7 +2802,8 @@ __i915_gem_object_sync(struct
>>> drm_i915_gem_request *to,
>>>  
>>>  	if (!i915.semaphores) {
>>>  		ret = i915_wait_request(from,
>>> -					from->i915-
>>> >mm.interruptible,
>>> +					from->i915->mm.interruptible 
>>> |
>>> +					I915_WAIT_LOCKED,
>>>  					NULL,
>>>  					NO_WAITBOOST);
>>>  		if (ret)
>>> @@ -4304,7 +4305,9 @@ int i915_gem_suspend(struct drm_device *dev)
>>>  	if (ret)
>>>  		goto err;
>>>  
>>> -	ret = i915_gem_wait_for_idle(dev_priv,
>>> I915_WAIT_INTERRUPTIBLE);
>>> +	ret = i915_gem_wait_for_idle(dev_priv,
>>> +				     I915_WAIT_INTERRUPTIBLE |
>>> +				     I915_WAIT_LOCKED);
>>>  	if (ret)
>>>  		goto err;
>>>  
>>> diff --git a/drivers/gpu/drm/i915/i915_gem_evict.c
>>> b/drivers/gpu/drm/i915/i915_gem_evict.c
>>> index 103085246975..5b6f81c1dbca 100644
>>> --- a/drivers/gpu/drm/i915/i915_gem_evict.c
>>> +++ b/drivers/gpu/drm/i915/i915_gem_evict.c
>>> @@ -170,7 +170,9 @@ search_again:
>>>  	if (ret)
>>>  		return ret;
>>>  
>>> -	ret = i915_gem_wait_for_idle(dev_priv,
>>> I915_WAIT_INTERRUPTIBLE);
>>> +	ret = i915_gem_wait_for_idle(dev_priv,
>>> +				     I915_WAIT_INTERRUPTIBLE |
>>> +				     I915_WAIT_LOCKED);
>>>  	if (ret)
>>>  		return ret;
>>>  
>>> @@ -275,7 +277,9 @@ int i915_gem_evict_vm(struct i915_address_space
>>> *vm, bool do_idle)
>>>  				return ret;
>>>  		}
>>>  
>>> -		ret = i915_gem_wait_for_idle(dev_priv,
>>> I915_WAIT_INTERRUPTIBLE);
>>> +		ret = i915_gem_wait_for_idle(dev_priv,
>>> +					     I915_WAIT_INTERRUPTIBLE
>>> |
>>> +					     I915_WAIT_LOCKED);
>>>  		if (ret)
>>>  			return ret;
>>>  
>>> diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c
>>> b/drivers/gpu/drm/i915/i915_gem_gtt.c
>>> index 9bcac52b8268..f3c6876da521 100644
>>> --- a/drivers/gpu/drm/i915/i915_gem_gtt.c
>>> +++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
>>> @@ -2683,7 +2683,7 @@ void i915_gem_gtt_finish_object(struct
>>> drm_i915_gem_object *obj)
>>>  	struct i915_ggtt *ggtt = &dev_priv->ggtt;
>>>  
>>>  	if (unlikely(ggtt->do_idle_maps)) {
>>> -		if (i915_gem_wait_for_idle(dev_priv, 0)) {
>>> +		if (i915_gem_wait_for_idle(dev_priv,
>>> I915_WAIT_LOCKED)) {
>>>  			DRM_ERROR("Failed to wait for idle; VT'd may
>>> hang.\n");
>>>  			/* Wait a bit, in hopes it avoids the hang
>>> */
>>>  			udelay(10);
>>> diff --git a/drivers/gpu/drm/i915/i915_gem_request.c
>>> b/drivers/gpu/drm/i915/i915_gem_request.c
>>> index f4c15f319d08..5f89801e6a16 100644
>>> --- a/drivers/gpu/drm/i915/i915_gem_request.c
>>> +++ b/drivers/gpu/drm/i915/i915_gem_request.c
>>> @@ -260,7 +260,9 @@ static int i915_gem_init_seqno(struct
>>> drm_i915_private *dev_priv, u32 seqno)
>>>  
>>>  	/* Carefully retire all requests without writing to the
>>> rings */
>>>  	for_each_engine(engine, dev_priv) {
>>> -		ret = intel_engine_idle(engine,
>>> I915_WAIT_INTERRUPTIBLE);
>>> +		ret = intel_engine_idle(engine,
>>> +					I915_WAIT_INTERRUPTIBLE |
>>> +					I915_WAIT_LOCKED);
>>>  		if (ret)
>>>  			return ret;
>>>  	}
>>> @@ -625,6 +627,10 @@ int i915_wait_request(struct
>>> drm_i915_gem_request *req,
>>>  	int ret = 0;
>>>  
>>>  	might_sleep();
>>> +#if IS_ENABLED(CONFIG_LOCKDEP)
>>> +	GEM_BUG_ON(!!lockdep_is_held(&req->i915->drm.struct_mutex)
>>> !=
>>> +		   !!(flags & I915_WAIT_LOCKED));
>>> +#endif
>>
>> I'm hitting this on my SKL. It usually happens right after the login,
>> when I click the terminal icon on the toolbar in order to open it (on
>> Cinnamon). When it doesn't happen at that time, I just open a browser
>> and browse for like 30 seconds until it happens.
>>
>> I did manual reverts of this series up to this patch and obviously the
>> problem goes away (no more GEM_BUG_ON to hit). I didn't try to do a
>> real bisect to check if this is the first bad commit or if it's
>> something later on the series. It could even be an older problem that
>> just got exposed by the new BUG(). I'm available to test patches, just
>> send them to me.
>
> No patches yet but there is a comment on 
> intel_prepare_plane_fb() that says that it must be called with mutex
> held.
>
> Quite likely the new GEM_BUG_ON catches this not being true.
>

As Chris pointed out in irc, its about the reverse. Waiting
with mutex when you shouldn't.

-Mika


> -Mika
>
>>
>> Here's the backtrace:
>>
>> [   71.341272] ------------[ cut here ]------------
>> [   71.341305] kernel BUG at
>> drivers/gpu/drm/i915/i915_gem_request.c:763!
>> [   71.341331] invalid opcode: 0000 [#1] SMP
>> [   71.341348] Modules linked in: snd_hda_codec_hdmi
>> snd_hda_codec_realtek snd_hda_codec_generic snd_soc_skl snd_soc_skl_ipc
>> snd_soc_sst_match snd_hda_ext_core snd_soc_sst_dsp snd_soc_sst_ipc
>> nls_utf8 nls_cp437 vfat fat efi_pstore efivars iwlmvm sg snd_hda_intel
>> i915 iwlwifi snd_hda_codec snd_hwdep snd_hda_core drm_kms_helper idma64
>> virt_dma syscopyarea sysfillrect sysimgblt fb_sys_fops drm
>> intel_lpss_pci intel_pch_thermal wmi intel_lpss_acpi intel_lpss
>> intel_hid acpi_pad sparse_keymap sdhci_pci pinctrl_sunrisepoint
>> pinctrl_intel
>> [   71.341608] CPU: 3 PID: 3025 Comm: Xorg Tainted:
>> G     U          4.8.0-rc5pz+ #52
>> [   71.341637] Hardware name: Intel Corporation Skylake Client
>> platform/Skylake Y LPDDR3 RVP3, BIOS SKLSE2R1.86C.B074.R00.1502240607
>> 02/24/2015
>> [   71.341684] task: ffff880165006140 task.stack: ffff880164394000
>> [   71.341708] RIP: 0010:[<ffffffffa01d0377>]  [<ffffffffa01d0377>]
>> i915_wait_request+0x417/0x8e0 [i915]
>> [   71.341775] RSP: 0018:ffff880164397928  EFLAGS: 00010202
>> [   71.341795] RAX: 0000000000000000 RBX: 0000000000000000 RCX:
>> ffffffffffffffff
>> [   71.341821] RDX: 0000000000000401 RSI: ffff8801658900d0 RDI:
>> ffff8801650068d8
>> [   71.341849] RBP: ffff8801643979d8 R08: 0000000000000003 R09:
>> 0000000000000000
>> [   71.341875] R10: 0000000000000001 R11: 0000000000000000 R12:
>> 0000000000000001
>> [   71.341913] R13: 0000000000000000 R14: 0000000000000001 R15:
>> ffff880159b125c0
>> [   71.341940] FS:  00007fa4922eca40(0000) GS:ffff88016d4c0000(0000)
>> knlGS:0000000000000000
>> [   71.341970] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
>> [   71.341992] CR2: 0000560b99a07000 CR3: 00000001641eb000 CR4:
>> 00000000003406e0
>> [   71.342019] DR0: 0000000000000000 DR1: 0000000000000000 DR2:
>> 0000000000000000
>> [   71.342046] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7:
>> 0000000000000400
>> [   71.342073] Stack:
>> [   71.342083]  0000000000000046 0000000000000004 ffff880100000020
>> ffffffff827b40e0
>> [   71.342122]  ffffffffffffffff ffff880165006888 0000000000000001
>> 0000000000000000
>> [   71.342156]  ffff880165006140 ffffffff810d5560 ffff880164397978
>> ffff880164397978
>> [   71.342190] Call Trace:
>> [   71.342205]  [<ffffffff810d5560>] ? prepare_to_wait_event+0xf0/0xf0
>> [   71.342267]  [<ffffffffa01d0898>] i915_fence_wait+0x58/0xb0 [i915]
>> [   71.342294]  [<ffffffff8155e6cf>] fence_wait_timeout+0x8f/0x2a0
>> [   71.342317]  [<ffffffff815601cc>]
>> reservation_object_wait_timeout_rcu+0x1bc/0x570
>> [   71.342346]  [<ffffffff815600af>] ?
>> reservation_object_wait_timeout_rcu+0x9f/0x570
>> [   71.342399]  [<ffffffffa0218fca>] intel_prepare_plane_fb+0xca/0x2f0
>> [i915]
>> [   71.342426]  [<ffffffff81121247>] ? __module_address+0x27/0xf0
>> [   71.342455]  [<ffffffffa00e3521>]
>> drm_atomic_helper_prepare_planes+0x61/0xf0 [drm_kms_helper]
>> [   71.342510]  [<ffffffffa02115ae>] intel_atomic_commit+0x25e/0x4e0
>> [i915]
>> [   71.342549]  [<ffffffffa00929d2>] drm_atomic_commit+0x52/0x60 [drm]
>> [   71.342592]  [<ffffffffa00e4487>]
>> drm_atomic_helper_update_plane+0xe7/0x130 [drm_kms_helper]
>> [   71.342633]  [<ffffffffa0084e19>] __setplane_internal+0x1b9/0x2a0
>> [drm]
>> [   71.342667]  [<ffffffffa0085017>]
>> drm_mode_cursor_universal+0x117/0x200 [drm]
>> [   71.342703]  [<ffffffffa0085184>] drm_mode_cursor_common+0x84/0x180
>> [drm]
>> [   71.342738]  [<ffffffffa0086a99>] drm_mode_cursor2_ioctl+0x9/0x10
>> [drm]
>> [   71.342793]  [<ffffffffa007be49>] drm_ioctl+0x389/0x4e0 [drm]
>> [   71.342823]  [<ffffffffa0086a90>] ? drm_mode_cursor_ioctl+0x40/0x40
>> [drm]
>> [   71.342850]  [<ffffffff8124ea49>] ? mntput_no_expire+0x89/0x3c0
>> [   71.342873]  [<ffffffff8124e9c0>] ? mnt_get_count+0x60/0x60
>> [   71.342896]  [<ffffffff8123e81f>] do_vfs_ioctl+0x8f/0x690
>> [   71.342926]  [<ffffffff810ad784>] ? task_work_run+0x94/0xb0
>> [   71.342948]  [<ffffffff810df24d>] ?
>> trace_hardirqs_on_caller+0xed/0x1b0
>> [   71.342974]  [<ffffffff8123ee94>] SyS_ioctl+0x74/0x80
>> [   71.342995]  [<ffffffff8193f8e5>]
>> entry_SYSCALL_64_fastpath+0x18/0xa8
>> [   71.343019] Code: f0 e0 48 85 db 74 18 48 8b 03 48 8b 7b 08 48 83 c3
>> 18 4c 89 fe ff d0 48 8b 03 48 85 c0 75 eb 65 ff 0d 5e c1 e3 5f e9 3a fe
>> ff ff <0f> 0b 65 48 8b 04 25 40 c5 00 00 48 c7 80 a8 11 00 00 79 03 1d 
>> [   71.343195] RIP  [<ffffffffa01d0377>] i915_wait_request+0x417/0x8e0
>> [i915]
>> [   71.343246]  RSP <ffff880164397928>
>> [   71.366786] ---[ end trace 715dbae717d2187f ]---
>>
>> Remember that this makes the desktop 100% unusable.
>>
>> Thanks,
>> Paulo
>>
>>>  
>>>  	if (i915_gem_request_completed(req))
>>>  		return 0;
>>> @@ -667,7 +673,8 @@ int i915_wait_request(struct drm_i915_gem_request
>>> *req,
>>>  		goto complete;
>>>  
>>>  	set_current_state(state);
>>> -	add_wait_queue(&req->i915->gpu_error.wait_queue, &reset);
>>> +	if (flags & I915_WAIT_LOCKED)
>>> +		add_wait_queue(&req->i915->gpu_error.wait_queue,
>>> &reset);
>>>  
>>>  	intel_wait_init(&wait, req->fence.seqno);
>>>  	if (intel_engine_add_wait(req->engine, &wait))
>>> @@ -707,10 +714,12 @@ wakeup:
>>>  		if (i915_spin_request(req, state, 2))
>>>  			break;
>>>  	}
>>> -	remove_wait_queue(&req->i915->gpu_error.wait_queue, &reset);
>>>  
>>>  	intel_engine_remove_wait(req->engine, &wait);
>>> +	if (flags & I915_WAIT_LOCKED)
>>> +		remove_wait_queue(&req->i915->gpu_error.wait_queue,
>>> &reset);
>>>  	__set_current_state(TASK_RUNNING);
>>> +
>>>  complete:
>>>  	trace_i915_gem_request_wait_end(req);
>>>  
>>> diff --git a/drivers/gpu/drm/i915/i915_gem_request.h
>>> b/drivers/gpu/drm/i915/i915_gem_request.h
>>> index 479896ef791e..def35721e9ed 100644
>>> --- a/drivers/gpu/drm/i915/i915_gem_request.h
>>> +++ b/drivers/gpu/drm/i915/i915_gem_request.h
>>> @@ -222,7 +222,8 @@ int i915_wait_request(struct drm_i915_gem_request
>>> *req,
>>>  		      s64 *timeout,
>>>  		      struct intel_rps_client *rps)
>>>  	__attribute__((nonnull(1)));
>>> -#define I915_WAIT_INTERRUPTIBLE BIT(0)
>>> +#define I915_WAIT_INTERRUPTIBLE	BIT(0)
>>> +#define I915_WAIT_LOCKED	BIT(1) /* struct_mutex held, handle
>>> GPU reset */
>>>  
>>>  static inline u32 intel_engine_get_seqno(struct intel_engine_cs
>>> *engine);
>>>  
>>> @@ -576,7 +577,9 @@ i915_gem_active_wait(const struct i915_gem_active
>>> *active, struct mutex *mutex)
>>>  	if (!request)
>>>  		return 0;
>>>  
>>> -	return i915_wait_request(request, I915_WAIT_INTERRUPTIBLE,
>>> NULL, NULL);
>>> +	return i915_wait_request(request,
>>> +				 I915_WAIT_INTERRUPTIBLE |
>>> I915_WAIT_LOCKED,
>>> +				 NULL, NULL);
>>>  }
>>>  
>>>  /**
>>> @@ -639,7 +642,9 @@ i915_gem_active_retire(struct i915_gem_active
>>> *active,
>>>  	if (!request)
>>>  		return 0;
>>>  
>>> -	ret = i915_wait_request(request, I915_WAIT_INTERRUPTIBLE,
>>> NULL, NULL);
>>> +	ret = i915_wait_request(request,
>>> +				I915_WAIT_INTERRUPTIBLE |
>>> I915_WAIT_LOCKED,
>>> +				NULL, NULL);
>>>  	if (ret)
>>>  		return ret;
>>>  
>>> diff --git a/drivers/gpu/drm/i915/i915_gem_shrinker.c
>>> b/drivers/gpu/drm/i915/i915_gem_shrinker.c
>>> index 35a05f4c51c1..1c237d02f30b 100644
>>> --- a/drivers/gpu/drm/i915/i915_gem_shrinker.c
>>> +++ b/drivers/gpu/drm/i915/i915_gem_shrinker.c
>>> @@ -414,7 +414,7 @@ i915_gem_shrinker_vmap(struct notifier_block *nb,
>>> unsigned long event, void *ptr
>>>  		return NOTIFY_DONE;
>>>  
>>>  	/* Force everything onto the inactive lists */
>>> -	ret = i915_gem_wait_for_idle(dev_priv, 0);
>>> +	ret = i915_gem_wait_for_idle(dev_priv, I915_WAIT_LOCKED);
>>>  	if (ret)
>>>  		goto out;
>>>  
>>> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c
>>> b/drivers/gpu/drm/i915/intel_ringbuffer.c
>>> index a5bf18877068..8687a84a7ff3 100644
>>> --- a/drivers/gpu/drm/i915/intel_ringbuffer.c
>>> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
>>> @@ -2223,7 +2223,8 @@ static int wait_for_space(struct
>>> drm_i915_gem_request *req, int bytes)
>>>  	if (WARN_ON(&target->ring_link == &ring->request_list))
>>>  		return -ENOSPC;
>>>  
>>> -	ret = i915_wait_request(target, I915_WAIT_INTERRUPTIBLE,
>>> +	ret = i915_wait_request(target,
>>> +				I915_WAIT_INTERRUPTIBLE |
>>> I915_WAIT_LOCKED,
>>>  				NULL, NO_WAITBOOST);
>>>  	if (ret)
>>>  		return ret;
>> _______________________________________________
>> Intel-gfx mailing list
>> Intel-gfx at lists.freedesktop.org
>> https://lists.freedesktop.org/mailman/listinfo/intel-gfx
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx at lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/intel-gfx


More information about the Intel-gfx mailing list