[Intel-gfx] [PATCH] drm/i915: Avoid rcu_barrier() from reclaim paths (shrinker)
Daniel Vetter
daniel at ffwll.ch
Wed Mar 15 11:22:55 UTC 2017
On Tue, Mar 14, 2017 at 11:50:19AM +0000, Chris Wilson wrote:
> The rcu_barrier() takes the cpu_hotplug mutex which itself is not
> reclaim-safe, and so rcu_barrier() is illegal from inside the shrinker.
>
> [ 309.661373] =========================================================
> [ 309.661376] [ INFO: possible irq lock inversion dependency detected ]
> [ 309.661380] 4.11.0-rc1-CI-CI_DRM_2333+ #1 Tainted: G W
> [ 309.661383] ---------------------------------------------------------
> [ 309.661386] gem_exec_gttfil/6435 just changed the state of lock:
> [ 309.661389] (rcu_preempt_state.barrier_mutex){+.+.-.}, at: [<ffffffff81100731>] _rcu_barrier+0x31/0x160
> [ 309.661399] but this lock took another, RECLAIM_FS-unsafe lock in the past:
> [ 309.661402] (cpu_hotplug.lock){+.+.+.}
> [ 309.661404]
>
> and interrupts could create inverse lock ordering between them.
>
> [ 309.661410]
> other info that might help us debug this:
> [ 309.661414] Possible interrupt unsafe locking scenario:
>
> [ 309.661417]        CPU0                    CPU1
> [ 309.661419]        ----                    ----
> [ 309.661421]   lock(cpu_hotplug.lock);
> [ 309.661425]                                local_irq_disable();
> [ 309.661432]                                lock(rcu_preempt_state.barrier_mutex);
> [ 309.661441]                                lock(cpu_hotplug.lock);
> [ 309.661446]   <Interrupt>
> [ 309.661448]     lock(rcu_preempt_state.barrier_mutex);
> [ 309.661453]
> *** DEADLOCK ***
>
> [ 309.661460] 4 locks held by gem_exec_gttfil/6435:
> [ 309.661464] #0: (sb_writers#10){.+.+.+}, at: [<ffffffff8120d83d>] vfs_write+0x17d/0x1f0
> [ 309.661475] #1: (debugfs_srcu){......}, at: [<ffffffff81320491>] debugfs_use_file_start+0x41/0xa0
> [ 309.661486] #2: (&attr->mutex){+.+.+.}, at: [<ffffffff8123a3e7>] simple_attr_write+0x37/0xe0
> [ 309.661495] #3: (&dev->struct_mutex){+.+.+.}, at: [<ffffffffa0091b4a>] i915_drop_caches_set+0x3a/0x150 [i915]
> [ 309.661540]
> the shortest dependencies between 2nd lock and 1st lock:
> [ 309.661547] -> (cpu_hotplug.lock){+.+.+.} ops: 829 {
> [ 309.661553] HARDIRQ-ON-W at:
> [ 309.661560] __lock_acquire+0x5e5/0x1b50
> [ 309.661565] lock_acquire+0xc9/0x220
> [ 309.661572] __mutex_lock+0x6e/0x990
> [ 309.661576] mutex_lock_nested+0x16/0x20
> [ 309.661583] get_online_cpus+0x61/0x80
> [ 309.661590] kmem_cache_create+0x25/0x1d0
> [ 309.661596] debug_objects_mem_init+0x30/0x249
> [ 309.661602] start_kernel+0x341/0x3fe
> [ 309.661607] x86_64_start_reservations+0x2a/0x2c
> [ 309.661612] x86_64_start_kernel+0x173/0x186
> [ 309.661619] verify_cpu+0x0/0xfc
> [ 309.661622] SOFTIRQ-ON-W at:
> [ 309.661627] __lock_acquire+0x611/0x1b50
> [ 309.661632] lock_acquire+0xc9/0x220
> [ 309.661636] __mutex_lock+0x6e/0x990
> [ 309.661641] mutex_lock_nested+0x16/0x20
> [ 309.661646] get_online_cpus+0x61/0x80
> [ 309.661650] kmem_cache_create+0x25/0x1d0
> [ 309.661655] debug_objects_mem_init+0x30/0x249
> [ 309.661660] start_kernel+0x341/0x3fe
> [ 309.661664] x86_64_start_reservations+0x2a/0x2c
> [ 309.661669] x86_64_start_kernel+0x173/0x186
> [ 309.661674] verify_cpu+0x0/0xfc
> [ 309.661677] RECLAIM_FS-ON-W at:
> [ 309.661682] mark_held_locks+0x6f/0xa0
> [ 309.661687] lockdep_trace_alloc+0xb3/0x100
> [ 309.661693] kmem_cache_alloc_trace+0x31/0x2e0
> [ 309.661699] __smpboot_create_thread.part.1+0x27/0xe0
> [ 309.661704] smpboot_create_threads+0x61/0x90
> [ 309.661709] cpuhp_invoke_callback+0x9c/0x8a0
> [ 309.661713] cpuhp_up_callbacks+0x31/0xb0
> [ 309.661718] _cpu_up+0x7a/0xc0
> [ 309.661723] do_cpu_up+0x5f/0x80
> [ 309.661727] cpu_up+0xe/0x10
> [ 309.661734] smp_init+0x71/0xb3
> [ 309.661738] kernel_init_freeable+0x94/0x19e
> [ 309.661743] kernel_init+0x9/0xf0
> [ 309.661748] ret_from_fork+0x2e/0x40
> [ 309.661752] INITIAL USE at:
> [ 309.661757] __lock_acquire+0x234/0x1b50
> [ 309.661761] lock_acquire+0xc9/0x220
> [ 309.661766] __mutex_lock+0x6e/0x990
> [ 309.661771] mutex_lock_nested+0x16/0x20
> [ 309.661775] get_online_cpus+0x61/0x80
> [ 309.661780] __cpuhp_setup_state+0x44/0x170
> [ 309.661785] page_alloc_init+0x23/0x3a
> [ 309.661790] start_kernel+0x124/0x3fe
> [ 309.661794] x86_64_start_reservations+0x2a/0x2c
> [ 309.661799] x86_64_start_kernel+0x173/0x186
> [ 309.661804] verify_cpu+0x0/0xfc
> [ 309.661807] }
> [ 309.661813] ... key at: [<ffffffff81e37690>] cpu_hotplug+0xb0/0x100
> [ 309.661817] ... acquired at:
> [ 309.661821] lock_acquire+0xc9/0x220
> [ 309.661825] __mutex_lock+0x6e/0x990
> [ 309.661829] mutex_lock_nested+0x16/0x20
> [ 309.661833] get_online_cpus+0x61/0x80
> [ 309.661837] _rcu_barrier+0x9f/0x160
> [ 309.661841] rcu_barrier+0x10/0x20
> [ 309.661847] netdev_run_todo+0x5f/0x310
> [ 309.661852] rtnl_unlock+0x9/0x10
> [ 309.661856] default_device_exit_batch+0x133/0x150
> [ 309.661862] ops_exit_list.isra.0+0x4d/0x60
> [ 309.661866] cleanup_net+0x1d8/0x2c0
> [ 309.661872] process_one_work+0x1f4/0x6d0
> [ 309.661876] worker_thread+0x49/0x4a0
> [ 309.661881] kthread+0x107/0x140
> [ 309.661884] ret_from_fork+0x2e/0x40
>
> [ 309.661890] -> (rcu_preempt_state.barrier_mutex){+.+.-.} ops: 179 {
> [ 309.661896] HARDIRQ-ON-W at:
> [ 309.661901] __lock_acquire+0x5e5/0x1b50
> [ 309.661905] lock_acquire+0xc9/0x220
> [ 309.661910] __mutex_lock+0x6e/0x990
> [ 309.661914] mutex_lock_nested+0x16/0x20
> [ 309.661919] _rcu_barrier+0x31/0x160
> [ 309.661923] rcu_barrier+0x10/0x20
> [ 309.661928] netdev_run_todo+0x5f/0x310
> [ 309.661932] rtnl_unlock+0x9/0x10
> [ 309.661936] default_device_exit_batch+0x133/0x150
> [ 309.661941] ops_exit_list.isra.0+0x4d/0x60
> [ 309.661946] cleanup_net+0x1d8/0x2c0
> [ 309.661951] process_one_work+0x1f4/0x6d0
> [ 309.661955] worker_thread+0x49/0x4a0
> [ 309.661960] kthread+0x107/0x140
> [ 309.661964] ret_from_fork+0x2e/0x40
> [ 309.661968] SOFTIRQ-ON-W at:
> [ 309.661972] __lock_acquire+0x611/0x1b50
> [ 309.661977] lock_acquire+0xc9/0x220
> [ 309.661981] __mutex_lock+0x6e/0x990
> [ 309.661986] mutex_lock_nested+0x16/0x20
> [ 309.661990] _rcu_barrier+0x31/0x160
> [ 309.661995] rcu_barrier+0x10/0x20
> [ 309.661999] netdev_run_todo+0x5f/0x310
> [ 309.662003] rtnl_unlock+0x9/0x10
> [ 309.662008] default_device_exit_batch+0x133/0x150
> [ 309.662013] ops_exit_list.isra.0+0x4d/0x60
> [ 309.662017] cleanup_net+0x1d8/0x2c0
> [ 309.662022] process_one_work+0x1f4/0x6d0
> [ 309.662027] worker_thread+0x49/0x4a0
> [ 309.662031] kthread+0x107/0x140
> [ 309.662035] ret_from_fork+0x2e/0x40
> [ 309.662039] IN-RECLAIM_FS-W at:
> [ 309.662043] __lock_acquire+0x638/0x1b50
> [ 309.662048] lock_acquire+0xc9/0x220
> [ 309.662053] __mutex_lock+0x6e/0x990
> [ 309.662058] mutex_lock_nested+0x16/0x20
> [ 309.662062] _rcu_barrier+0x31/0x160
> [ 309.662067] rcu_barrier+0x10/0x20
> [ 309.662089] i915_gem_shrink_all+0x33/0x40 [i915]
> [ 309.662109] i915_drop_caches_set+0x141/0x150 [i915]
> [ 309.662114] simple_attr_write+0xc7/0xe0
> [ 309.662119] full_proxy_write+0x4f/0x70
> [ 309.662124] __vfs_write+0x23/0x120
> [ 309.662128] vfs_write+0xc6/0x1f0
> [ 309.662133] SyS_write+0x44/0xb0
> [ 309.662138] entry_SYSCALL_64_fastpath+0x1c/0xb1
> [ 309.662142] INITIAL USE at:
> [ 309.662147] __lock_acquire+0x234/0x1b50
> [ 309.662151] lock_acquire+0xc9/0x220
> [ 309.662156] __mutex_lock+0x6e/0x990
> [ 309.662160] mutex_lock_nested+0x16/0x20
> [ 309.662165] _rcu_barrier+0x31/0x160
> [ 309.662169] rcu_barrier+0x10/0x20
> [ 309.662174] netdev_run_todo+0x5f/0x310
> [ 309.662178] rtnl_unlock+0x9/0x10
> [ 309.662183] default_device_exit_batch+0x133/0x150
> [ 309.662188] ops_exit_list.isra.0+0x4d/0x60
> [ 309.662192] cleanup_net+0x1d8/0x2c0
> [ 309.662197] process_one_work+0x1f4/0x6d0
> [ 309.662202] worker_thread+0x49/0x4a0
> [ 309.662206] kthread+0x107/0x140
> [ 309.662210] ret_from_fork+0x2e/0x40
> [ 309.662214] }
> [ 309.662220] ... key at: [<ffffffff81e4e1c8>] rcu_preempt_state+0x508/0x780
> [ 309.662225] ... acquired at:
> [ 309.662229] check_usage_forwards+0x12b/0x130
> [ 309.662233] mark_lock+0x360/0x6f0
> [ 309.662237] __lock_acquire+0x638/0x1b50
> [ 309.662241] lock_acquire+0xc9/0x220
> [ 309.662245] __mutex_lock+0x6e/0x990
> [ 309.662249] mutex_lock_nested+0x16/0x20
> [ 309.662253] _rcu_barrier+0x31/0x160
> [ 309.662257] rcu_barrier+0x10/0x20
> [ 309.662279] i915_gem_shrink_all+0x33/0x40 [i915]
> [ 309.662298] i915_drop_caches_set+0x141/0x150 [i915]
> [ 309.662303] simple_attr_write+0xc7/0xe0
> [ 309.662307] full_proxy_write+0x4f/0x70
> [ 309.662311] __vfs_write+0x23/0x120
> [ 309.662315] vfs_write+0xc6/0x1f0
> [ 309.662319] SyS_write+0x44/0xb0
> [ 309.662323] entry_SYSCALL_64_fastpath+0x1c/0xb1
>
> [ 309.662329]
> stack backtrace:
> [ 309.662335] CPU: 1 PID: 6435 Comm: gem_exec_gttfil Tainted: G W 4.11.0-rc1-CI-CI_DRM_2333+ #1
> [ 309.662342] Hardware name: Hewlett-Packard HP Compaq 8100 Elite SFF PC/304Ah, BIOS 786H1 v01.13 07/14/2011
> [ 309.662348] Call Trace:
> [ 309.662354] dump_stack+0x67/0x92
> [ 309.662359] print_irq_inversion_bug.part.19+0x1a4/0x1b0
> [ 309.662365] check_usage_forwards+0x12b/0x130
> [ 309.662369] mark_lock+0x360/0x6f0
> [ 309.662374] ? print_shortest_lock_dependencies+0x1a0/0x1a0
> [ 309.662379] __lock_acquire+0x638/0x1b50
> [ 309.662383] ? __mutex_unlock_slowpath+0x3e/0x2e0
> [ 309.662388] ? trace_hardirqs_on+0xd/0x10
> [ 309.662392] ? _rcu_barrier+0x31/0x160
> [ 309.662396] lock_acquire+0xc9/0x220
> [ 309.662400] ? _rcu_barrier+0x31/0x160
> [ 309.662404] ? _rcu_barrier+0x31/0x160
> [ 309.662409] __mutex_lock+0x6e/0x990
> [ 309.662412] ? _rcu_barrier+0x31/0x160
> [ 309.662416] ? _rcu_barrier+0x31/0x160
> [ 309.662421] ? synchronize_rcu_expedited+0x35/0xb0
> [ 309.662426] ? _raw_spin_unlock_irqrestore+0x52/0x60
> [ 309.662434] mutex_lock_nested+0x16/0x20
> [ 309.662438] _rcu_barrier+0x31/0x160
> [ 309.662442] rcu_barrier+0x10/0x20
> [ 309.662464] i915_gem_shrink_all+0x33/0x40 [i915]
> [ 309.662484] i915_drop_caches_set+0x141/0x150 [i915]
> [ 309.662489] simple_attr_write+0xc7/0xe0
> [ 309.662494] full_proxy_write+0x4f/0x70
> [ 309.662498] __vfs_write+0x23/0x120
> [ 309.662503] ? rcu_read_lock_sched_held+0x75/0x80
> [ 309.662507] ? rcu_sync_lockdep_assert+0x2a/0x50
> [ 309.662512] ? __sb_start_write+0x102/0x210
> [ 309.662516] ? vfs_write+0x17d/0x1f0
> [ 309.662520] vfs_write+0xc6/0x1f0
> [ 309.662524] ? trace_hardirqs_on_caller+0xe7/0x200
> [ 309.662529] SyS_write+0x44/0xb0
> [ 309.662533] entry_SYSCALL_64_fastpath+0x1c/0xb1
> [ 309.662537] RIP: 0033:0x7f507eac24a0
> [ 309.662541] RSP: 002b:00007fffda8720e8 EFLAGS: 00000246 ORIG_RAX: 0000000000000001
> [ 309.662548] RAX: ffffffffffffffda RBX: ffffffff81482bd3 RCX: 00007f507eac24a0
> [ 309.662552] RDX: 0000000000000005 RSI: 00007fffda8720f0 RDI: 0000000000000005
> [ 309.662557] RBP: ffffc9000048bf88 R08: 0000000000000000 R09: 000000000000002c
> [ 309.662561] R10: 0000000000000014 R11: 0000000000000246 R12: 00007fffda872230
> [ 309.662566] R13: 00007fffda872228 R14: 0000000000000201 R15: 00007fffda8720f0
> [ 309.662572] ? __this_cpu_preempt_check+0x13/0x20
>
> Fixes: 0eafec6d3244 ("drm/i915: Enable lockless lookup of request tracking via RCU")
> Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=100192
> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
> Cc: Daniel Vetter <daniel.vetter at ffwll.ch>
> Cc: <stable at vger.kernel.org> # v4.9+
Reviewed-by: Daniel Vetter <daniel.vetter at ffwll.ch>
> ---
> drivers/gpu/drm/i915/i915_gem_shrinker.c | 2 +-
> 1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_gem_shrinker.c b/drivers/gpu/drm/i915/i915_gem_shrinker.c
> index 006a8b908f77..4fda4d70d0f9 100644
> --- a/drivers/gpu/drm/i915/i915_gem_shrinker.c
> +++ b/drivers/gpu/drm/i915/i915_gem_shrinker.c
> @@ -266,7 +266,7 @@ unsigned long i915_gem_shrink_all(struct drm_i915_private *dev_priv)
> I915_SHRINK_ACTIVE);
> intel_runtime_pm_put(dev_priv);
>
> - rcu_barrier(); /* wait until our RCU delayed slab frees are completed */
> + synchronize_rcu(); /* wait for our RCU delayed slab frees */
>
> return freed;
> }
> --
> 2.11.0
>
--
Daniel Vetter
Software Engineer, Intel Corporation
http://blog.ffwll.ch
More information about the Intel-gfx
mailing list