[Intel-gfx] [PATCH] drm/i915: Preallocate mmu notifier to unbreak cpu hotplug deadlock

Thu Oct 5 13:51:13 UTC 2017

Quoting Daniel Vetter (2017-10-05 14:22:06)
> 4.14-rc1 gained the fancy new cross-release support in lockdep, which
> seems to have uncovered a few more rules about what is allowed and
> isn't.
> 
> This one here seems to indicate that allocating a work-queue while
> holding mmap_sem is a no-go, so let's try to preallocate it.
> 
> Of course another way to break this chain would be somewhere in the
> cpu hotplug code, since this isn't the only trace we're finding now
> which goes through msr_device_create.
> 
> Full lockdep splat:
> 
> ======================================================
> WARNING: possible circular locking dependency detected
> 4.14.0-rc3-CI-CI_DRM_3179+ #1 Tainted: G     U
> ------------------------------------------------------
> kworker/3:4/562 is trying to acquire lock:
>  (cpu_hotplug_lock.rw_sem){++++}, at: [<ffffffff8113d4bc>] stop_machine+0x1c/0x40
> 
> but task is already holding lock:
>  (&dev->struct_mutex){+.+.}, at: [<ffffffffa0136588>] i915_reset_device+0x1e8/0x260 [i915]
> 
> which lock already depends on the new lock.
> 
> the existing dependency chain (in reverse order) is:
> 
> -> #6 (&dev->struct_mutex){+.+.}:
>        __lock_acquire+0x1420/0x15e0
>        lock_acquire+0xb0/0x200
>        __mutex_lock+0x86/0x9b0
>        mutex_lock_interruptible_nested+0x1b/0x20
>        i915_mutex_lock_interruptible+0x51/0x130 [i915]
>        i915_gem_fault+0x209/0x650 [i915]
>        __do_fault+0x1e/0x80
>        __handle_mm_fault+0xa08/0xed0
>        handle_mm_fault+0x156/0x300
>        __do_page_fault+0x2c5/0x570
>        do_page_fault+0x28/0x250
>        page_fault+0x22/0x30
> 
> -> #5 (&mm->mmap_sem){++++}:
>        __lock_acquire+0x1420/0x15e0
>        lock_acquire+0xb0/0x200
>        __might_fault+0x68/0x90
>        _copy_to_user+0x23/0x70
>        filldir+0xa5/0x120
>        dcache_readdir+0xf9/0x170
>        iterate_dir+0x69/0x1a0
>        SyS_getdents+0xa5/0x140
>        entry_SYSCALL_64_fastpath+0x1c/0xb1
> 
> -> #4 (&sb->s_type->i_mutex_key#5){++++}:
>        down_write+0x3b/0x70
>        handle_create+0xcb/0x1e0
>        devtmpfsd+0x139/0x180
>        kthread+0x152/0x190
>        ret_from_fork+0x27/0x40
> 
> -> #3 ((complete)&req.done){+.+.}:
>        __lock_acquire+0x1420/0x15e0
>        lock_acquire+0xb0/0x200
>        wait_for_common+0x58/0x210
>        wait_for_completion+0x1d/0x20
>        devtmpfs_create_node+0x13d/0x160
>        device_add+0x5eb/0x620
>        device_create_groups_vargs+0xe0/0xf0
>        device_create+0x3a/0x40
>        msr_device_create+0x2b/0x40
>        cpuhp_invoke_callback+0xc9/0xbf0
>        cpuhp_thread_fun+0x17b/0x240
>        smpboot_thread_fn+0x18a/0x280
>        kthread+0x152/0x190
>        ret_from_fork+0x27/0x40
> 
> -> #2 (cpuhp_state-up){+.+.}:
>        __lock_acquire+0x1420/0x15e0
>        lock_acquire+0xb0/0x200
>        cpuhp_issue_call+0x133/0x1c0
>        __cpuhp_setup_state_cpuslocked+0x139/0x2a0
>        __cpuhp_setup_state+0x46/0x60
>        page_writeback_init+0x43/0x67
>        pagecache_init+0x3d/0x42
>        start_kernel+0x3a8/0x3fc
>        x86_64_start_reservations+0x2a/0x2c
>        x86_64_start_kernel+0x6d/0x70
>        verify_cpu+0x0/0xfb
> 
> -> #1 (cpuhp_state_mutex){+.+.}:
>        __lock_acquire+0x1420/0x15e0
>        lock_acquire+0xb0/0x200
>        __mutex_lock+0x86/0x9b0
>        mutex_lock_nested+0x1b/0x20
>        __cpuhp_setup_state_cpuslocked+0x53/0x2a0
>        __cpuhp_setup_state+0x46/0x60
>        page_alloc_init+0x28/0x30
>        start_kernel+0x145/0x3fc
>        x86_64_start_reservations+0x2a/0x2c
>        x86_64_start_kernel+0x6d/0x70
>        verify_cpu+0x0/0xfb
> 
> -> #0 (cpu_hotplug_lock.rw_sem){++++}:
>        check_prev_add+0x430/0x840
>        __lock_acquire+0x1420/0x15e0
>        lock_acquire+0xb0/0x200
>        cpus_read_lock+0x3d/0xb0
>        stop_machine+0x1c/0x40
>        i915_gem_set_wedged+0x1a/0x20 [i915]
>        i915_reset+0xb9/0x230 [i915]
>        i915_reset_device+0x1f6/0x260 [i915]
>        i915_handle_error+0x2d8/0x430 [i915]
>        hangcheck_declare_hang+0xd3/0xf0 [i915]
>        i915_hangcheck_elapsed+0x262/0x2d0 [i915]
>        process_one_work+0x233/0x660
>        worker_thread+0x4e/0x3b0
>        kthread+0x152/0x190
>        ret_from_fork+0x27/0x40
> 
> other info that might help us debug this:
> 
> Chain exists of:
>   cpu_hotplug_lock.rw_sem --> &mm->mmap_sem --> &dev->struct_mutex
> 
>  Possible unsafe locking scenario:
> 
>        CPU0                    CPU1
>        ----                    ----
>   lock(&dev->struct_mutex);
>                                lock(&mm->mmap_sem);
>                                lock(&dev->struct_mutex);
>   lock(cpu_hotplug_lock.rw_sem);
> 
>  *** DEADLOCK ***
> 
> 3 locks held by kworker/3:4/562:
>  #0:  ("events_long"){+.+.}, at: [<ffffffff8109c64a>] process_one_work+0x1aa/0x660
>  #1:  ((&(&i915->gpu_error.hangcheck_work)->work)){+.+.}, at: [<ffffffff8109c64a>] process_one_work+0x1aa/0x660
>  #2:  (&dev->struct_mutex){+.+.}, at: [<ffffffffa0136588>] i915_reset_device+0x1e8/0x260 [i915]
> 
> stack backtrace:
> CPU: 3 PID: 562 Comm: kworker/3:4 Tainted: G     U          4.14.0-rc3-CI-CI_DRM_3179+ #1
> Hardware name:                  /NUC7i5BNB, BIOS BNKBL357.86A.0048.2017.0704.1415 07/04/2017
> Workqueue: events_long i915_hangcheck_elapsed [i915]
> Call Trace:
>  dump_stack+0x68/0x9f
>  print_circular_bug+0x235/0x3c0
>  ? lockdep_init_map_crosslock+0x20/0x20
>  check_prev_add+0x430/0x840
>  ? irq_work_queue+0x86/0xe0
>  ? wake_up_klogd+0x53/0x70
>  __lock_acquire+0x1420/0x15e0
>  ? __lock_acquire+0x1420/0x15e0
>  ? lockdep_init_map_crosslock+0x20/0x20
>  lock_acquire+0xb0/0x200
>  ? stop_machine+0x1c/0x40
>  ? i915_gem_object_truncate+0x50/0x50 [i915]
>  cpus_read_lock+0x3d/0xb0
>  ? stop_machine+0x1c/0x40
>  stop_machine+0x1c/0x40
>  i915_gem_set_wedged+0x1a/0x20 [i915]
>  i915_reset+0xb9/0x230 [i915]
>  i915_reset_device+0x1f6/0x260 [i915]
>  ? gen8_gt_irq_ack+0x170/0x170 [i915]
>  ? work_on_cpu_safe+0x60/0x60
>  i915_handle_error+0x2d8/0x430 [i915]
>  ? vsnprintf+0xd1/0x4b0
>  ? scnprintf+0x3a/0x70
>  hangcheck_declare_hang+0xd3/0xf0 [i915]
>  ? intel_runtime_pm_put+0x56/0xa0 [i915]
>  i915_hangcheck_elapsed+0x262/0x2d0 [i915]
>  process_one_work+0x233/0x660
>  worker_thread+0x4e/0x3b0
>  kthread+0x152/0x190
>  ? process_one_work+0x660/0x660
>  ? kthread_create_on_node+0x40/0x40
>  ret_from_fork+0x27/0x40
> Setting dangerous option reset - tainting kernel
> i915 0000:00:02.0: Resetting chip after gpu hang
> Setting dangerous option reset - tainting kernel
> i915 0000:00:02.0: Resetting chip after gpu hang
> 
> Cc: Chris Wilson <chris at chris-wilson.co.uk>
> Cc: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
> Cc: Joonas Lahtinen <joonas.lahtinen at linux.intel.com>
> Cc: Peter Zijlstra <peterz at infradead.org>
> Cc: Thomas Gleixner <tglx at linutronix.de>
> Cc: Sasha Levin <alexander.levin at verizon.com>
> Signed-off-by: Daniel Vetter <daniel.vetter at intel.com>
> 
> --
> 
> Patch itself pretty much untested, I just want to figure out whether
> we should fix this (and similar backtraces going through
> msr_device_create) in i915, or whether the cpu hotplug folks will take
> care of them all.

Looking at the patch, we shrink the time spent under mmap_sem/mm_lock, so it
seems sensible (just from that pov).

> Afaict this is not because of changes in 4.14-rc1, but really only due
> to the new cross-release checks.
> 
> Note that the above trace is on top of -rc3 (plus plenty of drm/i915
> stuff), so should have Peter's recent lockdep fixes for cpu up vs.
> down already.
> -Daniel
> ---
>  drivers/gpu/drm/i915/i915_gem_userptr.c | 65 +++++++++++++++------------------
>  1 file changed, 29 insertions(+), 36 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_gem_userptr.c b/drivers/gpu/drm/i915/i915_gem_userptr.c
> index 2d4996de7331..afe166921572 100644
> --- a/drivers/gpu/drm/i915/i915_gem_userptr.c
> +++ b/drivers/gpu/drm/i915/i915_gem_userptr.c
> @@ -160,36 +160,6 @@ static const struct mmu_notifier_ops i915_gem_userptr_notifier = {
>         .invalidate_range_start = i915_gem_userptr_mn_invalidate_range_start,
>  };
>  
> -static struct i915_mmu_notifier *
> -i915_mmu_notifier_create(struct mm_struct *mm)
> -{
> -       struct i915_mmu_notifier *mn;
> -       int ret;
> -
> -       mn = kmalloc(sizeof(*mn), GFP_KERNEL);
> -       if (mn == NULL)
> -               return ERR_PTR(-ENOMEM);
> -
> -       spin_lock_init(&mn->lock);
> -       mn->mn.ops = &i915_gem_userptr_notifier;
> -       mn->objects = RB_ROOT_CACHED;
> -       mn->wq = alloc_workqueue("i915-userptr-release", WQ_UNBOUND, 0);
> -       if (mn->wq == NULL) {
> -               kfree(mn);
> -               return ERR_PTR(-ENOMEM);
> -       }
> -
> -        /* Protected by mmap_sem (write-lock) */
> -       ret = __mmu_notifier_register(&mn->mn, mm);
> -       if (ret) {
> -               destroy_workqueue(mn->wq);
> -               kfree(mn);
> -               return ERR_PTR(ret);
> -       }
> -
> -       return mn;
> -}
> -
>  static void
>  i915_gem_userptr_release__mmu_notifier(struct drm_i915_gem_object *obj)
>  {
> @@ -210,23 +180,46 @@ i915_gem_userptr_release__mmu_notifier(struct drm_i915_gem_object *obj)
>  static struct i915_mmu_notifier *
>  i915_mmu_notifier_find(struct i915_mm_struct *mm)
>  {
> -       struct i915_mmu_notifier *mn = mm->mn;
> +       struct i915_mmu_notifier *mn = mm->mn, *free_mn;
> +       int ret;
>  
>         mn = mm->mn;
>         if (mn)
>                 return mn;
>  
> +       free_mn = kmalloc(sizeof(*free_mn), GFP_KERNEL);
> +       if (free_mn == NULL)
> +               return ERR_PTR(-ENOMEM);
> +
> +       spin_lock_init(&free_mn->lock);
> +       free_mn->mn.ops = &i915_gem_userptr_notifier;
> +       free_mn->objects = RB_ROOT_CACHED;
> +       /* must allocate wq outside of mm->mmap_sem */
> +       free_mn->wq = alloc_workqueue("i915-userptr-release", WQ_UNBOUND, 0);
> +       if (free_mn->wq == NULL) {
> +               ret = -ENOMEM;
> +               goto err_kfree;
> +       }
> +
>         down_write(&mm->mm->mmap_sem);
>         mutex_lock(&mm->i915->mm_lock);
> -       if ((mn = mm->mn) == NULL) {
> -               mn = i915_mmu_notifier_create(mm->mm);
> -               if (!IS_ERR(mn))
> -                       mm->mn = mn;
> +       if (mm->mn == NULL) {
> +                /* Protected by mmap_sem (write-lock) */
> +               ret = __mmu_notifier_register(&free_mn->mn, mm->mm);
> +               if (ret == 0)
> +                       mm->mn = mn =free_mn;
> +       } else {
> +               mn = mm->mn;
>         }
>         mutex_unlock(&mm->i915->mm_lock);
>         up_write(&mm->mm->mmap_sem);
>  
> -       return mn;
> +err_wq:
> +       destroy_workqueue(free_mn->wq);
> +err_kfree:
> +       kfree(free_mn);
> +
> +       return ret == 0 ? mn : ERR_PTR(ret);
>  }

I had a go,

diff --git a/drivers/gpu/drm/i915/i915_gem_userptr.c b/drivers/gpu/drm/i915/i915_gem_userptr.c
index 2d4996de7331..f9b3406401af 100644
--- a/drivers/gpu/drm/i915/i915_gem_userptr.c
+++ b/drivers/gpu/drm/i915/i915_gem_userptr.c
@@ -164,7 +164,6 @@ static struct i915_mmu_notifier *
 i915_mmu_notifier_create(struct mm_struct *mm)
 {
        struct i915_mmu_notifier *mn;
-       int ret;
 
        mn = kmalloc(sizeof(*mn), GFP_KERNEL);
        if (mn == NULL)
@@ -179,14 +178,6 @@ i915_mmu_notifier_create(struct mm_struct *mm)
                return ERR_PTR(-ENOMEM);
        }
 
-        /* Protected by mmap_sem (write-lock) */
-       ret = __mmu_notifier_register(&mn->mn, mm);
-       if (ret) {
-               destroy_workqueue(mn->wq);
-               kfree(mn);
-               return ERR_PTR(ret);
-       }
-
        return mn;
 }
 
@@ -210,23 +201,37 @@ i915_gem_userptr_release__mmu_notifier(struct drm_i915_gem_object *obj)
 static struct i915_mmu_notifier *
 i915_mmu_notifier_find(struct i915_mm_struct *mm)
 {
-       struct i915_mmu_notifier *mn = mm->mn;
+       struct i915_mmu_notifier *mn;
+       int err;
 
        mn = mm->mn;
        if (mn)
                return mn;
 
+       mn = i915_mmu_notifier_create(mm->mm);
+       if (IS_ERR(mn))
+               return mn;
+
+       err = 0;
        down_write(&mm->mm->mmap_sem);
        mutex_lock(&mm->i915->mm_lock);
-       if ((mn = mm->mn) == NULL) {
-               mn = i915_mmu_notifier_create(mm->mm);
-               if (!IS_ERR(mn))
-                       mm->mn = mn;
+       if (mm->mn == NULL) {
+               /* Protected by mmap_sem (write-lock) */
+               err = __mmu_notifier_register(&mn->mn, mm->mm);
+               if (!err) {
+                       /* Protected by mm_lock */
+                       mm->mn = fetch_and_zero(&mn);
+               }
        }
        mutex_unlock(&mm->i915->mm_lock);
        up_write(&mm->mm->mmap_sem);
 
-       return mn;
+       if (mn) {
+               destroy_workqueue(mn->wq);
+               kfree(mn);
+       }
+
+       return err ? ERR_PTR(err) : mm->mn;
 }

So I'm not sold that we want to keep the separate single-use
i915_mmu_notifier_create() (the onion unwind is pretty solid).

But it's safe enough to return mm->mn (it's stable once we've proved it's
set) and that allows us to avoid a second free_mn. I prefer the
register above as I think it's that touch clearer.

In particular, I believe your use of free_mn is buggy as you still free
it after assigning it to mm->mn.
-Chris