[PATCH 1/1] drm/amdkfd: Process notifier release callback don't take mutex

Felix Kuehling felix.kuehling at amd.com
Fri Jul 8 21:18:20 UTC 2022


I think this could also be fixed by not taking the process_info lock in 
svm_range_restore_work and svm_range_set_attr. I'm not even sure why 
we're taking this lock in the SVM code. I think that was copied from the 
restore workers in amdgpu_amdkfd_gpuvm.c because there it's used to 
protect the BO lists. But I think in the case of SVM, the equivalent 
lists are protected by the svms->lock.

Regards,
   Felix


On 2022-07-07 17:23, Philip Yang wrote:
> Move process queues cleanup to deferred work kfd_process_wq_release, to
> avoid potential deadlock circular locking warning:
>
>   WARNING: possible circular locking dependency detected
>                 the existing dependency chain (in reverse order) is:
>        -> #2
>          ((work_completion)(&svms->deferred_list_work)){+.+.}-{0:0}:
>          __flush_work+0x343/0x4a0
>          svm_range_list_lock_and_flush_work+0x39/0xc0
>          svm_range_set_attr+0xe8/0x1080 [amdgpu]
>          kfd_ioctl+0x19b/0x600 [amdgpu]
>          __x64_sys_ioctl+0x81/0xb0
>          do_syscall_64+0x34/0x80
>          entry_SYSCALL_64_after_hwframe+0x44/0xae
>
>        -> #1 (&info->lock#2){+.+.}-{3:3}:
>          __mutex_lock+0xa4/0x940
>          amdgpu_amdkfd_gpuvm_acquire_process_vm+0x2e3/0x590
>          kfd_process_device_init_vm+0x61/0x200 [amdgpu]
>          kfd_ioctl_acquire_vm+0x83/0xb0 [amdgpu]
>          kfd_ioctl+0x19b/0x600 [amdgpu]
>          __x64_sys_ioctl+0x81/0xb0
>          do_syscall_64+0x34/0x80
>         entry_SYSCALL_64_after_hwframe+0x44/0xae
>
>        -> #0 (&process->mutex){+.+.}-{3:3}:
>          __lock_acquire+0x1365/0x23d0
>          lock_acquire+0xc9/0x2e0
>          __mutex_lock+0xa4/0x940
>          kfd_process_notifier_release+0x96/0xe0 [amdgpu]
>          __mmu_notifier_release+0x94/0x210
>          exit_mmap+0x35/0x1f0
>          mmput+0x63/0x120
>          svm_range_deferred_list_work+0x177/0x2c0 [amdgpu]
>          process_one_work+0x2a4/0x600
>          worker_thread+0x39/0x3e0
>          kthread+0x16d/0x1a0
>
>    Possible unsafe locking scenario:
>
>        CPU0                    CPU1
>          ----                    ----
>     lock((work_completion)(&svms->deferred_list_work));
>                                  lock(&info->lock#2);
>               lock((work_completion)(&svms->deferred_list_work));
>     lock(&process->mutex);
>
> Signed-off-by: Philip Yang <Philip.Yang at amd.com>
> ---
>   drivers/gpu/drm/amd/amdkfd/kfd_process.c | 21 +++++++++------------
>   1 file changed, 9 insertions(+), 12 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> index fc38a4d81420..3c9cf9bdb63f 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> @@ -1115,6 +1115,15 @@ static void kfd_process_wq_release(struct work_struct *work)
>   	struct kfd_process *p = container_of(work, struct kfd_process,
>   					     release_work);
>   
> +	kfd_process_dequeue_from_all_devices(p);
> +	pqm_uninit(&p->pqm);
> +
> +	/* Signal the eviction fence after user mode queues are
> +	 * destroyed. This allows any BOs to be freed without
> +	 * triggering pointless evictions or waiting for fences.
> +	 */
> +	dma_fence_signal(p->ef);
> +
>   	kfd_process_remove_sysfs(p);
>   	kfd_iommu_unbind_process(p);
>   
> @@ -1179,20 +1188,8 @@ static void kfd_process_notifier_release(struct mmu_notifier *mn,
>   	cancel_delayed_work_sync(&p->eviction_work);
>   	cancel_delayed_work_sync(&p->restore_work);
>   
> -	mutex_lock(&p->mutex);
> -
> -	kfd_process_dequeue_from_all_devices(p);
> -	pqm_uninit(&p->pqm);
> -
>   	/* Indicate to other users that MM is no longer valid */
>   	p->mm = NULL;
> -	/* Signal the eviction fence after user mode queues are
> -	 * destroyed. This allows any BOs to be freed without
> -	 * triggering pointless evictions or waiting for fences.
> -	 */
> -	dma_fence_signal(p->ef);
> -
> -	mutex_unlock(&p->mutex);
>   
>   	mmu_notifier_put(&p->mmu_notifier);
>   }


More information about the amd-gfx mailing list