[PATCH] drm/amdgpu: Fixed a potential circular lock

Felix Kuehling felix.kuehling at amd.com
Fri Sep 29 19:33:42 UTC 2017


Thanks Oak. The change is Reviewed-by: Felix Kuehling
<Felix.Kuehling at amd.com>


On 2017-09-29 10:28 AM, ozeng wrote:
> The circular locking dependency (potential deadlock) scenario captured
> is as follows. The idea of the fix is to move read_user_wptr outside
> of the acquire_queue...release_queue critical section.
>
> [   63.477482] WARNING: possible circular locking dependency detected
> [   63.484091] 4.12.0-kfd-ozeng #3 Not tainted
> [   63.488531] ------------------------------------------------------
> [   63.495146] HelloWorldLoop/2526 is trying to acquire lock:
> [   63.501011]  (&mm->mmap_sem){++++++}, at: [<ffffffff911898ce>] __might_fault+0x3e/0x90
> [   63.509472]
>                but task is already holding lock:
> [   63.515716]  (&adev->srbm_mutex){+.+...}, at: [<ffffffffc0484feb>] lock_srbm+0x2b/0x50 [amdgpu]
> [   63.525099]
>                which lock already depends on the new lock.
>
> [   63.533841]
>                the existing dependency chain (in reverse order) is:
> [   63.541839]
>                -> #2 (&adev->srbm_mutex){+.+...}:
> [   63.548178]        lock_acquire+0x6d/0x90
> [   63.552461]        __mutex_lock+0x70/0x8c0
> [   63.556826]        mutex_lock_nested+0x16/0x20
> [   63.561603]        gfx_v8_0_kiq_resume+0x1039/0x14a0 [amdgpu]
> [   63.567817]        gfx_v8_0_hw_init+0x204d/0x2210 [amdgpu]
> [   63.573675]        amdgpu_device_init+0xdea/0x1790 [amdgpu]
> [   63.579640]        amdgpu_driver_load_kms+0x63/0x220 [amdgpu]
> [   63.585743]        drm_dev_register+0x145/0x1e0
> [   63.590605]        amdgpu_pci_probe+0x11e/0x160 [amdgpu]
> [   63.596266]        local_pci_probe+0x40/0xa0
> [   63.600803]        pci_device_probe+0x134/0x150
> [   63.605650]        driver_probe_device+0x2a1/0x460
> [   63.610785]        __driver_attach+0xdc/0xe0
> [   63.615321]        bus_for_each_dev+0x5f/0x90
> [   63.619984]        driver_attach+0x19/0x20
> [   63.624337]        bus_add_driver+0x40/0x270
> [   63.628908]        driver_register+0x5b/0xe0
> [   63.633446]        __pci_register_driver+0x5b/0x60
> [   63.638586]        rtsx_pci_switch_output_voltage+0x1d/0x20 [rtsx_pci]
> [   63.645564]        do_one_initcall+0x4c/0x1b0
> [   63.650205]        do_init_module+0x56/0x1ea
> [   63.654767]        load_module+0x208c/0x27d0
> [   63.659335]        SYSC_finit_module+0x96/0xd0
> [   63.664058]        SyS_finit_module+0x9/0x10
> [   63.668629]        entry_SYSCALL_64_fastpath+0x1f/0xbe
> [   63.674088]
>                -> #1 (reservation_ww_class_mutex){+.+.+.}:
> [   63.681257]        lock_acquire+0x6d/0x90
> [   63.685551]        __ww_mutex_lock.constprop.11+0x8c/0xed0
> [   63.691426]        ww_mutex_lock+0x67/0x70
> [   63.695802]        amdgpu_verify_access+0x6d/0x100 [amdgpu]
> [   63.701743]        ttm_bo_mmap+0x8e/0x100 [ttm]
> [   63.706615]        amdgpu_bo_mmap+0xd/0x60 [amdgpu]
> [   63.711814]        amdgpu_mmap+0x35/0x40 [amdgpu]
> [   63.716904]        mmap_region+0x3b5/0x5a0
> [   63.721255]        do_mmap+0x400/0x4d0
> [   63.725260]        vm_mmap_pgoff+0xb0/0xf0
> [   63.729625]        SyS_mmap_pgoff+0x19e/0x260
> [   63.734292]        SyS_mmap+0x1d/0x20
> [   63.738199]        entry_SYSCALL_64_fastpath+0x1f/0xbe
> [   63.743681]
>                -> #0 (&mm->mmap_sem){++++++}:
> [   63.749641]        __lock_acquire+0x1401/0x1420
> [   63.754491]        lock_acquire+0x6d/0x90
> [   63.758750]        __might_fault+0x6b/0x90
> [   63.763176]        kgd_hqd_load+0x24f/0x270 [amdgpu]
> [   63.768432]        load_mqd+0x4b/0x50 [amdkfd]
> [   63.773192]        create_queue_nocpsch+0x535/0x620 [amdkfd]
> [   63.779237]        pqm_create_queue+0x34d/0x4f0 [amdkfd]
> [   63.784835]        kfd_ioctl_create_queue+0x282/0x670 [amdkfd]
> [   63.790973]        kfd_ioctl+0x310/0x4d0 [amdkfd]
> [   63.795944]        do_vfs_ioctl+0x90/0x6e0
> [   63.800268]        SyS_ioctl+0x74/0x80
> [   63.804207]        entry_SYSCALL_64_fastpath+0x1f/0xbe
> [   63.809607]
>                other info that might help us debug this:
>
> [   63.818026] Chain exists of:
>                  &mm->mmap_sem --> reservation_ww_class_mutex --> &adev->srbm_mutex
>
> [   63.830382]  Possible unsafe locking scenario:
>
> [   63.836605]        CPU0                    CPU1
> [   63.841364]        ----                    ----
> [   63.846123]   lock(&adev->srbm_mutex);
> [   63.850061]                                lock(reservation_ww_class_mutex);
> [   63.857475]                                lock(&adev->srbm_mutex);
> [   63.864084]   lock(&mm->mmap_sem);
> [   63.867657]
>                 *** DEADLOCK ***
>
> [   63.873884] 3 locks held by HelloWorldLoop/2526:
> [   63.878739]  #0:  (&process->mutex){+.+.+.}, at: [<ffffffffc06e1a9a>] kfd_ioctl_create_queue+0x24a/0x670 [amdkfd]
> [   63.889543]  #1:  (&dqm->lock){+.+...}, at: [<ffffffffc06eedeb>] create_queue_nocpsch+0x3b/0x620 [amdkfd]
> [   63.899684]  #2:  (&adev->srbm_mutex){+.+...}, at: [<ffffffffc0484feb>] lock_srbm+0x2b/0x50 [amdgpu]
> [   63.909500]
>                stack backtrace:
> [   63.914187] CPU: 3 PID: 2526 Comm: HelloWorldLoop Not tainted 4.12.0-kfd-ozeng #3
> [   63.922184] Hardware name: AMD Carrizo/Gardenia, BIOS WGA5819N_Weekly_15_08_1 08/19/2015
> [   63.930865] Call Trace:
> [   63.933464]  dump_stack+0x85/0xc9
> [   63.936999]  print_circular_bug+0x1f9/0x207
> [   63.941442]  __lock_acquire+0x1401/0x1420
> [   63.945745]  ? lock_srbm+0x2b/0x50 [amdgpu]
> [   63.950185]  lock_acquire+0x6d/0x90
> [   63.953885]  ? __might_fault+0x3e/0x90
> [   63.957899]  __might_fault+0x6b/0x90
> [   63.961699]  ? __might_fault+0x3e/0x90
> [   63.965755]  kgd_hqd_load+0x24f/0x270 [amdgpu]
> [   63.970577]  load_mqd+0x4b/0x50 [amdkfd]
> [   63.974745]  create_queue_nocpsch+0x535/0x620 [amdkfd]
> [   63.980242]  pqm_create_queue+0x34d/0x4f0 [amdkfd]
> [   63.985320]  kfd_ioctl_create_queue+0x282/0x670 [amdkfd]
> [   63.991021]  kfd_ioctl+0x310/0x4d0 [amdkfd]
> [   63.995499]  ? kfd_ioctl_destroy_queue+0x70/0x70 [amdkfd]
> [   64.001234]  do_vfs_ioctl+0x90/0x6e0
> [   64.005065]  ? up_read+0x1a/0x40
> [   64.008496]  SyS_ioctl+0x74/0x80
> [   64.011955]  entry_SYSCALL_64_fastpath+0x1f/0xbe
> [   64.016863] RIP: 0033:0x7f4b3bd35f07
> [   64.020696] RSP: 002b:00007ffe7689ec38 EFLAGS: 00000246 ORIG_RAX: 0000000000000010
> [   64.028786] RAX: ffffffffffffffda RBX: 00000000002a2000 RCX: 00007f4b3bd35f07
> [   64.036414] RDX: 00007ffe7689ecb0 RSI: 00000000c0584b02 RDI: 0000000000000005
> [   64.044045] RBP: 00007f4a3212d000 R08: 00007f4b3c919000 R09: 0000000000080000
> [   64.051674] R10: 00007f4b376b64b8 R11: 0000000000000246 R12: 00007f4a3212d000
> [   64.059324] R13: 0000000000000015 R14: 0000000000000064 R15: 00007ffe7689ef50
>
> Signed-off-by: Oak Zeng <Oak.Zeng at amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c | 10 +++++++++-
>  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c | 10 +++++++++-
>  2 files changed, 18 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c
> index dc7e25c..47d1c13 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c
> @@ -338,6 +338,7 @@ static int kgd_hqd_load(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id,
>  	struct cik_mqd *m;
>  	uint32_t *mqd_hqd;
>  	uint32_t reg, wptr_val, data;
> +	bool valid_wptr = false;
>  
>  	m = get_mqd(mqd);
>  
> @@ -356,7 +357,14 @@ static int kgd_hqd_load(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id,
>  			     CP_HQD_PQ_DOORBELL_CONTROL, DOORBELL_EN, 1);
>  	WREG32(mmCP_HQD_PQ_DOORBELL_CONTROL, data);
>  
> -	if (read_user_wptr(mm, wptr, wptr_val))
> +	/* read_user_wptr may take the mm->mmap_sem.
> +	 * Release srbm_mutex to avoid a circular dependency between
> +	 * srbm_mutex->mmap_sem->reservation_ww_class_mutex->srbm_mutex.
> +	 */
> +	release_queue(kgd);
> +	valid_wptr = read_user_wptr(mm, wptr, wptr_val);
> +	acquire_queue(kgd, pipe_id, queue_id);
> +	if (valid_wptr)
>  		WREG32(mmCP_HQD_PQ_WPTR, (wptr_val << wptr_shift) & wptr_mask);
>  
>  	data = REG_SET_FIELD(m->cp_hqd_active, CP_HQD_ACTIVE, ACTIVE, 1);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c
> index c678c69..056929b 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c
> @@ -292,6 +292,7 @@ static int kgd_hqd_load(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id,
>  	struct vi_mqd *m;
>  	uint32_t *mqd_hqd;
>  	uint32_t reg, wptr_val, data;
> +	bool valid_wptr = false;
>  
>  	m = get_mqd(mqd);
>  
> @@ -339,7 +340,14 @@ static int kgd_hqd_load(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id,
>  			     CP_HQD_PQ_DOORBELL_CONTROL, DOORBELL_EN, 1);
>  	WREG32(mmCP_HQD_PQ_DOORBELL_CONTROL, data);
>  
> -	if (read_user_wptr(mm, wptr, wptr_val))
> +	/* read_user_wptr may take the mm->mmap_sem.
> +	 * Release srbm_mutex to avoid a circular dependency between
> +	 * srbm_mutex->mmap_sem->reservation_ww_class_mutex->srbm_mutex.
> +	 */
> +	release_queue(kgd);
> +	valid_wptr = read_user_wptr(mm, wptr, wptr_val);
> +	acquire_queue(kgd, pipe_id, queue_id);
> +	if (valid_wptr)
>  		WREG32(mmCP_HQD_PQ_WPTR, (wptr_val << wptr_shift) & wptr_mask);
>  
>  	data = REG_SET_FIELD(m->cp_hqd_active, CP_HQD_ACTIVE, ACTIVE, 1);



More information about the amd-gfx mailing list