[PATCH 5/6] drm/amdkfd: Fix a circular lock dependency
Kuehling, Felix
Felix.Kuehling at amd.com
Wed Jun 5 22:20:00 UTC 2019
This patch looks good to me, but it'll probably change a little if you
implement my suggestions for patch 4.
Regards,
Felix
On 2019-06-05 12:06 p.m., Zeng, Oak wrote:
> The idea to break the circular lock dependency is to move allocate_mqd
> out of dqm lock protection. See callstack #1 below.
>
> [ 59.510149] [drm] Initialized amdgpu 3.30.0 20150101 for 0000:04:00.0 on minor 0
>
> [ 513.604034] ======================================================
> [ 513.604205] WARNING: possible circular locking dependency detected
> [ 513.604375] 4.18.0-kfd-root #2 Tainted: G W
> [ 513.604530] ------------------------------------------------------
> [ 513.604699] kswapd0/611 is trying to acquire lock:
> [ 513.604840] 00000000d254022e (&dqm->lock_hidden){+.+.}, at: evict_process_queues_nocpsch+0x26/0x140 [amdgpu]
> [ 513.605150]
> but task is already holding lock:
> [ 513.605307] 00000000961547fc (&anon_vma->rwsem){++++}, at: page_lock_anon_vma_read+0xe4/0x250
> [ 513.605540]
> which lock already depends on the new lock.
>
> [ 513.605747]
> the existing dependency chain (in reverse order) is:
> [ 513.605944]
> -> #4 (&anon_vma->rwsem){++++}:
> [ 513.606106] __vma_adjust+0x147/0x7f0
> [ 513.606231] __split_vma+0x179/0x190
> [ 513.606353] mprotect_fixup+0x217/0x260
> [ 513.606553] do_mprotect_pkey+0x211/0x380
> [ 513.606752] __x64_sys_mprotect+0x1b/0x20
> [ 513.606954] do_syscall_64+0x50/0x1a0
> [ 513.607149] entry_SYSCALL_64_after_hwframe+0x49/0xbe
> [ 513.607380]
> -> #3 (&mapping->i_mmap_rwsem){++++}:
> [ 513.607678] rmap_walk_file+0x1f0/0x280
> [ 513.607887] page_referenced+0xdd/0x180
> [ 513.608081] shrink_page_list+0x853/0xcb0
> [ 513.608279] shrink_inactive_list+0x33b/0x700
> [ 513.608483] shrink_node_memcg+0x37a/0x7f0
> [ 513.608682] shrink_node+0xd8/0x490
> [ 513.608869] balance_pgdat+0x18b/0x3b0
> [ 513.609062] kswapd+0x203/0x5c0
> [ 513.609241] kthread+0x100/0x140
> [ 513.609420] ret_from_fork+0x24/0x30
> [ 513.609607]
> -> #2 (fs_reclaim){+.+.}:
> [ 513.609883] kmem_cache_alloc_trace+0x34/0x2e0
> [ 513.610093] reservation_object_reserve_shared+0x139/0x300
> [ 513.610326] ttm_bo_init_reserved+0x291/0x480 [ttm]
> [ 513.610567] amdgpu_bo_do_create+0x1d2/0x650 [amdgpu]
> [ 513.610811] amdgpu_bo_create+0x40/0x1f0 [amdgpu]
> [ 513.611041] amdgpu_bo_create_reserved+0x249/0x2d0 [amdgpu]
> [ 513.611290] amdgpu_bo_create_kernel+0x12/0x70 [amdgpu]
> [ 513.611584] amdgpu_ttm_init+0x2cb/0x560 [amdgpu]
> [ 513.611823] gmc_v9_0_sw_init+0x400/0x750 [amdgpu]
> [ 513.612491] amdgpu_device_init+0x14eb/0x1990 [amdgpu]
> [ 513.612730] amdgpu_driver_load_kms+0x78/0x290 [amdgpu]
> [ 513.612958] drm_dev_register+0x111/0x1a0
> [ 513.613171] amdgpu_pci_probe+0x11c/0x1e0 [amdgpu]
> [ 513.613389] local_pci_probe+0x3f/0x90
> [ 513.613581] pci_device_probe+0x102/0x1c0
> [ 513.613779] driver_probe_device+0x2a7/0x480
> [ 513.613984] __driver_attach+0x10a/0x110
> [ 513.614179] bus_for_each_dev+0x67/0xc0
> [ 513.614372] bus_add_driver+0x1eb/0x260
> [ 513.614565] driver_register+0x5b/0xe0
> [ 513.614756] do_one_initcall+0xac/0x357
> [ 513.614952] do_init_module+0x5b/0x213
> [ 513.615145] load_module+0x2542/0x2d30
> [ 513.615337] __do_sys_finit_module+0xd2/0x100
> [ 513.615541] do_syscall_64+0x50/0x1a0
> [ 513.615731] entry_SYSCALL_64_after_hwframe+0x49/0xbe
> [ 513.615963]
> -> #1 (reservation_ww_class_mutex){+.+.}:
> [ 513.616293] amdgpu_amdkfd_alloc_gtt_mem+0xcf/0x2c0 [amdgpu]
> [ 513.616554] init_mqd+0x223/0x260 [amdgpu]
> [ 513.616779] create_queue_nocpsch+0x4d9/0x600 [amdgpu]
> [ 513.617031] pqm_create_queue+0x37c/0x520 [amdgpu]
> [ 513.617270] kfd_ioctl_create_queue+0x2f9/0x650 [amdgpu]
> [ 513.617522] kfd_ioctl+0x202/0x350 [amdgpu]
> [ 513.617724] do_vfs_ioctl+0x9f/0x6c0
> [ 513.617914] ksys_ioctl+0x66/0x70
> [ 513.618095] __x64_sys_ioctl+0x16/0x20
> [ 513.618286] do_syscall_64+0x50/0x1a0
> [ 513.618476] entry_SYSCALL_64_after_hwframe+0x49/0xbe
> [ 513.618695]
> -> #0 (&dqm->lock_hidden){+.+.}:
> [ 513.618984] __mutex_lock+0x98/0x970
> [ 513.619197] evict_process_queues_nocpsch+0x26/0x140 [amdgpu]
> [ 513.619459] kfd_process_evict_queues+0x3b/0xb0 [amdgpu]
> [ 513.619710] kgd2kfd_quiesce_mm+0x1c/0x40 [amdgpu]
> [ 513.620103] amdgpu_amdkfd_evict_userptr+0x38/0x70 [amdgpu]
> [ 513.620363] amdgpu_mn_invalidate_range_start_hsa+0xa6/0xc0 [amdgpu]
> [ 513.620614] __mmu_notifier_invalidate_range_start+0x70/0xb0
> [ 513.620851] try_to_unmap_one+0x7fc/0x8f0
> [ 513.621049] rmap_walk_anon+0x121/0x290
> [ 513.621242] try_to_unmap+0x93/0xf0
> [ 513.621428] shrink_page_list+0x606/0xcb0
> [ 513.621625] shrink_inactive_list+0x33b/0x700
> [ 513.621835] shrink_node_memcg+0x37a/0x7f0
> [ 513.622034] shrink_node+0xd8/0x490
> [ 513.622219] balance_pgdat+0x18b/0x3b0
> [ 513.622410] kswapd+0x203/0x5c0
> [ 513.622589] kthread+0x100/0x140
> [ 513.622769] ret_from_fork+0x24/0x30
> [ 513.622957]
> other info that might help us debug this:
>
> [ 513.623354] Chain exists of:
> &dqm->lock_hidden --> &mapping->i_mmap_rwsem --> &anon_vma->rwsem
>
> [ 513.623900] Possible unsafe locking scenario:
>
> [ 513.624189] CPU0 CPU1
> [ 513.624397] ---- ----
> [ 513.624594] lock(&anon_vma->rwsem);
> [ 513.624771] lock(&mapping->i_mmap_rwsem);
> [ 513.625020] lock(&anon_vma->rwsem);
> [ 513.625253] lock(&dqm->lock_hidden);
> [ 513.625433]
> *** DEADLOCK ***
>
> [ 513.625783] 3 locks held by kswapd0/611:
> [ 513.625967] #0: 00000000f14edf84 (fs_reclaim){+.+.}, at: __fs_reclaim_acquire+0x5/0x30
> [ 513.626309] #1: 00000000961547fc (&anon_vma->rwsem){++++}, at: page_lock_anon_vma_read+0xe4/0x250
> [ 513.626671] #2: 0000000067b5cd12 (srcu){....}, at: __mmu_notifier_invalidate_range_start+0x5/0xb0
> [ 513.627037]
> stack backtrace:
> [ 513.627292] CPU: 0 PID: 611 Comm: kswapd0 Tainted: G W 4.18.0-kfd-root #2
> [ 513.627632] Hardware name: innotek GmbH VirtualBox/VirtualBox, BIOS VirtualBox 12/01/2006
> [ 513.627990] Call Trace:
> [ 513.628143] dump_stack+0x7c/0xbb
> [ 513.628315] print_circular_bug.isra.37+0x21b/0x228
> [ 513.628581] __lock_acquire+0xf7d/0x1470
> [ 513.628782] ? unwind_next_frame+0x6c/0x4f0
> [ 513.628974] ? lock_acquire+0xec/0x1e0
> [ 513.629154] lock_acquire+0xec/0x1e0
> [ 513.629357] ? evict_process_queues_nocpsch+0x26/0x140 [amdgpu]
> [ 513.629587] __mutex_lock+0x98/0x970
> [ 513.629790] ? evict_process_queues_nocpsch+0x26/0x140 [amdgpu]
> [ 513.630047] ? evict_process_queues_nocpsch+0x26/0x140 [amdgpu]
> [ 513.630309] ? evict_process_queues_nocpsch+0x26/0x140 [amdgpu]
> [ 513.630562] evict_process_queues_nocpsch+0x26/0x140 [amdgpu]
> [ 513.630816] kfd_process_evict_queues+0x3b/0xb0 [amdgpu]
> [ 513.631057] kgd2kfd_quiesce_mm+0x1c/0x40 [amdgpu]
> [ 513.631288] amdgpu_amdkfd_evict_userptr+0x38/0x70 [amdgpu]
> [ 513.631536] amdgpu_mn_invalidate_range_start_hsa+0xa6/0xc0 [amdgpu]
> [ 513.632076] __mmu_notifier_invalidate_range_start+0x70/0xb0
> [ 513.632299] try_to_unmap_one+0x7fc/0x8f0
> [ 513.632487] ? page_lock_anon_vma_read+0x68/0x250
> [ 513.632690] rmap_walk_anon+0x121/0x290
> [ 513.632875] try_to_unmap+0x93/0xf0
> [ 513.633050] ? page_remove_rmap+0x330/0x330
> [ 513.633239] ? rcu_read_unlock+0x60/0x60
> [ 513.633422] ? page_get_anon_vma+0x160/0x160
> [ 513.633613] shrink_page_list+0x606/0xcb0
> [ 513.633800] shrink_inactive_list+0x33b/0x700
> [ 513.633997] shrink_node_memcg+0x37a/0x7f0
> [ 513.634186] ? shrink_node+0xd8/0x490
> [ 513.634363] shrink_node+0xd8/0x490
> [ 513.634537] balance_pgdat+0x18b/0x3b0
> [ 513.634718] kswapd+0x203/0x5c0
> [ 513.634887] ? wait_woken+0xb0/0xb0
> [ 513.635062] kthread+0x100/0x140
> [ 513.635231] ? balance_pgdat+0x3b0/0x3b0
> [ 513.635414] ? kthread_delayed_work_timer_fn+0x80/0x80
> [ 513.635626] ret_from_fork+0x24/0x30
> [ 513.636042] Evicting PASID 32768 queues
> [ 513.936236] Restoring PASID 32768 queues
> [ 524.708912] Evicting PASID 32768 queues
> [ 524.999875] Restoring PASID 32768 queues
>
> Change-Id: I334c8c9329be12e468ea7aabc878842ec003bd8e
> Signed-off-by: Oak Zeng <Oak.Zeng at amd.com>
> ---
> .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 21 +++++++++++----------
> 1 file changed, 11 insertions(+), 10 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> index 787b5be..6b1a2ee 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> @@ -274,6 +274,12 @@ static int create_queue_nocpsch(struct device_queue_manager *dqm,
>
> print_queue(q);
>
> + mqd_mgr = dqm->mqd_mgrs[get_mqd_type_from_queue_type(
> + q->properties.type)];
> + q->mqd_mem_obj = mqd_mgr->allocate_mqd(mqd_mgr->dev, &q->properties);
> + if (!q->mqd_mem_obj)
> + return -ENOMEM;
> +
> dqm_lock(dqm);
>
> if (dqm->total_queue_count >= max_num_of_queues_per_device) {
> @@ -299,8 +305,6 @@ static int create_queue_nocpsch(struct device_queue_manager *dqm,
> q->properties.tba_addr = qpd->tba_addr;
> q->properties.tma_addr = qpd->tma_addr;
>
> - mqd_mgr = dqm->mqd_mgrs[get_mqd_type_from_queue_type(
> - q->properties.type)];
> if (q->properties.type == KFD_QUEUE_TYPE_COMPUTE) {
> retval = allocate_hqd(dqm, q);
> if (retval)
> @@ -319,13 +323,10 @@ static int create_queue_nocpsch(struct device_queue_manager *dqm,
> if (retval)
> goto out_deallocate_hqd;
>
> - q->mqd_mem_obj = mqd_mgr->allocate_mqd(mqd_mgr->dev, &q->properties);
> - if (!q->mqd_mem_obj)
> - goto out_deallocate_doorbell;
> retval = mqd_mgr->init_mqd(mqd_mgr, &q->mqd, q->mqd_mem_obj,
> &q->gart_mqd_addr, &q->properties);
> if (retval)
> - goto out_uninit_mqd;
> + goto out_deallocate_doorbell;
>
> if (q->properties.is_active) {
>
> @@ -336,7 +337,7 @@ static int create_queue_nocpsch(struct device_queue_manager *dqm,
> retval = mqd_mgr->load_mqd(mqd_mgr, q->mqd, q->pipe,
> q->queue, &q->properties, current->mm);
> if (retval)
> - goto out_uninit_mqd;
> + goto out_deallocate_doorbell;
> }
>
> list_add(&q->list, &qpd->queues_list);
> @@ -356,10 +357,9 @@ static int create_queue_nocpsch(struct device_queue_manager *dqm,
> dqm->total_queue_count++;
> pr_debug("Total of %d queues are accountable so far\n",
> dqm->total_queue_count);
> - goto out_unlock;
> + dqm_unlock(dqm);
> + return retval;
>
> -out_uninit_mqd:
> - mqd_mgr->uninit_mqd(mqd_mgr, q->mqd, q->mqd_mem_obj);
> out_deallocate_doorbell:
> deallocate_doorbell(qpd, q);
> out_deallocate_hqd:
> @@ -373,6 +373,7 @@ static int create_queue_nocpsch(struct device_queue_manager *dqm,
> deallocate_vmid(dqm, qpd, q);
> out_unlock:
> dqm_unlock(dqm);
> + mqd_mgr->uninit_mqd(mqd_mgr, q->mqd, q->mqd_mem_obj);
> return retval;
> }
>
More information about the amd-gfx
mailing list