[PATCH 4/4] drm/amdkfd: Avoid hanging hardware in stop_cpsch
shaoyunl
shaoyun.liu at amd.com
Fri Dec 20 19:31:50 UTC 2019
Can we use the dqm_lock when we try to get the dqm->is_hw_hang and
dqm->is_resetting inside function kq_uninitialize ?
I think more closer we check the status to hqd_destroy it will be more
accurate . It does look better with this logic if the status are changed
after dqm unmap_queue call and before we call hqd_destroy .
Another comment in line.
Regards
shaoyun.liu
On 2019-12-20 11:33 a.m., Felix Kuehling wrote:
> dqm->is_hws_hang is protected by the DQM lock. kq_uninitialize runs
> outside that lock protection. Therefore I opted to pass in the hanging
> flag as a parameter. It also keeps the logic that decides all of that
> inside the device queue manager, which I think is cleaner.
>
> I was trying to clean this up further by moving the pm_init/pm_uninit
> out of the start_cpsch/stop_cpsch sequence, but gave up on that idea
> when I found out that I can't create the kernel queue in the DQM
> initialize function because dev->dqm isn't initialized at that time yet.
>
> Regards,
> Felix
>
> On 2019-12-20 10:56, shaoyunl wrote:
>> Looks like patch 2 is not related to this serial , but anyway .
>>
>> Patch 1,2,3 are reviewed by shaoyunl <shaoyun.liu at amd.com>
>>
>> For patch 4 , is it possible we directly check dqm->is_hws_hang ||
>> dqm->is_resetting inside function kq_uninitialize. so we don't need
>> other interface change .
>>
>> I think even Inside that kq_uninitialize function , we still can get
>> dqm as kq->dev->dqm .
>>
>>
>> shaoyun.liu
>>
>>
>> On 2019-12-20 3:30 a.m., Felix Kuehling wrote:
>>> Don't use the HWS if it's known to be hanging. In a reset also
>>> don't try to destroy the HIQ because that may hang on SRIOV if the
>>> KIQ is unresponsive.
>>>
>>> Signed-off-by: Felix Kuehling <Felix.Kuehling at amd.com>
>>> ---
>>> .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 12
>>> ++++++++----
>>> drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c | 8 ++++----
>>> drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c | 4 ++--
>>> drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 4 ++--
>>> .../gpu/drm/amd/amdkfd/kfd_process_queue_manager.c | 2 +-
>>> 5 files changed, 17 insertions(+), 13 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>>> b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>>> index a7e9ec1b3ce3..d7eb6ac37f62 100644
>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>>> @@ -946,7 +946,7 @@ static int start_nocpsch(struct
>>> device_queue_manager *dqm)
>>> static int stop_nocpsch(struct device_queue_manager *dqm)
>>> {
>>> if (dqm->dev->device_info->asic_family == CHIP_HAWAII)
>>> - pm_uninit(&dqm->packets);
>>> + pm_uninit(&dqm->packets, false);
>>> dqm->sched_running = false;
>>> return 0;
>>> @@ -1114,20 +1114,24 @@ static int start_cpsch(struct
>>> device_queue_manager *dqm)
>>> return 0;
>>> fail_allocate_vidmem:
>>> fail_set_sched_resources:
>>> - pm_uninit(&dqm->packets);
>>> + pm_uninit(&dqm->packets, false);
>>> fail_packet_manager_init:
>>> return retval;
>>> }
>>> static int stop_cpsch(struct device_queue_manager *dqm)
>>> {
>>> + bool hanging;
>>> +kq_uninitialize(
>>>
>>> dqm_lock(dqm);
>>> - unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0);
>>> + if (!dqm->is_hws_hang)
[shaoyunl] should we check is_resetting here as well . so we ignore
the unmap call even HWS still not detect the hang but we know we
currently in resetting precedure
>>> + unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES,
>>> 0);
>>> + hanging = dqm->is_hws_hang || dqm->is_resetting;
>>> dqm->sched_running = false;
>>> dqm_unlock(dqm);
>>> kfd_gtt_sa_free(dqm->dev, dqm->fence_mem);
>>> - pm_uninit(&dqm->packets);
>>> + pm_uninit(&dqm->packets, hanging);
>>> return 0;
>>> }
>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
>>> b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
>>> index 2d56dc534459..bae706462f96 100644
>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
>>> @@ -195,9 +195,9 @@ static bool kq_initialize(struct kernel_queue
>>> *kq, struct kfd_dev *dev,
>>> }
>>> /* Uninitialize a kernel queue and free all its memory usages. */
>>> -static void kq_uninitialize(struct kernel_queue *kq)
>>> +static void kq_uninitialize(struct kernel_queue *kq, bool hanging)
>>> {
>>> - if (kq->queue->properties.type == KFD_QUEUE_TYPE_HIQ)
>>> + if (kq->queue->properties.type == KFD_QUEUE_TYPE_HIQ && !hanging)
>>> kq->mqd_mgr->destroy_mqd(kq->mqd_mgr,
>>> kq->queue->mqd,
>>> KFD_PREEMPT_TYPE_WAVEFRONT_RESET,
>>> @@ -337,9 +337,9 @@ struct kernel_queue *kernel_queue_init(struct
>>> kfd_dev *dev,
>>> return NULL;
>>> }
>>> -void kernel_queue_uninit(struct kernel_queue *kq)
>>> +void kernel_queue_uninit(struct kernel_queue *kq, bool hanging)
>>> {
>>> - kq_uninitialize(kq);
>>> + kq_uninitialize(kq, hanging);
>>> kfree(kq);
>>> }
>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
>>> b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
>>> index 6cabed06ef5d..dc406e6dee23 100644
>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
>>> @@ -264,10 +264,10 @@ int pm_init(struct packet_manager *pm, struct
>>> device_queue_manager *dqm)
>>> return 0;
>>> }
>>> -void pm_uninit(struct packet_manager *pm)
>>> +void pm_uninit(struct packet_manager *pm, bool hanging)
>>> {
>>> mutex_destroy(&pm->lock);
>>> - kernel_queue_uninit(pm->priv_queue);
>>> + kernel_queue_uninit(pm->priv_queue, hanging);
>>> }
>>> int pm_send_set_resources(struct packet_manager *pm,
>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>>> b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h I
>>> index 087e96838997..8ac680dc90f1 100644
>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>>> @@ -883,7 +883,7 @@ struct device_queue_manager
>>> *device_queue_manager_init(struct kfd_dev *dev);
>>> void device_queue_manager_uninit(struct device_queue_manager *dqm);
>>> struct kernel_queue *kernel_queue_init(struct kfd_dev *dev,
>>> enum kfd_queue_type type);
>>> -void kernel_queue_uninit(struct kernel_queue *kq);
>>> +void kernel_queue_uninit(struct kernel_queue *kq, bool hanging);
>>> int kfd_process_vm_fault(struct device_queue_manager *dqm,
>>> unsigned int pasid);
>>> /* Process Queue Manager */
>>> @@ -974,7 +974,7 @@ extern const struct packet_manager_funcs
>>> kfd_vi_pm_funcs;
>>> extern const struct packet_manager_funcs kfd_v9_pm_funcs;
>>> int pm_init(struct packet_manager *pm, struct
>>> device_queue_manager *dqm);
>>> -void pm_uninit(struct packet_manager *pm);
>>> +void pm_uninit(struct packet_manager *pm, bool hanging);
>>> int pm_send_set_resources(struct packet_manager *pm,
>>> struct scheduling_resources *res);
>>> int pm_send_runlist(struct packet_manager *pm, struct list_head
>>> *dqm_queues);
>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
>>> b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
>>> index d3eacf72e8db..8fa856e6a03f 100644
>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
>>> @@ -374,7 +374,7 @@ int pqm_destroy_queue(struct
>>> process_queue_manager *pqm, unsigned int qid)
>>> /* destroy kernel queue (DIQ) */
>>> dqm = pqn->kq->dev->dqm;
>>> dqm->ops.destroy_kernel_queue(dqm, pqn->kq, &pdd->qpd);
>>> - kernel_queue_uninit(pqn->kq);
>>> + kernel_queue_uninit(pqn->kq, false);
>>> }
>>> if (pqn->q) {
>> _______________________________________________
>> amd-gfx mailing list
>> amd-gfx at lists.freedesktop.org
>> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flists.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfx&data=02%7C01%7Cfelix.kuehling%40amd.com%7C6df6fef2bf6a4208704e08d785652f27%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637124542109969638&sdata=V0mHmgjUSP%2BauYL3r6PGU7aqFTQz8NkKMcuA5vXSkUQ%3D&reserved=0
>>
More information about the amd-gfx
mailing list