[PATCH 4/4] drm/amdkfd: Avoid hanging hardware in stop_cpsch

Felix Kuehling felix.kuehling at amd.com
Fri Dec 20 16:33:05 UTC 2019


dqm->is_hws_hang is protected by the DQM lock. kq_uninitialize runs 
outside that lock protection. Therefore I opted to pass in the hanging 
flag as a parameter. It also keeps the logic that decides all of that 
inside the device queue manager, which I think is cleaner.

I was trying to clean this up further by moving the pm_init/pm_uninit 
out of the start_cpsch/stop_cpsch sequence, but gave up on that idea 
when I found out that I can't create the kernel queue in the DQM 
initialize function because dev->dqm isn't initialized at that time yet.

Regards,
   Felix

On 2019-12-20 10:56, shaoyunl wrote:
> Looks like patch 2 is not related to this series, but anyway.
>
> Patch 1,2,3 are reviewed by shaoyunl  <shaoyun.liu at amd.com>
>
> For patch 4, is it possible to directly check dqm->is_hws_hang ||
> dqm->is_resetting inside the function kq_uninitialize? Then we wouldn't
> need any other interface changes.
>
> I think even inside the kq_uninitialize function, we can still get
> dqm as kq->dev->dqm.
>
>
> shaoyun.liu
>
>
> On 2019-12-20 3:30 a.m., Felix Kuehling wrote:
>> Don't use the HWS if it's known to be hanging. In a reset also
>> don't try to destroy the HIQ because that may hang on SRIOV if the
>> KIQ is unresponsive.
>>
>> Signed-off-by: Felix Kuehling <Felix.Kuehling at amd.com>
>> ---
>>   .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.c    | 12 ++++++++----
>>   drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c        |  8 ++++----
>>   drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c      |  4 ++--
>>   drivers/gpu/drm/amd/amdkfd/kfd_priv.h                |  4 ++--
>>   .../gpu/drm/amd/amdkfd/kfd_process_queue_manager.c   |  2 +-
>>   5 files changed, 17 insertions(+), 13 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c 
>> b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>> index a7e9ec1b3ce3..d7eb6ac37f62 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>> @@ -946,7 +946,7 @@ static int start_nocpsch(struct 
>> device_queue_manager *dqm)
>>   static int stop_nocpsch(struct device_queue_manager *dqm)
>>   {
>>       if (dqm->dev->device_info->asic_family == CHIP_HAWAII)
>> -        pm_uninit(&dqm->packets);
>> +        pm_uninit(&dqm->packets, false);
>>       dqm->sched_running = false;
>>         return 0;
>> @@ -1114,20 +1114,24 @@ static int start_cpsch(struct 
>> device_queue_manager *dqm)
>>       return 0;
>>   fail_allocate_vidmem:
>>   fail_set_sched_resources:
>> -    pm_uninit(&dqm->packets);
>> +    pm_uninit(&dqm->packets, false);
>>   fail_packet_manager_init:
>>       return retval;
>>   }
>>     static int stop_cpsch(struct device_queue_manager *dqm)
>>   {
>> +    bool hanging;
>> +
>>
>>       dqm_lock(dqm);
>> -    unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0);
>> +    if (!dqm->is_hws_hang)
>> +        unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0);
>> +    hanging = dqm->is_hws_hang || dqm->is_resetting;
>>       dqm->sched_running = false;
>>       dqm_unlock(dqm);
>>         kfd_gtt_sa_free(dqm->dev, dqm->fence_mem);
>> -    pm_uninit(&dqm->packets);
>> +    pm_uninit(&dqm->packets, hanging);
>>         return 0;
>>   }
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c 
>> b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
>> index 2d56dc534459..bae706462f96 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
>> @@ -195,9 +195,9 @@ static bool kq_initialize(struct kernel_queue 
>> *kq, struct kfd_dev *dev,
>>   }
>>     /* Uninitialize a kernel queue and free all its memory usages. */
>> -static void kq_uninitialize(struct kernel_queue *kq)
>> +static void kq_uninitialize(struct kernel_queue *kq, bool hanging)
>>   {
>> -    if (kq->queue->properties.type == KFD_QUEUE_TYPE_HIQ)
>> +    if (kq->queue->properties.type == KFD_QUEUE_TYPE_HIQ && !hanging)
>>           kq->mqd_mgr->destroy_mqd(kq->mqd_mgr,
>>                       kq->queue->mqd,
>>                       KFD_PREEMPT_TYPE_WAVEFRONT_RESET,
>> @@ -337,9 +337,9 @@ struct kernel_queue *kernel_queue_init(struct 
>> kfd_dev *dev,
>>       return NULL;
>>   }
>>   -void kernel_queue_uninit(struct kernel_queue *kq)
>> +void kernel_queue_uninit(struct kernel_queue *kq, bool hanging)
>>   {
>> -    kq_uninitialize(kq);
>> +    kq_uninitialize(kq, hanging);
>>       kfree(kq);
>>   }
>>   diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c 
>> b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
>> index 6cabed06ef5d..dc406e6dee23 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
>> @@ -264,10 +264,10 @@ int pm_init(struct packet_manager *pm, struct 
>> device_queue_manager *dqm)
>>       return 0;
>>   }
>>   -void pm_uninit(struct packet_manager *pm)
>> +void pm_uninit(struct packet_manager *pm, bool hanging)
>>   {
>>       mutex_destroy(&pm->lock);
>> -    kernel_queue_uninit(pm->priv_queue);
>> +    kernel_queue_uninit(pm->priv_queue, hanging);
>>   }
>>     int pm_send_set_resources(struct packet_manager *pm,
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h 
>> b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>> index 087e96838997..8ac680dc90f1 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>> @@ -883,7 +883,7 @@ struct device_queue_manager 
>> *device_queue_manager_init(struct kfd_dev *dev);
>>   void device_queue_manager_uninit(struct device_queue_manager *dqm);
>>   struct kernel_queue *kernel_queue_init(struct kfd_dev *dev,
>>                       enum kfd_queue_type type);
>> -void kernel_queue_uninit(struct kernel_queue *kq);
>> +void kernel_queue_uninit(struct kernel_queue *kq, bool hanging);
>>   int kfd_process_vm_fault(struct device_queue_manager *dqm, unsigned 
>> int pasid);
>>     /* Process Queue Manager */
>> @@ -974,7 +974,7 @@ extern const struct packet_manager_funcs 
>> kfd_vi_pm_funcs;
>>   extern const struct packet_manager_funcs kfd_v9_pm_funcs;
>>     int pm_init(struct packet_manager *pm, struct 
>> device_queue_manager *dqm);
>> -void pm_uninit(struct packet_manager *pm);
>> +void pm_uninit(struct packet_manager *pm, bool hanging);
>>   int pm_send_set_resources(struct packet_manager *pm,
>>                   struct scheduling_resources *res);
>>   int pm_send_runlist(struct packet_manager *pm, struct list_head 
>> *dqm_queues);
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c 
>> b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
>> index d3eacf72e8db..8fa856e6a03f 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
>> @@ -374,7 +374,7 @@ int pqm_destroy_queue(struct 
>> process_queue_manager *pqm, unsigned int qid)
>>           /* destroy kernel queue (DIQ) */
>>           dqm = pqn->kq->dev->dqm;
>>           dqm->ops.destroy_kernel_queue(dqm, pqn->kq, &pdd->qpd);
>> -        kernel_queue_uninit(pqn->kq);
>> +        kernel_queue_uninit(pqn->kq, false);
>>       }
>>         if (pqn->q) {
> _______________________________________________
> amd-gfx mailing list
> amd-gfx at lists.freedesktop.org
> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flists.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfx&data=02%7C01%7Cfelix.kuehling%40amd.com%7C6df6fef2bf6a4208704e08d785652f27%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637124542109969638&sdata=V0mHmgjUSP%2BauYL3r6PGU7aqFTQz8NkKMcuA5vXSkUQ%3D&reserved=0 
>


More information about the amd-gfx mailing list