[PATCH 4/4] drm/amdkfd: Avoid hanging hardware in stop_cpsch
shaoyunl
shaoyun.liu at amd.com
Fri Dec 20 15:56:26 UTC 2019
Looks like patch 2 is not related to this series, but anyway.
Patches 1, 2 and 3 are reviewed by shaoyunl <shaoyun.liu at amd.com>.
For patch 4, would it be possible to check dqm->is_hws_hang ||
dqm->is_resetting directly inside kq_uninitialize, so that we don't
need the other interface changes?
I think that even inside kq_uninitialize we can still get the dqm
as kq->dev->dqm.
shaoyun.liu
On 2019-12-20 3:30 a.m., Felix Kuehling wrote:
> Don't use the HWS if it's known to be hanging. In a reset also
> don't try to destroy the HIQ because that may hang on SRIOV if the
> KIQ is unresponsive.
>
> Signed-off-by: Felix Kuehling <Felix.Kuehling at amd.com>
> ---
> .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 12 ++++++++----
> drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c | 8 ++++----
> drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c | 4 ++--
> drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 4 ++--
> .../gpu/drm/amd/amdkfd/kfd_process_queue_manager.c | 2 +-
> 5 files changed, 17 insertions(+), 13 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> index a7e9ec1b3ce3..d7eb6ac37f62 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> @@ -946,7 +946,7 @@ static int start_nocpsch(struct device_queue_manager *dqm)
> static int stop_nocpsch(struct device_queue_manager *dqm)
> {
> if (dqm->dev->device_info->asic_family == CHIP_HAWAII)
> - pm_uninit(&dqm->packets);
> + pm_uninit(&dqm->packets, false);
> dqm->sched_running = false;
>
> return 0;
> @@ -1114,20 +1114,24 @@ static int start_cpsch(struct device_queue_manager *dqm)
> return 0;
> fail_allocate_vidmem:
> fail_set_sched_resources:
> - pm_uninit(&dqm->packets);
> + pm_uninit(&dqm->packets, false);
> fail_packet_manager_init:
> return retval;
> }
>
> static int stop_cpsch(struct device_queue_manager *dqm)
> {
> + bool hanging;
> +
>
> dqm_lock(dqm);
> - unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0);
> + if (!dqm->is_hws_hang)
> + unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0);
> + hanging = dqm->is_hws_hang || dqm->is_resetting;
> dqm->sched_running = false;
> dqm_unlock(dqm);
>
> kfd_gtt_sa_free(dqm->dev, dqm->fence_mem);
> - pm_uninit(&dqm->packets);
> + pm_uninit(&dqm->packets, hanging);
>
> return 0;
> }
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
> index 2d56dc534459..bae706462f96 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
> @@ -195,9 +195,9 @@ static bool kq_initialize(struct kernel_queue *kq, struct kfd_dev *dev,
> }
>
> /* Uninitialize a kernel queue and free all its memory usages. */
> -static void kq_uninitialize(struct kernel_queue *kq)
> +static void kq_uninitialize(struct kernel_queue *kq, bool hanging)
> {
> - if (kq->queue->properties.type == KFD_QUEUE_TYPE_HIQ)
> + if (kq->queue->properties.type == KFD_QUEUE_TYPE_HIQ && !hanging)
> kq->mqd_mgr->destroy_mqd(kq->mqd_mgr,
> kq->queue->mqd,
> KFD_PREEMPT_TYPE_WAVEFRONT_RESET,
> @@ -337,9 +337,9 @@ struct kernel_queue *kernel_queue_init(struct kfd_dev *dev,
> return NULL;
> }
>
> -void kernel_queue_uninit(struct kernel_queue *kq)
> +void kernel_queue_uninit(struct kernel_queue *kq, bool hanging)
> {
> - kq_uninitialize(kq);
> + kq_uninitialize(kq, hanging);
> kfree(kq);
> }
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
> index 6cabed06ef5d..dc406e6dee23 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
> @@ -264,10 +264,10 @@ int pm_init(struct packet_manager *pm, struct device_queue_manager *dqm)
> return 0;
> }
>
> -void pm_uninit(struct packet_manager *pm)
> +void pm_uninit(struct packet_manager *pm, bool hanging)
> {
> mutex_destroy(&pm->lock);
> - kernel_queue_uninit(pm->priv_queue);
> + kernel_queue_uninit(pm->priv_queue, hanging);
> }
>
> int pm_send_set_resources(struct packet_manager *pm,
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> index 087e96838997..8ac680dc90f1 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> @@ -883,7 +883,7 @@ struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev);
> void device_queue_manager_uninit(struct device_queue_manager *dqm);
> struct kernel_queue *kernel_queue_init(struct kfd_dev *dev,
> enum kfd_queue_type type);
> -void kernel_queue_uninit(struct kernel_queue *kq);
> +void kernel_queue_uninit(struct kernel_queue *kq, bool hanging);
> int kfd_process_vm_fault(struct device_queue_manager *dqm, unsigned int pasid);
>
> /* Process Queue Manager */
> @@ -974,7 +974,7 @@ extern const struct packet_manager_funcs kfd_vi_pm_funcs;
> extern const struct packet_manager_funcs kfd_v9_pm_funcs;
>
> int pm_init(struct packet_manager *pm, struct device_queue_manager *dqm);
> -void pm_uninit(struct packet_manager *pm);
> +void pm_uninit(struct packet_manager *pm, bool hanging);
> int pm_send_set_resources(struct packet_manager *pm,
> struct scheduling_resources *res);
> int pm_send_runlist(struct packet_manager *pm, struct list_head *dqm_queues);
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
> index d3eacf72e8db..8fa856e6a03f 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
> @@ -374,7 +374,7 @@ int pqm_destroy_queue(struct process_queue_manager *pqm, unsigned int qid)
> /* destroy kernel queue (DIQ) */
> dqm = pqn->kq->dev->dqm;
> dqm->ops.destroy_kernel_queue(dqm, pqn->kq, &pdd->qpd);
> - kernel_queue_uninit(pqn->kq);
> + kernel_queue_uninit(pqn->kq, false);
> }
>
> if (pqn->q) {
More information about the amd-gfx
mailing list