[PATCH 4/4] drm/amdkfd: Avoid hanging hardware in stop_cpsch

Deng, Emily Emily.Deng at amd.com
Fri Dec 20 10:03:24 UTC 2019


[AMD Official Use Only - Internal Distribution Only]

Series Tested-by: Emily Deng <Emily.Deng at amd.com> on an SR-IOV environment with Vega10, covering the TDR-1, TDR-2 and TDR-3 test cases.

Best wishes
Emily Deng



>-----Original Message-----
>From: amd-gfx <amd-gfx-bounces at lists.freedesktop.org> On Behalf Of Felix
>Kuehling
>Sent: Friday, December 20, 2019 4:30 PM
>To: amd-gfx at lists.freedesktop.org
>Subject: [PATCH 4/4] drm/amdkfd: Avoid hanging hardware in stop_cpsch
>
>Don't use the HWS if it's known to be hanging. In a reset also don't try to
>destroy the HIQ because that may hang on SRIOV if the KIQ is unresponsive.
>
>Signed-off-by: Felix Kuehling <Felix.Kuehling at amd.com>
>---
> .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.c    | 12 ++++++++----
> drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c        |  8 ++++----
> drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c      |  4 ++--
> drivers/gpu/drm/amd/amdkfd/kfd_priv.h                |  4 ++--
> .../gpu/drm/amd/amdkfd/kfd_process_queue_manager.c   |  2 +-
> 5 files changed, 17 insertions(+), 13 deletions(-)
>
>diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>index a7e9ec1b3ce3..d7eb6ac37f62 100644
>--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>@@ -946,7 +946,7 @@ static int start_nocpsch(struct device_queue_manager
>*dqm)  static int stop_nocpsch(struct device_queue_manager *dqm)  {
> 	if (dqm->dev->device_info->asic_family == CHIP_HAWAII)
>-		pm_uninit(&dqm->packets);
>+		pm_uninit(&dqm->packets, false);
> 	dqm->sched_running = false;
>
> 	return 0;
>@@ -1114,20 +1114,24 @@ static int start_cpsch(struct
>device_queue_manager *dqm)
> 	return 0;
> fail_allocate_vidmem:
> fail_set_sched_resources:
>-	pm_uninit(&dqm->packets);
>+	pm_uninit(&dqm->packets, false);
> fail_packet_manager_init:
> 	return retval;
> }
>
> static int stop_cpsch(struct device_queue_manager *dqm)  {
>+	bool hanging;
>+
> 	dqm_lock(dqm);
>-	unmap_queues_cpsch(dqm,
>KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0);
>+	if (!dqm->is_hws_hang)
>+		unmap_queues_cpsch(dqm,
>KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0);
>+	hanging = dqm->is_hws_hang || dqm->is_resetting;
> 	dqm->sched_running = false;
> 	dqm_unlock(dqm);
>
> 	kfd_gtt_sa_free(dqm->dev, dqm->fence_mem);
>-	pm_uninit(&dqm->packets);
>+	pm_uninit(&dqm->packets, hanging);
>
> 	return 0;
> }
>diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
>b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
>index 2d56dc534459..bae706462f96 100644
>--- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
>+++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
>@@ -195,9 +195,9 @@ static bool kq_initialize(struct kernel_queue *kq, struct
>kfd_dev *dev,  }
>
> /* Uninitialize a kernel queue and free all its memory usages. */ -static void
>kq_uninitialize(struct kernel_queue *kq)
>+static void kq_uninitialize(struct kernel_queue *kq, bool hanging)
> {
>-	if (kq->queue->properties.type == KFD_QUEUE_TYPE_HIQ)
>+	if (kq->queue->properties.type == KFD_QUEUE_TYPE_HIQ && !hanging)
> 		kq->mqd_mgr->destroy_mqd(kq->mqd_mgr,
> 					kq->queue->mqd,
>
>	KFD_PREEMPT_TYPE_WAVEFRONT_RESET,
>@@ -337,9 +337,9 @@ struct kernel_queue *kernel_queue_init(struct kfd_dev
>*dev,
> 	return NULL;
> }
>
>-void kernel_queue_uninit(struct kernel_queue *kq)
>+void kernel_queue_uninit(struct kernel_queue *kq, bool hanging)
> {
>-	kq_uninitialize(kq);
>+	kq_uninitialize(kq, hanging);
> 	kfree(kq);
> }
>
>diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
>b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
>index 6cabed06ef5d..dc406e6dee23 100644
>--- a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
>+++ b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
>@@ -264,10 +264,10 @@ int pm_init(struct packet_manager *pm, struct
>device_queue_manager *dqm)
> 	return 0;
> }
>
>-void pm_uninit(struct packet_manager *pm)
>+void pm_uninit(struct packet_manager *pm, bool hanging)
> {
> 	mutex_destroy(&pm->lock);
>-	kernel_queue_uninit(pm->priv_queue);
>+	kernel_queue_uninit(pm->priv_queue, hanging);
> }
>
> int pm_send_set_resources(struct packet_manager *pm, diff --git
>a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>index 087e96838997..8ac680dc90f1 100644
>--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>@@ -883,7 +883,7 @@ struct device_queue_manager
>*device_queue_manager_init(struct kfd_dev *dev);  void
>device_queue_manager_uninit(struct device_queue_manager *dqm);  struct
>kernel_queue *kernel_queue_init(struct kfd_dev *dev,
> 					enum kfd_queue_type type);
>-void kernel_queue_uninit(struct kernel_queue *kq);
>+void kernel_queue_uninit(struct kernel_queue *kq, bool hanging);
> int kfd_process_vm_fault(struct device_queue_manager *dqm, unsigned int
>pasid);
>
> /* Process Queue Manager */
>@@ -974,7 +974,7 @@ extern const struct packet_manager_funcs
>kfd_vi_pm_funcs;  extern const struct packet_manager_funcs kfd_v9_pm_funcs;
>
> int pm_init(struct packet_manager *pm, struct device_queue_manager *dqm);
>-void pm_uninit(struct packet_manager *pm);
>+void pm_uninit(struct packet_manager *pm, bool hanging);
> int pm_send_set_resources(struct packet_manager *pm,
> 				struct scheduling_resources *res);
> int pm_send_runlist(struct packet_manager *pm, struct list_head
>*dqm_queues); diff --git
>a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
>b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
>index d3eacf72e8db..8fa856e6a03f 100644
>--- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
>+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
>@@ -374,7 +374,7 @@ int pqm_destroy_queue(struct
>process_queue_manager *pqm, unsigned int qid)
> 		/* destroy kernel queue (DIQ) */
> 		dqm = pqn->kq->dev->dqm;
> 		dqm->ops.destroy_kernel_queue(dqm, pqn->kq, &pdd->qpd);
>-		kernel_queue_uninit(pqn->kq);
>+		kernel_queue_uninit(pqn->kq, false);
> 	}
>
> 	if (pqn->q) {
>--
>2.24.1
>
>_______________________________________________
>amd-gfx mailing list
>amd-gfx at lists.freedesktop.org
>https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flists.fre
>edesktop.org%2Fmailman%2Flistinfo%2Famd-
>gfx&data=02%7C01%7CEmily.Deng%40amd.com%7C3c77bba4d40d4bc6b
>e8508d78526dd45%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C
>637124274794842900&sdata=vHNAs2FTkSpHYZ2TTux%2F66attN4lf5qSiP
>jnlBOM5y0%3D&reserved=0


More information about the amd-gfx mailing list