[PATCH 1/1] drm/amdkfd: Don't touch the hardware in pre_reset callback

Liu, Monk Monk.Liu at amd.com
Fri Dec 20 08:04:30 UTC 2019


Okay

Can you send me the patch (in attachment) once you finished it, I need to verify it on SRIOV 

Thanks 

_____________________________________
Monk Liu|GPU Virtualization Team |AMD


-----Original Message-----
From: Kuehling, Felix <Felix.Kuehling at amd.com> 
Sent: Friday, December 20, 2019 3:56 PM
To: Liu, Monk <Monk.Liu at amd.com>; amd-gfx at lists.freedesktop.org
Cc: Liu, Shaoyun <Shaoyun.Liu at amd.com>
Subject: Re: [PATCH 1/1] drm/amdkfd: Don't touch the hardware in pre_reset callback

You should be able to save the plain text email and pass that to "git am". It's trivial with Thunderbird on a Linux system. If you're using outlook, I'm not sure.

Anyway, I'm already reworking the patch based on Shaoyun's suggestion and some ideas it gave me.

Regards,
   Felix

On 2019-12-19 23:23, Liu, Monk wrote:
> Hi Felix
>
> Do you know how I can get a "xxx.patch" file from the email from you ??
>
> _____________________________________
> Monk Liu|GPU Virtualization Team |AMD
>
>
> -----Original Message-----
> From: Kuehling, Felix <Felix.Kuehling at amd.com>
> Sent: Friday, December 20, 2019 10:09 AM
> To: amd-gfx at lists.freedesktop.org
> Cc: Liu, Shaoyun <Shaoyun.Liu at amd.com>; Liu, Monk <Monk.Liu at amd.com>
> Subject: [PATCH 1/1] drm/amdkfd: Don't touch the hardware in pre_reset 
> callback
>
> The intention of the pre_reset callback is to update the driver state to reflect that all user mode queues are preempted and the HIQ is destroyed. However we should not actually preempt any queues or otherwise touch the hardware because it is presumably hanging.
> The impending reset will take care of actually stopping all queues.
>
> This should prevent KIQ register access hanging on SRIOV function level reset (FLR). It should also speed up the reset by avoiding unnecessary timeouts from a potentially hanging GPU scheduler.
>
> CC: shaoyunl <shaoyun.liu at amd.com>
> CC: Liu Monk <Monk.Liu at amd.com>
> Signed-off-by: Felix Kuehling <Felix.Kuehling at amd.com>
> ---
>   drivers/gpu/drm/amd/amdkfd/kfd_device.c       | 24 ++++++++++-------
>   .../drm/amd/amdkfd/kfd_device_queue_manager.c | 27 ++++++++++++-------  .../drm/amd/amdkfd/kfd_device_queue_manager.h |  5 ++--  drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c |  8 +++---
>   .../gpu/drm/amd/amdkfd/kfd_packet_manager.c   |  4 +--
>   drivers/gpu/drm/amd/amdkfd/kfd_priv.h         |  8 +++---
>   drivers/gpu/drm/amd/amdkfd/kfd_process.c      | 11 ++++----
>   .../amd/amdkfd/kfd_process_queue_manager.c    |  2 +-
>   8 files changed, 53 insertions(+), 36 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c 
> b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> index c6b6901bbda3..796996a0d926 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> @@ -486,6 +486,7 @@ static int kfd_gtt_sa_init(struct kfd_dev *kfd, unsigned int buf_size,
>   				unsigned int chunk_size);
>   static void kfd_gtt_sa_fini(struct kfd_dev *kfd);
>   
> +static void kfd_suspend(struct kfd_dev *kfd, bool pre_reset);
>   static int kfd_resume(struct kfd_dev *kfd);
>   
>   struct kfd_dev *kgd2kfd_probe(struct kgd_dev *kgd, @@ -728,7 +729,7 @@ int kgd2kfd_pre_reset(struct kfd_dev *kfd)  {
>   	if (!kfd->init_complete)
>   		return 0;
> -	kgd2kfd_suspend(kfd);
> +	kfd_suspend(kfd, true);
>   
>   	kfd_signal_reset_event(kfd);
>   	return 0;
> @@ -767,13 +768,7 @@ void kgd2kfd_suspend(struct kfd_dev *kfd)
>   	if (!kfd->init_complete)
>   		return;
>   
> -	/* For first KFD device suspend all the KFD processes */
> -	if (atomic_inc_return(&kfd_locked) == 1)
> -		kfd_suspend_all_processes();
> -
> -	kfd->dqm->ops.stop(kfd->dqm);
> -
> -	kfd_iommu_suspend(kfd);
> +	kfd_suspend(kfd, false);
>   }
>   
>   int kgd2kfd_resume(struct kfd_dev *kfd) @@ -795,6 +790,17 @@ int kgd2kfd_resume(struct kfd_dev *kfd)
>   	return ret;
>   }
>   
> +static void kfd_suspend(struct kfd_dev *kfd, bool pre_reset) {
> +	/* For first KFD device suspend all the KFD processes */
> +	if (atomic_inc_return(&kfd_locked) == 1)
> +		kfd_suspend_all_processes(pre_reset);
> +
> +	kfd->dqm->ops.stop(kfd->dqm, pre_reset);
> +
> +	kfd_iommu_suspend(kfd);
> +}
> +
>   static int kfd_resume(struct kfd_dev *kfd)  {
>   	int err = 0;
> @@ -877,7 +883,7 @@ int kgd2kfd_quiesce_mm(struct mm_struct *mm)
>   	if (!p)
>   		return -ESRCH;
>   
> -	r = kfd_process_evict_queues(p);
> +	r = kfd_process_evict_queues(p, false);
>   
>   	kfd_unref_process(p);
>   	return r;
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c 
> b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> index f7f6df40875e..3a49944164da 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> @@ -592,7 +592,8 @@ static int update_queue(struct 
> device_queue_manager *dqm, struct queue *q)  }
>   
>   static int evict_process_queues_nocpsch(struct device_queue_manager *dqm,
> -					struct qcm_process_device *qpd)
> +					struct qcm_process_device *qpd,
> +					bool pre_reset)
>   {
>   	struct queue *q;
>   	struct mqd_manager *mqd_mgr;
> @@ -622,6 +623,8 @@ static int evict_process_queues_nocpsch(struct 
> device_queue_manager *dqm,
>   
>   		if (WARN_ONCE(!dqm->sched_running, "Evict when stopped\n"))
>   			continue;
> +		if (pre_reset)
> +			continue;
>   
>   		retval = mqd_mgr->destroy_mqd(mqd_mgr, q->mqd,
>   				KFD_PREEMPT_TYPE_WAVEFRONT_DRAIN, @@ -639,7 +642,8 @@ static int 
> evict_process_queues_nocpsch(struct device_queue_manager *dqm,  }
>   
>   static int evict_process_queues_cpsch(struct device_queue_manager *dqm,
> -				      struct qcm_process_device *qpd)
> +				      struct qcm_process_device *qpd,
> +				      bool pre_reset)
>   {
>   	struct queue *q;
>   	struct kfd_process_device *pdd;
> @@ -664,6 +668,10 @@ static int evict_process_queues_cpsch(struct device_queue_manager *dqm,
>   		q->properties.is_active = false;
>   		dqm->queue_count--;
>   	}
> +
> +	if (pre_reset)
> +		goto out;
> +
>   	retval = execute_queues_cpsch(dqm,
>   				qpd->is_debug ?
>   				KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES :
> @@ -944,10 +952,10 @@ static int start_nocpsch(struct device_queue_manager *dqm)
>   	return 0;
>   }
>   
> -static int stop_nocpsch(struct device_queue_manager *dqm)
> +static int stop_nocpsch(struct device_queue_manager *dqm, bool
> +pre_reset)
>   {
>   	if (dqm->dev->device_info->asic_family == CHIP_HAWAII)
> -		pm_uninit(&dqm->packets);
> +		pm_uninit(&dqm->packets, pre_reset);
>   	dqm->sched_running = false;
>   
>   	return 0;
> @@ -1107,20 +1115,21 @@ static int start_cpsch(struct device_queue_manager *dqm)
>   	return 0;
>   fail_allocate_vidmem:
>   fail_set_sched_resources:
> -	pm_uninit(&dqm->packets);
> +	pm_uninit(&dqm->packets, false);
>   fail_packet_manager_init:
>   	return retval;
>   }
>   
> -static int stop_cpsch(struct device_queue_manager *dqm)
> +static int stop_cpsch(struct device_queue_manager *dqm, bool 
> +pre_reset)
>   {
>   	dqm_lock(dqm);
> -	unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0);
> +	if (!pre_reset)
> +		unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0);
>   	dqm->sched_running = false;
>   	dqm_unlock(dqm);
>   
>   	kfd_gtt_sa_free(dqm->dev, dqm->fence_mem);
> -	pm_uninit(&dqm->packets);
> +	pm_uninit(&dqm->packets, pre_reset);
>   
>   	return 0;
>   }
> @@ -1891,7 +1900,7 @@ int kfd_process_vm_fault(struct device_queue_manager *dqm,
>   		return -EINVAL;
>   	pdd = kfd_get_process_device_data(dqm->dev, p);
>   	if (pdd)
> -		ret = dqm->ops.evict_process_queues(dqm, &pdd->qpd);
> +		ret = dqm->ops.evict_process_queues(dqm, &pdd->qpd, false);
>   	kfd_unref_process(p);
>   
>   	return ret;
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h 
> b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
> index a8c37e6da027..9f82f95f6a92 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
> @@ -103,7 +103,7 @@ struct device_queue_manager_ops {
>   
>   	int	(*initialize)(struct device_queue_manager *dqm);
>   	int	(*start)(struct device_queue_manager *dqm);
> -	int	(*stop)(struct device_queue_manager *dqm);
> +	int	(*stop)(struct device_queue_manager *dqm, bool pre_reset);
>   	void	(*uninitialize)(struct device_queue_manager *dqm);
>   	int	(*create_kernel_queue)(struct device_queue_manager *dqm,
>   					struct kernel_queue *kq,
> @@ -129,7 +129,8 @@ struct device_queue_manager_ops {
>   			struct qcm_process_device *qpd);
>   
>   	int (*evict_process_queues)(struct device_queue_manager *dqm,
> -				    struct qcm_process_device *qpd);
> +				    struct qcm_process_device *qpd,
> +				    bool pre_reset);
>   	int (*restore_process_queues)(struct device_queue_manager *dqm,
>   				      struct qcm_process_device *qpd);
>   
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c 
> b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
> index 2d56dc534459..dbd2e8e9ae69 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
> @@ -195,9 +195,9 @@ static bool kq_initialize(struct kernel_queue *kq, 
> struct kfd_dev *dev,  }
>   
>   /* Uninitialize a kernel queue and free all its memory usages. */ 
> -static void kq_uninitialize(struct kernel_queue *kq)
> +static void kq_uninitialize(struct kernel_queue *kq, bool pre_reset)
>   {
> -	if (kq->queue->properties.type == KFD_QUEUE_TYPE_HIQ)
> +	if (kq->queue->properties.type == KFD_QUEUE_TYPE_HIQ && !pre_reset)
>   		kq->mqd_mgr->destroy_mqd(kq->mqd_mgr,
>   					kq->queue->mqd,
>   					KFD_PREEMPT_TYPE_WAVEFRONT_RESET, @@ -337,9 +337,9 @@ struct 
> kernel_queue *kernel_queue_init(struct kfd_dev *dev,
>   	return NULL;
>   }
>   
> -void kernel_queue_uninit(struct kernel_queue *kq)
> +void kernel_queue_uninit(struct kernel_queue *kq, bool pre_reset)
>   {
> -	kq_uninitialize(kq);
> +	kq_uninitialize(kq, pre_reset);
>   	kfree(kq);
>   }
>   
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c 
> b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
> index 6cabed06ef5d..7732a3bbebd6 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
> @@ -264,10 +264,10 @@ int pm_init(struct packet_manager *pm, struct device_queue_manager *dqm)
>   	return 0;
>   }
>   
> -void pm_uninit(struct packet_manager *pm)
> +void pm_uninit(struct packet_manager *pm, bool pre_reset)
>   {
>   	mutex_destroy(&pm->lock);
> -	kernel_queue_uninit(pm->priv_queue);
> +	kernel_queue_uninit(pm->priv_queue, pre_reset);
>   }
>   
>   int pm_send_set_resources(struct packet_manager *pm, diff --git 
> a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h 
> b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> index 087e96838997..9bc4ced4acba 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> @@ -762,9 +762,9 @@ struct kfd_process *kfd_get_process(const struct 
> task_struct *);  struct kfd_process 
> *kfd_lookup_process_by_pasid(unsigned int pasid);  struct kfd_process 
> *kfd_lookup_process_by_mm(const struct mm_struct *mm);  void 
> kfd_unref_process(struct kfd_process *p); -int 
> kfd_process_evict_queues(struct kfd_process *p);
> +int kfd_process_evict_queues(struct kfd_process *p, bool pre_reset);
>   int kfd_process_restore_queues(struct kfd_process *p); -void 
> kfd_suspend_all_processes(void);
> +void kfd_suspend_all_processes(bool pre_reset);
>   int kfd_resume_all_processes(void);
>   
>   int kfd_process_device_init_vm(struct kfd_process_device *pdd, @@ -883,7 +883,7 @@ struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev);  void device_queue_manager_uninit(struct device_queue_manager *dqm);  struct kernel_queue *kernel_queue_init(struct kfd_dev *dev,
>   					enum kfd_queue_type type);
> -void kernel_queue_uninit(struct kernel_queue *kq);
> +void kernel_queue_uninit(struct kernel_queue *kq, bool pre_reset);
>   int kfd_process_vm_fault(struct device_queue_manager *dqm, unsigned 
> int pasid);
>   
>   /* Process Queue Manager */
> @@ -974,7 +974,7 @@ extern const struct packet_manager_funcs 
> kfd_vi_pm_funcs;  extern const struct packet_manager_funcs 
> kfd_v9_pm_funcs;
>   
>   int pm_init(struct packet_manager *pm, struct device_queue_manager 
> *dqm); -void pm_uninit(struct packet_manager *pm);
> +void pm_uninit(struct packet_manager *pm, bool pre_reset);
>   int pm_send_set_resources(struct packet_manager *pm,
>   				struct scheduling_resources *res);
>   int pm_send_runlist(struct packet_manager *pm, struct list_head 
> *dqm_queues); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c 
> b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> index 536a153ac9a4..7bcadd3a1e3b 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> @@ -948,7 +948,7 @@ struct kfd_process *kfd_lookup_process_by_mm(const struct mm_struct *mm)
>    * Eviction is reference-counted per process-device. This means multiple
>    * evictions from different sources can be nested safely.
>    */
> -int kfd_process_evict_queues(struct kfd_process *p)
> +int kfd_process_evict_queues(struct kfd_process *p, bool pre_reset)
>   {
>   	struct kfd_process_device *pdd;
>   	int r = 0;
> @@ -956,7 +956,8 @@ int kfd_process_evict_queues(struct kfd_process 
> *p)
>   
>   	list_for_each_entry(pdd, &p->per_device_data, per_device_list) {
>   		r = pdd->dev->dqm->ops.evict_process_queues(pdd->dev->dqm,
> -							    &pdd->qpd);
> +							    &pdd->qpd,
> +							    pre_reset);
>   		if (r) {
>   			pr_err("Failed to evict process queues\n");
>   			goto fail;
> @@ -1026,7 +1027,7 @@ static void evict_process_worker(struct work_struct *work)
>   	flush_delayed_work(&p->restore_work);
>   
>   	pr_debug("Started evicting pasid 0x%x\n", p->pasid);
> -	ret = kfd_process_evict_queues(p);
> +	ret = kfd_process_evict_queues(p, false);
>   	if (!ret) {
>   		dma_fence_signal(p->ef);
>   		dma_fence_put(p->ef);
> @@ -1082,7 +1083,7 @@ static void restore_process_worker(struct work_struct *work)
>   		pr_err("Failed to restore queues of pasid 0x%x\n", p->pasid);  }
>   
> -void kfd_suspend_all_processes(void)
> +void kfd_suspend_all_processes(bool pre_reset)
>   {
>   	struct kfd_process *p;
>   	unsigned int temp;
> @@ -1092,7 +1093,7 @@ void kfd_suspend_all_processes(void)
>   		cancel_delayed_work_sync(&p->eviction_work);
>   		cancel_delayed_work_sync(&p->restore_work);
>   
> -		if (kfd_process_evict_queues(p))
> +		if (kfd_process_evict_queues(p, pre_reset))
>   			pr_err("Failed to suspend process 0x%x\n", p->pasid);
>   		dma_fence_signal(p->ef);
>   		dma_fence_put(p->ef);
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c 
> b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
> index d3eacf72e8db..8fa856e6a03f 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
> @@ -374,7 +374,7 @@ int pqm_destroy_queue(struct process_queue_manager *pqm, unsigned int qid)
>   		/* destroy kernel queue (DIQ) */
>   		dqm = pqn->kq->dev->dqm;
>   		dqm->ops.destroy_kernel_queue(dqm, pqn->kq, &pdd->qpd);
> -		kernel_queue_uninit(pqn->kq);
> +		kernel_queue_uninit(pqn->kq, false);
>   	}
>   
>   	if (pqn->q) {
> --
> 2.24.1
>


More information about the amd-gfx mailing list