[PATCH v5 5/11] drm/amdkfd: Add user queue eviction restore SMI event

Felix Kuehling felix.kuehling at amd.com
Thu Jun 30 14:36:08 UTC 2022


Am 2022-06-28 um 10:50 schrieb Philip Yang:
> Output user queue eviction and restore events. User queue eviction may be
> triggered by an svm or userptr MMU notifier, TTM eviction, device suspend,
> or CRIU checkpoint and restore.
>
> User queue restore may be rescheduled if eviction happens again during
> the restore.
>
> Signed-off-by: Philip Yang <Philip.Yang at amd.com>

Reviewed-by: Felix Kuehling <Felix.Kuehling at amd.com>


> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h    |  2 +-
>   .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c  | 12 ++++---
>   drivers/gpu/drm/amd/amdkfd/kfd_chardev.c      |  4 +--
>   drivers/gpu/drm/amd/amdkfd/kfd_device.c       |  4 +--
>   drivers/gpu/drm/amd/amdkfd/kfd_priv.h         |  2 +-
>   drivers/gpu/drm/amd/amdkfd/kfd_process.c      | 15 ++++++--
>   drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c   | 35 +++++++++++++++++++
>   drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h   |  4 +++
>   drivers/gpu/drm/amd/amdkfd/kfd_svm.c          |  6 ++--
>   9 files changed, 69 insertions(+), 15 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> index b25b41f50213..73bf8b5f2aa9 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> @@ -336,7 +336,7 @@ void amdgpu_amdkfd_release_notify(struct amdgpu_bo *bo)
>   }
>   #endif
>   /* KGD2KFD callbacks */
> -int kgd2kfd_quiesce_mm(struct mm_struct *mm);
> +int kgd2kfd_quiesce_mm(struct mm_struct *mm, uint32_t trigger);
>   int kgd2kfd_resume_mm(struct mm_struct *mm);
>   int kgd2kfd_schedule_evict_and_restore_process(struct mm_struct *mm,
>   						struct dma_fence *fence);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> index 5ba9070d8722..6a7e045ddcc5 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> @@ -32,6 +32,7 @@
>   #include "amdgpu_dma_buf.h"
>   #include <uapi/linux/kfd_ioctl.h>
>   #include "amdgpu_xgmi.h"
> +#include "kfd_smi_events.h"
>   
>   /* Userptr restore delay, just long enough to allow consecutive VM
>    * changes to accumulate
> @@ -2381,7 +2382,7 @@ int amdgpu_amdkfd_evict_userptr(struct kgd_mem *mem,
>   	evicted_bos = atomic_inc_return(&process_info->evicted_bos);
>   	if (evicted_bos == 1) {
>   		/* First eviction, stop the queues */
> -		r = kgd2kfd_quiesce_mm(mm);
> +		r = kgd2kfd_quiesce_mm(mm, KFD_QUEUE_EVICTION_TRIGGER_USERPTR);
>   		if (r)
>   			pr_err("Failed to quiesce KFD\n");
>   		schedule_delayed_work(&process_info->restore_userptr_work,
> @@ -2655,13 +2656,16 @@ static void amdgpu_amdkfd_restore_userptr_worker(struct work_struct *work)
>   
>   unlock_out:
>   	mutex_unlock(&process_info->lock);
> -	mmput(mm);
> -	put_task_struct(usertask);
>   
>   	/* If validation failed, reschedule another attempt */
> -	if (evicted_bos)
> +	if (evicted_bos) {
>   		schedule_delayed_work(&process_info->restore_userptr_work,
>   			msecs_to_jiffies(AMDGPU_USERPTR_RESTORE_DELAY_MS));
> +
> +		kfd_smi_event_queue_restore_rescheduled(mm);
> +	}
> +	mmput(mm);
> +	put_task_struct(usertask);
>   }
>   
>   /** amdgpu_amdkfd_gpuvm_restore_process_bos - Restore all BOs for the given
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> index a0246b4bae6b..6abfe10229a2 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> @@ -2428,7 +2428,7 @@ static int criu_restore(struct file *filep,
>   	 * Set the process to evicted state to avoid running any new queues before all the memory
>   	 * mappings are ready.
>   	 */
> -	ret = kfd_process_evict_queues(p);
> +	ret = kfd_process_evict_queues(p, KFD_QUEUE_EVICTION_CRIU_RESTORE);
>   	if (ret)
>   		goto exit_unlock;
>   
> @@ -2547,7 +2547,7 @@ static int criu_process_info(struct file *filep,
>   		goto err_unlock;
>   	}
>   
> -	ret = kfd_process_evict_queues(p);
> +	ret = kfd_process_evict_queues(p, KFD_QUEUE_EVICTION_CRIU_CHECKPOINT);
>   	if (ret)
>   		goto err_unlock;
>   
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> index c8fee0dbfdcb..6ec0e9f0927d 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> @@ -837,7 +837,7 @@ void kgd2kfd_interrupt(struct kfd_dev *kfd, const void *ih_ring_entry)
>   	spin_unlock_irqrestore(&kfd->interrupt_lock, flags);
>   }
>   
> -int kgd2kfd_quiesce_mm(struct mm_struct *mm)
> +int kgd2kfd_quiesce_mm(struct mm_struct *mm, uint32_t trigger)
>   {
>   	struct kfd_process *p;
>   	int r;
> @@ -851,7 +851,7 @@ int kgd2kfd_quiesce_mm(struct mm_struct *mm)
>   		return -ESRCH;
>   
>   	WARN(debug_evictions, "Evicting pid %d", p->lead_thread->pid);
> -	r = kfd_process_evict_queues(p);
> +	r = kfd_process_evict_queues(p, trigger);
>   
>   	kfd_unref_process(p);
>   	return r;
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> index 59ba50ce54d3..b9e7e9c52853 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> @@ -946,7 +946,7 @@ static inline struct kfd_process_device *kfd_process_device_from_gpuidx(
>   }
>   
>   void kfd_unref_process(struct kfd_process *p);
> -int kfd_process_evict_queues(struct kfd_process *p);
> +int kfd_process_evict_queues(struct kfd_process *p, uint32_t trigger);
>   int kfd_process_restore_queues(struct kfd_process *p);
>   void kfd_suspend_all_processes(void);
>   int kfd_resume_all_processes(void);
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> index a13e60d48b73..fc38a4d81420 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> @@ -43,6 +43,7 @@ struct mm_struct;
>   #include "kfd_device_queue_manager.h"
>   #include "kfd_iommu.h"
>   #include "kfd_svm.h"
> +#include "kfd_smi_events.h"
>   
>   /*
>    * List of struct kfd_process (field kfd_process).
> @@ -1736,7 +1737,7 @@ struct kfd_process *kfd_lookup_process_by_mm(const struct mm_struct *mm)
>    * Eviction is reference-counted per process-device. This means multiple
>    * evictions from different sources can be nested safely.
>    */
> -int kfd_process_evict_queues(struct kfd_process *p)
> +int kfd_process_evict_queues(struct kfd_process *p, uint32_t trigger)
>   {
>   	int r = 0;
>   	int i;
> @@ -1745,6 +1746,9 @@ int kfd_process_evict_queues(struct kfd_process *p)
>   	for (i = 0; i < p->n_pdds; i++) {
>   		struct kfd_process_device *pdd = p->pdds[i];
>   
> +		kfd_smi_event_queue_eviction(pdd->dev, p->lead_thread->pid,
> +					     trigger);
> +
>   		r = pdd->dev->dqm->ops.evict_process_queues(pdd->dev->dqm,
>   							    &pdd->qpd);
>   		/* evict return -EIO if HWS is hang or asic is resetting, in this case
> @@ -1769,6 +1773,9 @@ int kfd_process_evict_queues(struct kfd_process *p)
>   
>   		if (n_evicted == 0)
>   			break;
> +
> +		kfd_smi_event_queue_restore(pdd->dev, p->lead_thread->pid);
> +
>   		if (pdd->dev->dqm->ops.restore_process_queues(pdd->dev->dqm,
>   							      &pdd->qpd))
>   			pr_err("Failed to restore queues\n");
> @@ -1788,6 +1795,8 @@ int kfd_process_restore_queues(struct kfd_process *p)
>   	for (i = 0; i < p->n_pdds; i++) {
>   		struct kfd_process_device *pdd = p->pdds[i];
>   
> +		kfd_smi_event_queue_restore(pdd->dev, p->lead_thread->pid);
> +
>   		r = pdd->dev->dqm->ops.restore_process_queues(pdd->dev->dqm,
>   							      &pdd->qpd);
>   		if (r) {
> @@ -1849,7 +1858,7 @@ static void evict_process_worker(struct work_struct *work)
>   	flush_delayed_work(&p->restore_work);
>   
>   	pr_debug("Started evicting pasid 0x%x\n", p->pasid);
> -	ret = kfd_process_evict_queues(p);
> +	ret = kfd_process_evict_queues(p, KFD_QUEUE_EVICTION_TRIGGER_TTM);
>   	if (!ret) {
>   		dma_fence_signal(p->ef);
>   		dma_fence_put(p->ef);
> @@ -1916,7 +1925,7 @@ void kfd_suspend_all_processes(void)
>   		cancel_delayed_work_sync(&p->eviction_work);
>   		cancel_delayed_work_sync(&p->restore_work);
>   
> -		if (kfd_process_evict_queues(p))
> +		if (kfd_process_evict_queues(p, KFD_QUEUE_EVICTION_TRIGGER_SUSPEND))
>   			pr_err("Failed to suspend process 0x%x\n", p->pasid);
>   		dma_fence_signal(p->ef);
>   		dma_fence_put(p->ef);
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
> index ec4d278c2a47..3917c38204d0 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
> @@ -283,6 +283,41 @@ void kfd_smi_event_migration_end(struct kfd_dev *dev, pid_t pid,
>   			  from, to, trigger);
>   }
>   
> +void kfd_smi_event_queue_eviction(struct kfd_dev *dev, pid_t pid,
> +				  uint32_t trigger)
> +{
> +	kfd_smi_event_add(pid, dev, KFD_SMI_EVENT_QUEUE_EVICTION,
> +			  "%lld -%d %x %d\n", ktime_get_boottime_ns(), pid,
> +			  dev->id, trigger);
> +}
> +
> +void kfd_smi_event_queue_restore(struct kfd_dev *dev, pid_t pid)
> +{
> +	kfd_smi_event_add(pid, dev, KFD_SMI_EVENT_QUEUE_RESTORE,
> +			  "%lld -%d %x\n", ktime_get_boottime_ns(), pid,
> +			  dev->id);
> +}
> +
> +void kfd_smi_event_queue_restore_rescheduled(struct mm_struct *mm)
> +{
> +	struct kfd_process *p;
> +	int i;
> +
> +	p = kfd_lookup_process_by_mm(mm);
> +	if (!p)
> +		return;
> +
> +	for (i = 0; i < p->n_pdds; i++) {
> +		struct kfd_process_device *pdd = p->pdds[i];
> +
> +		kfd_smi_event_add(p->lead_thread->pid, pdd->dev,
> +				  KFD_SMI_EVENT_QUEUE_RESTORE,
> +				  "%lld -%d %x %c\n", ktime_get_boottime_ns(),
> +				  p->lead_thread->pid, pdd->dev->id, 'R');
> +	}
> +	kfd_unref_process(p);
> +}
> +
>   int kfd_smi_event_open(struct kfd_dev *dev, uint32_t *fd)
>   {
>   	struct kfd_smi_client *client;
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
> index ec5d74a2fef4..b23292637239 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
> @@ -42,4 +42,8 @@ void kfd_smi_event_migration_start(struct kfd_dev *dev, pid_t pid,
>   void kfd_smi_event_migration_end(struct kfd_dev *dev, pid_t pid,
>   			     unsigned long start, unsigned long end,
>   			     uint32_t from, uint32_t to, uint32_t trigger);
> +void kfd_smi_event_queue_eviction(struct kfd_dev *dev, pid_t pid,
> +				  uint32_t trigger);
> +void kfd_smi_event_queue_restore(struct kfd_dev *dev, pid_t pid);
> +void kfd_smi_event_queue_restore_rescheduled(struct mm_struct *mm);
>   #endif
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
> index 5cead2a0e819..ddc1e4651919 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
> @@ -1730,14 +1730,16 @@ static void svm_range_restore_work(struct work_struct *work)
>   	mutex_unlock(&svms->lock);
>   	mmap_write_unlock(mm);
>   	mutex_unlock(&process_info->lock);
> -	mmput(mm);
>   
>   	/* If validation failed, reschedule another attempt */
>   	if (evicted_ranges) {
>   		pr_debug("reschedule to restore svm range\n");
>   		schedule_delayed_work(&svms->restore_work,
>   			msecs_to_jiffies(AMDGPU_SVM_RANGE_RESTORE_DELAY_MS));
> +
> +		kfd_smi_event_queue_restore_rescheduled(mm);
>   	}
> +	mmput(mm);
>   }
>   
>   /**
> @@ -1793,7 +1795,7 @@ svm_range_evict(struct svm_range *prange, struct mm_struct *mm,
>   			 prange->svms, prange->start, prange->last);
>   
>   		/* First eviction, stop the queues */
> -		r = kgd2kfd_quiesce_mm(mm);
> +		r = kgd2kfd_quiesce_mm(mm, KFD_QUEUE_EVICTION_TRIGGER_SVM);
>   		if (r)
>   			pr_debug("failed to quiesce KFD\n");
>   


More information about the amd-gfx mailing list