[PATCH v5 5/11] drm/amdkfd: Add user queue eviction restore SMI event
Felix Kuehling
felix.kuehling at amd.com
Thu Jun 30 14:36:08 UTC 2022
Am 2022-06-28 um 10:50 schrieb Philip Yang:
> Output user queue eviction and restore events. User queue eviction may be
> triggered by an SVM or userptr MMU notifier, TTM eviction, device suspend,
> or CRIU checkpoint and restore.
>
> User queue restore may be rescheduled if eviction happens again while the
> restore is in progress.
>
> Signed-off-by: Philip Yang <Philip.Yang at amd.com>
Reviewed-by: Felix Kuehling <Felix.Kuehling at amd.com>
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h | 2 +-
> .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 12 ++++---
> drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 4 +--
> drivers/gpu/drm/amd/amdkfd/kfd_device.c | 4 +--
> drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 2 +-
> drivers/gpu/drm/amd/amdkfd/kfd_process.c | 15 ++++++--
> drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c | 35 +++++++++++++++++++
> drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h | 4 +++
> drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 6 ++--
> 9 files changed, 69 insertions(+), 15 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> index b25b41f50213..73bf8b5f2aa9 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> @@ -336,7 +336,7 @@ void amdgpu_amdkfd_release_notify(struct amdgpu_bo *bo)
> }
> #endif
> /* KGD2KFD callbacks */
> -int kgd2kfd_quiesce_mm(struct mm_struct *mm);
> +int kgd2kfd_quiesce_mm(struct mm_struct *mm, uint32_t trigger);
> int kgd2kfd_resume_mm(struct mm_struct *mm);
> int kgd2kfd_schedule_evict_and_restore_process(struct mm_struct *mm,
> struct dma_fence *fence);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> index 5ba9070d8722..6a7e045ddcc5 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> @@ -32,6 +32,7 @@
> #include "amdgpu_dma_buf.h"
> #include <uapi/linux/kfd_ioctl.h>
> #include "amdgpu_xgmi.h"
> +#include "kfd_smi_events.h"
>
> /* Userptr restore delay, just long enough to allow consecutive VM
> * changes to accumulate
> @@ -2381,7 +2382,7 @@ int amdgpu_amdkfd_evict_userptr(struct kgd_mem *mem,
> evicted_bos = atomic_inc_return(&process_info->evicted_bos);
> if (evicted_bos == 1) {
> /* First eviction, stop the queues */
> - r = kgd2kfd_quiesce_mm(mm);
> + r = kgd2kfd_quiesce_mm(mm, KFD_QUEUE_EVICTION_TRIGGER_USERPTR);
> if (r)
> pr_err("Failed to quiesce KFD\n");
> schedule_delayed_work(&process_info->restore_userptr_work,
> @@ -2655,13 +2656,16 @@ static void amdgpu_amdkfd_restore_userptr_worker(struct work_struct *work)
>
> unlock_out:
> mutex_unlock(&process_info->lock);
> - mmput(mm);
> - put_task_struct(usertask);
>
> /* If validation failed, reschedule another attempt */
> - if (evicted_bos)
> + if (evicted_bos) {
> schedule_delayed_work(&process_info->restore_userptr_work,
> msecs_to_jiffies(AMDGPU_USERPTR_RESTORE_DELAY_MS));
> +
> + kfd_smi_event_queue_restore_rescheduled(mm);
> + }
> + mmput(mm);
> + put_task_struct(usertask);
> }
>
> /** amdgpu_amdkfd_gpuvm_restore_process_bos - Restore all BOs for the given
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> index a0246b4bae6b..6abfe10229a2 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> @@ -2428,7 +2428,7 @@ static int criu_restore(struct file *filep,
> * Set the process to evicted state to avoid running any new queues before all the memory
> * mappings are ready.
> */
> - ret = kfd_process_evict_queues(p);
> + ret = kfd_process_evict_queues(p, KFD_QUEUE_EVICTION_CRIU_RESTORE);
> if (ret)
> goto exit_unlock;
>
> @@ -2547,7 +2547,7 @@ static int criu_process_info(struct file *filep,
> goto err_unlock;
> }
>
> - ret = kfd_process_evict_queues(p);
> + ret = kfd_process_evict_queues(p, KFD_QUEUE_EVICTION_CRIU_CHECKPOINT);
> if (ret)
> goto err_unlock;
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> index c8fee0dbfdcb..6ec0e9f0927d 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> @@ -837,7 +837,7 @@ void kgd2kfd_interrupt(struct kfd_dev *kfd, const void *ih_ring_entry)
> spin_unlock_irqrestore(&kfd->interrupt_lock, flags);
> }
>
> -int kgd2kfd_quiesce_mm(struct mm_struct *mm)
> +int kgd2kfd_quiesce_mm(struct mm_struct *mm, uint32_t trigger)
> {
> struct kfd_process *p;
> int r;
> @@ -851,7 +851,7 @@ int kgd2kfd_quiesce_mm(struct mm_struct *mm)
> return -ESRCH;
>
> WARN(debug_evictions, "Evicting pid %d", p->lead_thread->pid);
> - r = kfd_process_evict_queues(p);
> + r = kfd_process_evict_queues(p, trigger);
>
> kfd_unref_process(p);
> return r;
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> index 59ba50ce54d3..b9e7e9c52853 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> @@ -946,7 +946,7 @@ static inline struct kfd_process_device *kfd_process_device_from_gpuidx(
> }
>
> void kfd_unref_process(struct kfd_process *p);
> -int kfd_process_evict_queues(struct kfd_process *p);
> +int kfd_process_evict_queues(struct kfd_process *p, uint32_t trigger);
> int kfd_process_restore_queues(struct kfd_process *p);
> void kfd_suspend_all_processes(void);
> int kfd_resume_all_processes(void);
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> index a13e60d48b73..fc38a4d81420 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> @@ -43,6 +43,7 @@ struct mm_struct;
> #include "kfd_device_queue_manager.h"
> #include "kfd_iommu.h"
> #include "kfd_svm.h"
> +#include "kfd_smi_events.h"
>
> /*
> * List of struct kfd_process (field kfd_process).
> @@ -1736,7 +1737,7 @@ struct kfd_process *kfd_lookup_process_by_mm(const struct mm_struct *mm)
> * Eviction is reference-counted per process-device. This means multiple
> * evictions from different sources can be nested safely.
> */
> -int kfd_process_evict_queues(struct kfd_process *p)
> +int kfd_process_evict_queues(struct kfd_process *p, uint32_t trigger)
> {
> int r = 0;
> int i;
> @@ -1745,6 +1746,9 @@ int kfd_process_evict_queues(struct kfd_process *p)
> for (i = 0; i < p->n_pdds; i++) {
> struct kfd_process_device *pdd = p->pdds[i];
>
> + kfd_smi_event_queue_eviction(pdd->dev, p->lead_thread->pid,
> + trigger);
> +
> r = pdd->dev->dqm->ops.evict_process_queues(pdd->dev->dqm,
> &pdd->qpd);
> /* evict return -EIO if HWS is hang or asic is resetting, in this case
> @@ -1769,6 +1773,9 @@ int kfd_process_evict_queues(struct kfd_process *p)
>
> if (n_evicted == 0)
> break;
> +
> + kfd_smi_event_queue_restore(pdd->dev, p->lead_thread->pid);
> +
> if (pdd->dev->dqm->ops.restore_process_queues(pdd->dev->dqm,
> &pdd->qpd))
> pr_err("Failed to restore queues\n");
> @@ -1788,6 +1795,8 @@ int kfd_process_restore_queues(struct kfd_process *p)
> for (i = 0; i < p->n_pdds; i++) {
> struct kfd_process_device *pdd = p->pdds[i];
>
> + kfd_smi_event_queue_restore(pdd->dev, p->lead_thread->pid);
> +
> r = pdd->dev->dqm->ops.restore_process_queues(pdd->dev->dqm,
> &pdd->qpd);
> if (r) {
> @@ -1849,7 +1858,7 @@ static void evict_process_worker(struct work_struct *work)
> flush_delayed_work(&p->restore_work);
>
> pr_debug("Started evicting pasid 0x%x\n", p->pasid);
> - ret = kfd_process_evict_queues(p);
> + ret = kfd_process_evict_queues(p, KFD_QUEUE_EVICTION_TRIGGER_TTM);
> if (!ret) {
> dma_fence_signal(p->ef);
> dma_fence_put(p->ef);
> @@ -1916,7 +1925,7 @@ void kfd_suspend_all_processes(void)
> cancel_delayed_work_sync(&p->eviction_work);
> cancel_delayed_work_sync(&p->restore_work);
>
> - if (kfd_process_evict_queues(p))
> + if (kfd_process_evict_queues(p, KFD_QUEUE_EVICTION_TRIGGER_SUSPEND))
> pr_err("Failed to suspend process 0x%x\n", p->pasid);
> dma_fence_signal(p->ef);
> dma_fence_put(p->ef);
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
> index ec4d278c2a47..3917c38204d0 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
> @@ -283,6 +283,41 @@ void kfd_smi_event_migration_end(struct kfd_dev *dev, pid_t pid,
> from, to, trigger);
> }
>
> +void kfd_smi_event_queue_eviction(struct kfd_dev *dev, pid_t pid,
> + uint32_t trigger)
> +{
> + kfd_smi_event_add(pid, dev, KFD_SMI_EVENT_QUEUE_EVICTION,
> + "%lld -%d %x %d\n", ktime_get_boottime_ns(), pid,
> + dev->id, trigger);
> +}
> +
> +void kfd_smi_event_queue_restore(struct kfd_dev *dev, pid_t pid)
> +{
> + kfd_smi_event_add(pid, dev, KFD_SMI_EVENT_QUEUE_RESTORE,
> + "%lld -%d %x\n", ktime_get_boottime_ns(), pid,
> + dev->id);
> +}
> +
> +void kfd_smi_event_queue_restore_rescheduled(struct mm_struct *mm)
> +{
> + struct kfd_process *p;
> + int i;
> +
> + p = kfd_lookup_process_by_mm(mm);
> + if (!p)
> + return;
> +
> + for (i = 0; i < p->n_pdds; i++) {
> + struct kfd_process_device *pdd = p->pdds[i];
> +
> + kfd_smi_event_add(p->lead_thread->pid, pdd->dev,
> + KFD_SMI_EVENT_QUEUE_RESTORE,
> + "%lld -%d %x %c\n", ktime_get_boottime_ns(),
> + p->lead_thread->pid, pdd->dev->id, 'R');
> + }
> + kfd_unref_process(p);
> +}
> +
> int kfd_smi_event_open(struct kfd_dev *dev, uint32_t *fd)
> {
> struct kfd_smi_client *client;
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
> index ec5d74a2fef4..b23292637239 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
> @@ -42,4 +42,8 @@ void kfd_smi_event_migration_start(struct kfd_dev *dev, pid_t pid,
> void kfd_smi_event_migration_end(struct kfd_dev *dev, pid_t pid,
> unsigned long start, unsigned long end,
> uint32_t from, uint32_t to, uint32_t trigger);
> +void kfd_smi_event_queue_eviction(struct kfd_dev *dev, pid_t pid,
> + uint32_t trigger);
> +void kfd_smi_event_queue_restore(struct kfd_dev *dev, pid_t pid);
> +void kfd_smi_event_queue_restore_rescheduled(struct mm_struct *mm);
> #endif
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
> index 5cead2a0e819..ddc1e4651919 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
> @@ -1730,14 +1730,16 @@ static void svm_range_restore_work(struct work_struct *work)
> mutex_unlock(&svms->lock);
> mmap_write_unlock(mm);
> mutex_unlock(&process_info->lock);
> - mmput(mm);
>
> /* If validation failed, reschedule another attempt */
> if (evicted_ranges) {
> pr_debug("reschedule to restore svm range\n");
> schedule_delayed_work(&svms->restore_work,
> msecs_to_jiffies(AMDGPU_SVM_RANGE_RESTORE_DELAY_MS));
> +
> + kfd_smi_event_queue_restore_rescheduled(mm);
> }
> + mmput(mm);
> }
>
> /**
> @@ -1793,7 +1795,7 @@ svm_range_evict(struct svm_range *prange, struct mm_struct *mm,
> prange->svms, prange->start, prange->last);
>
> /* First eviction, stop the queues */
> - r = kgd2kfd_quiesce_mm(mm);
> + r = kgd2kfd_quiesce_mm(mm, KFD_QUEUE_EVICTION_TRIGGER_SVM);
> if (r)
> pr_debug("failed to quiesce KFD\n");
>
More information about the amd-gfx
mailing list