[PATCH v5 3/11] drm/amdkfd: Add GPU recoverable fault SMI event
Felix Kuehling
felix.kuehling at amd.com
Thu Jun 30 14:19:22 UTC 2022
Am 2022-06-28 um 10:50 schrieb Philip Yang:
> Use ktime_get_boottime_ns() as the timestamp so that events can be
> correlated with other APIs. Output the timestamps at which recovery of
> a GPU recoverable fault starts and ends, whether a migration happened
> or only the GPU page table was updated to recover, the fault address,
> and whether it was a read or a write fault.
>
> Signed-off-by: Philip Yang <Philip.Yang at amd.com>
> ---
> drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c | 17 +++++++++++++++++
> drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h | 6 +++++-
> drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 17 +++++++++++++----
> drivers/gpu/drm/amd/amdkfd/kfd_svm.h | 2 +-
> 4 files changed, 36 insertions(+), 6 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
> index 55ed026435e2..b7e68283925f 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
> @@ -244,6 +244,23 @@ void kfd_smi_event_update_vmfault(struct kfd_dev *dev, uint16_t pasid)
> task_info.pid, task_info.task_name);
> }
>
> +void kfd_smi_event_page_fault_start(struct kfd_dev *dev, pid_t pid,
> + unsigned long address, bool write_fault,
> + ktime_t ts)
> +{
> + kfd_smi_event_add(pid, dev, KFD_SMI_EVENT_PAGE_FAULT_START,
> + "%lld -%d @%lx(%x) %c\n", ktime_to_ns(ts), pid,
> + address, dev->id, write_fault ? 'W' : 'R');
> +}
> +
> +void kfd_smi_event_page_fault_end(struct kfd_dev *dev, pid_t pid,
> + unsigned long address, bool migration)
> +{
> + kfd_smi_event_add(pid, dev, KFD_SMI_EVENT_PAGE_FAULT_END,
> + "%lld -%d @%lx(%x) %c\n", ktime_get_boottime_ns(),
> + pid, address, dev->id, migration ? 'M' : 'U');
> +}
> +
> int kfd_smi_event_open(struct kfd_dev *dev, uint32_t *fd)
> {
> struct kfd_smi_client *client;
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
> index dfe101c21166..7903718cd9eb 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
> @@ -29,5 +29,9 @@ void kfd_smi_event_update_vmfault(struct kfd_dev *dev, uint16_t pasid);
> void kfd_smi_event_update_thermal_throttling(struct kfd_dev *dev,
> uint64_t throttle_bitmask);
> void kfd_smi_event_update_gpu_reset(struct kfd_dev *dev, bool post_reset);
> -
> +void kfd_smi_event_page_fault_start(struct kfd_dev *dev, pid_t pid,
> + unsigned long address, bool write_fault,
> + ktime_t ts);
> +void kfd_smi_event_page_fault_end(struct kfd_dev *dev, pid_t pid,
> + unsigned long address, bool migration);
> #endif
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
> index d6fc00d51c8c..2ad08a1f38dd 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
> @@ -32,6 +32,7 @@
> #include "kfd_priv.h"
> #include "kfd_svm.h"
> #include "kfd_migrate.h"
> +#include "kfd_smi_events.h"
>
> #ifdef dev_fmt
> #undef dev_fmt
> @@ -1617,7 +1618,7 @@ static int svm_range_validate_and_map(struct mm_struct *mm,
> svm_range_unreserve_bos(&ctx);
>
> if (!r)
> - prange->validate_timestamp = ktime_to_us(ktime_get());
> + prange->validate_timestamp = ktime_get_boottime();
>
> return r;
> }
> @@ -2694,11 +2695,12 @@ svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid,
> struct svm_range_list *svms;
> struct svm_range *prange;
> struct kfd_process *p;
> - uint64_t timestamp;
> + ktime_t timestamp = ktime_get_boottime();
> int32_t best_loc;
> int32_t gpuidx = MAX_GPU_INSTANCE;
> bool write_locked = false;
> struct vm_area_struct *vma;
> + bool migration = false;
> int r = 0;
>
> if (!KFD_IS_SVM_API_SUPPORTED(adev->kfd.dev)) {
> @@ -2775,9 +2777,9 @@ svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid,
> goto out_unlock_range;
> }
>
> - timestamp = ktime_to_us(ktime_get()) - prange->validate_timestamp;
> /* skip duplicate vm fault on different pages of same range */
> - if (timestamp < AMDGPU_SVM_RANGE_RETRY_FAULT_PENDING) {
> + if (ktime_before(timestamp, ktime_add_ns(prange->validate_timestamp,
> + AMDGPU_SVM_RANGE_RETRY_FAULT_PENDING))) {
You changed the timestamp units from microseconds (us) to nanoseconds
(ns). I think you'll need to update AMDGPU_SVM_RANGE_RETRY_FAULT_PENDING
(multiply by 1000) to account for that.
Other than that, this patch is
Reviewed-by: Felix Kuehling <Felix.Kuehling at amd.com>
> pr_debug("svms 0x%p [0x%lx %lx] already restored\n",
> svms, prange->start, prange->last);
> r = 0;
> @@ -2813,7 +2815,11 @@ svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid,
> svms, prange->start, prange->last, best_loc,
> prange->actual_loc);
>
> + kfd_smi_event_page_fault_start(adev->kfd.dev, p->lead_thread->pid, addr,
> + write_fault, timestamp);
> +
> if (prange->actual_loc != best_loc) {
> + migration = true;
> if (best_loc) {
> r = svm_migrate_to_vram(prange, best_loc, mm);
> if (r) {
> @@ -2842,6 +2848,9 @@ svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid,
> pr_debug("failed %d to map svms 0x%p [0x%lx 0x%lx] to gpus\n",
> r, svms, prange->start, prange->last);
>
> + kfd_smi_event_page_fault_end(adev->kfd.dev, p->lead_thread->pid, addr,
> + migration);
> +
> out_unlock_range:
> mutex_unlock(&prange->migrate_mutex);
> out_unlock_svms:
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.h b/drivers/gpu/drm/amd/amdkfd/kfd_svm.h
> index 2d54147b4dda..eab7f6d3b13c 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.h
> @@ -125,7 +125,7 @@ struct svm_range {
> uint32_t actual_loc;
> uint8_t granularity;
> atomic_t invalid;
> - uint64_t validate_timestamp;
> + ktime_t validate_timestamp;
> struct mmu_interval_notifier notifier;
> struct svm_work_list_item work_item;
> struct list_head deferred_list;
More information about the amd-gfx
mailing list