[PATCH v5 3/11] drm/amdkfd: Add GPU recoverable fault SMI event

Felix Kuehling felix.kuehling at amd.com
Thu Jun 30 14:19:22 UTC 2022


Am 2022-06-28 um 10:50 schrieb Philip Yang:
> Use ktime_get_boottime_ns() as the timestamp so that events can be
> correlated with other APIs. Output a timestamp when recovery of a GPU
> recoverable fault starts and when it ends, together with the fault
> address, whether it was a read or a write fault, and whether recovery
> involved a migration or only a GPU page table update.
>
> Signed-off-by: Philip Yang <Philip.Yang at amd.com>
> ---
>   drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c | 17 +++++++++++++++++
>   drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h |  6 +++++-
>   drivers/gpu/drm/amd/amdkfd/kfd_svm.c        | 17 +++++++++++++----
>   drivers/gpu/drm/amd/amdkfd/kfd_svm.h        |  2 +-
>   4 files changed, 36 insertions(+), 6 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
> index 55ed026435e2..b7e68283925f 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
> @@ -244,6 +244,23 @@ void kfd_smi_event_update_vmfault(struct kfd_dev *dev, uint16_t pasid)
>   			  task_info.pid, task_info.task_name);
>   }
>   
> +void kfd_smi_event_page_fault_start(struct kfd_dev *dev, pid_t pid,
> +				    unsigned long address, bool write_fault,
> +				    ktime_t ts)
> +{
> +	kfd_smi_event_add(pid, dev, KFD_SMI_EVENT_PAGE_FAULT_START,
> +			  "%lld -%d @%lx(%x) %c\n", ktime_to_ns(ts), pid,
> +			  address, dev->id, write_fault ? 'W' : 'R');
> +}
> +
> +void kfd_smi_event_page_fault_end(struct kfd_dev *dev, pid_t pid,
> +				  unsigned long address, bool migration)
> +{
> +	kfd_smi_event_add(pid, dev, KFD_SMI_EVENT_PAGE_FAULT_END,
> +			  "%lld -%d @%lx(%x) %c\n", ktime_get_boottime_ns(),
> +			  pid, address, dev->id, migration ? 'M' : 'U');
> +}
> +
>   int kfd_smi_event_open(struct kfd_dev *dev, uint32_t *fd)
>   {
>   	struct kfd_smi_client *client;
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
> index dfe101c21166..7903718cd9eb 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
> @@ -29,5 +29,9 @@ void kfd_smi_event_update_vmfault(struct kfd_dev *dev, uint16_t pasid);
>   void kfd_smi_event_update_thermal_throttling(struct kfd_dev *dev,
>   					     uint64_t throttle_bitmask);
>   void kfd_smi_event_update_gpu_reset(struct kfd_dev *dev, bool post_reset);
> -
> +void kfd_smi_event_page_fault_start(struct kfd_dev *dev, pid_t pid,
> +				    unsigned long address, bool write_fault,
> +				    ktime_t ts);
> +void kfd_smi_event_page_fault_end(struct kfd_dev *dev, pid_t pid,
> +				  unsigned long address, bool migration);
>   #endif
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
> index d6fc00d51c8c..2ad08a1f38dd 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
> @@ -32,6 +32,7 @@
>   #include "kfd_priv.h"
>   #include "kfd_svm.h"
>   #include "kfd_migrate.h"
> +#include "kfd_smi_events.h"
>   
>   #ifdef dev_fmt
>   #undef dev_fmt
> @@ -1617,7 +1618,7 @@ static int svm_range_validate_and_map(struct mm_struct *mm,
>   	svm_range_unreserve_bos(&ctx);
>   
>   	if (!r)
> -		prange->validate_timestamp = ktime_to_us(ktime_get());
> +		prange->validate_timestamp = ktime_get_boottime();
>   
>   	return r;
>   }
> @@ -2694,11 +2695,12 @@ svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid,
>   	struct svm_range_list *svms;
>   	struct svm_range *prange;
>   	struct kfd_process *p;
> -	uint64_t timestamp;
> +	ktime_t timestamp = ktime_get_boottime();
>   	int32_t best_loc;
>   	int32_t gpuidx = MAX_GPU_INSTANCE;
>   	bool write_locked = false;
>   	struct vm_area_struct *vma;
> +	bool migration = false;
>   	int r = 0;
>   
>   	if (!KFD_IS_SVM_API_SUPPORTED(adev->kfd.dev)) {
> @@ -2775,9 +2777,9 @@ svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid,
>   		goto out_unlock_range;
>   	}
>   
> -	timestamp = ktime_to_us(ktime_get()) - prange->validate_timestamp;
>   	/* skip duplicate vm fault on different pages of same range */
> -	if (timestamp < AMDGPU_SVM_RANGE_RETRY_FAULT_PENDING) {
> +	if (ktime_before(timestamp, ktime_add_ns(prange->validate_timestamp,
> +				AMDGPU_SVM_RANGE_RETRY_FAULT_PENDING))) {

You changed the timestamp units from microseconds (us) to nanoseconds
(ns). I think you'll need to update AMDGPU_SVM_RANGE_RETRY_FAULT_PENDING
(multiply it by 1000) to account for that.

Other than that, this patch is

Reviewed-by: Felix Kuehling <Felix.Kuehling at amd.com>


>   		pr_debug("svms 0x%p [0x%lx %lx] already restored\n",
>   			 svms, prange->start, prange->last);
>   		r = 0;
> @@ -2813,7 +2815,11 @@ svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid,
>   		 svms, prange->start, prange->last, best_loc,
>   		 prange->actual_loc);
>   
> +	kfd_smi_event_page_fault_start(adev->kfd.dev, p->lead_thread->pid, addr,
> +				       write_fault, timestamp);
> +
>   	if (prange->actual_loc != best_loc) {
> +		migration = true;
>   		if (best_loc) {
>   			r = svm_migrate_to_vram(prange, best_loc, mm);
>   			if (r) {
> @@ -2842,6 +2848,9 @@ svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid,
>   		pr_debug("failed %d to map svms 0x%p [0x%lx 0x%lx] to gpus\n",
>   			 r, svms, prange->start, prange->last);
>   
> +	kfd_smi_event_page_fault_end(adev->kfd.dev, p->lead_thread->pid, addr,
> +				     migration);
> +
>   out_unlock_range:
>   	mutex_unlock(&prange->migrate_mutex);
>   out_unlock_svms:
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.h b/drivers/gpu/drm/amd/amdkfd/kfd_svm.h
> index 2d54147b4dda..eab7f6d3b13c 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.h
> @@ -125,7 +125,7 @@ struct svm_range {
>   	uint32_t			actual_loc;
>   	uint8_t				granularity;
>   	atomic_t			invalid;
> -	uint64_t			validate_timestamp;
> +	ktime_t				validate_timestamp;
>   	struct mmu_interval_notifier	notifier;
>   	struct svm_work_list_item	work_item;
>   	struct list_head		deferred_list;


More information about the amd-gfx mailing list