[PATCH 4/5] drm/amdkfd: add page fault SMI event

Philip Yang Philip.Yang at amd.com
Fri Jan 14 20:32:55 UTC 2022


After GPU page fault is recovered, output timestamp when fault is
received, duration to recover the fault, if migration or only
GPU page table is updated, fault address, read or write fault.

Signed-off-by: Philip Yang <Philip.Yang at amd.com>
---
 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c | 41 +++++++++++++++++++++
 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h |  3 ++
 drivers/gpu/drm/amd/amdkfd/kfd_svm.c        | 12 ++++--
 3 files changed, 53 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
index 5818ea8ad4ce..6ed3d85348d6 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
@@ -265,6 +265,47 @@ void kfd_smi_event_update_vmfault(struct kfd_dev *dev, uint16_t pasid)
 	add_event_to_kfifo(task_info.pid, dev, KFD_SMI_EVENT_VMFAULT, fifo_in, len);
 }
 
+static bool kfd_smi_event_pid_duration(struct kfd_dev *dev, uint16_t pasid,
+				       pid_t *pid, uint64_t ts,
+				       uint64_t *duration)
+{
+	struct amdgpu_task_info task_info = {0};
+
+	if (list_empty(&dev->smi_clients))
+		return false;
+
+	amdgpu_vm_get_task_info(dev->adev, pasid, &task_info);
+	if (!task_info.pid) {
+		pr_debug("task is gone\n");
+		return false;
+	}
+	if (pid)
+		*pid = task_info.pid;
+	if (duration)
+		*duration = ktime_get_ns() - ts;
+	return true;
+}
+
+void kfd_smi_event_page_fault(struct kfd_dev *dev, uint16_t pasid,
+			      unsigned long address, bool migration,
+			      bool write_fault, uint64_t ts)
+{
+	char fifo_in[128];
+	uint64_t duration;
+	pid_t pid;
+	int len;
+
+	if (!kfd_smi_event_pid_duration(dev, pasid, &pid, ts, &duration))
+		return;
+
+	len = snprintf(fifo_in, sizeof(fifo_in), "%d ts=%lld duration=%lld"
+		       " pid=%d pfn=0x%lx write=%d migration=%d\n",
+		       KFD_SMI_EVENT_PAGE_FAULT, ts, duration, pid, address,
+		       write_fault, migration);
+
+	add_event_to_kfifo(pid, dev, KFD_SMI_EVENT_PAGE_FAULT, fifo_in, len);
+}
+
 int kfd_smi_event_open(struct kfd_dev *dev, uint32_t *fd)
 {
 	struct kfd_smi_client *client;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
index bffd0c32b060..fa3a8fdad69f 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
@@ -28,5 +28,8 @@ void kfd_smi_event_update_vmfault(struct kfd_dev *dev, uint16_t pasid);
 void kfd_smi_event_update_thermal_throttling(struct kfd_dev *dev,
 					     uint64_t throttle_bitmask);
 void kfd_smi_event_update_gpu_reset(struct kfd_dev *dev, bool post_reset);
+void kfd_smi_event_page_fault(struct kfd_dev *dev, uint16_t pasid,
+			      unsigned long address, bool migration,
+			      bool write_fault, uint64_t ts);
 
 #endif
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
index 37b3191615b6..b81667162dc1 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
@@ -32,6 +32,7 @@
 #include "kfd_priv.h"
 #include "kfd_svm.h"
 #include "kfd_migrate.h"
+#include "kfd_smi_events.h"
 
 #ifdef dev_fmt
 #undef dev_fmt
@@ -2657,11 +2658,12 @@ svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid,
 	struct svm_range_list *svms;
 	struct svm_range *prange;
 	struct kfd_process *p;
-	uint64_t timestamp;
+	uint64_t timestamp = ktime_get_ns();
 	int32_t best_loc;
 	int32_t gpuidx = MAX_GPU_INSTANCE;
 	bool write_locked = false;
 	struct vm_area_struct *vma;
+	bool migration = false;
 	int r = 0;
 
 	if (!KFD_IS_SVM_API_SUPPORTED(adev->kfd.dev)) {
@@ -2737,9 +2739,9 @@ svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid,
 		goto out_unlock_range;
 	}
 
-	timestamp = ktime_to_us(ktime_get()) - prange->validate_timestamp;
 	/* skip duplicate vm fault on different pages of same range */
-	if (timestamp < AMDGPU_SVM_RANGE_RETRY_FAULT_PENDING) {
+	if ((ktime_to_us(timestamp) -  prange->validate_timestamp) <
+	    AMDGPU_SVM_RANGE_RETRY_FAULT_PENDING) {
 		pr_debug("svms 0x%p [0x%lx %lx] already restored\n",
 			 svms, prange->start, prange->last);
 		r = 0;
@@ -2776,6 +2778,7 @@ svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid,
 		 prange->actual_loc);
 
 	if (prange->actual_loc != best_loc) {
+		migration = true;
 		if (best_loc) {
 			r = svm_migrate_to_vram(prange, best_loc, mm);
 			if (r) {
@@ -2804,6 +2807,9 @@ svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid,
 		pr_debug("failed %d to map svms 0x%p [0x%lx 0x%lx] to gpus\n",
 			 r, svms, prange->start, prange->last);
 
+	kfd_smi_event_page_fault(adev->kfd.dev, p->pasid, addr, migration,
+				 write_fault, timestamp);
+
 out_unlock_range:
 	mutex_unlock(&prange->migrate_mutex);
 out_unlock_svms:
-- 
2.17.1



More information about the amd-gfx mailing list