[PATCH v2 4/8] drm/amdkfd: Add GPU recoverable fault SMI event

Philip Yang Philip.Yang at amd.com
Thu Jan 20 23:13:18 UTC 2022


Output timestamp when GPU recoverable fault starts, ends and duration to
recover the fault, if migration happened or only GPU page table is
updated, fault address, read or write fault.

Signed-off-by: Philip Yang <Philip.Yang at amd.com>
---
 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c | 48 +++++++++++++++++++++
 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h |  7 ++-
 drivers/gpu/drm/amd/amdkfd/kfd_svm.c        | 17 ++++++--
 3 files changed, 67 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
index 68c93701c5f7..080eba0d3be0 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
@@ -266,6 +266,54 @@ void kfd_smi_event_update_vmfault(struct kfd_dev *dev, uint16_t pasid)
 	add_event_to_kfifo(0, dev, KFD_SMI_EVENT_VMFAULT, fifo_in, len);
 }
 
+static bool kfd_smi_event_duration(struct kfd_dev *dev, uint64_t ts,
+				   uint64_t *duration)
+{
+	if (list_empty(&dev->smi_clients))
+		return false;
+
+	*duration = ktime_get_boottime_ns() - ts;
+	return true;
+}
+
+void kfd_smi_event_page_fault_start(struct kfd_dev *dev, pid_t pid,
+				    unsigned long address, bool write_fault,
+				    uint64_t ts)
+{
+	char fifo_in[64];
+	int len;
+
+	if (list_empty(&dev->smi_clients))
+		return;
+
+	len = snprintf(fifo_in, sizeof(fifo_in), "%x %lld -%d @%lx(%x) %c\n",
+		       KFD_SMI_EVENT_PAGE_FAULT_START, ts, pid, address,
+		       dev->id, write_fault ? 'W' : 'R');
+
+	add_event_to_kfifo(pid, dev, KFD_SMI_EVENT_PAGE_FAULT_START, fifo_in,
+			   len);
+}
+
+void kfd_smi_event_page_fault_end(struct kfd_dev *dev, pid_t pid,
+				  unsigned long address, bool migration,
+				  uint64_t ts)
+{
+	char fifo_in[64];
+	uint64_t duration;
+	int len;
+
+	if (!kfd_smi_event_duration(dev, ts, &duration))
+		return;
+
+	len = snprintf(fifo_in, sizeof(fifo_in),
+		       "%x %lld(%lld) -%d @%lx(%x) %c\n",
+		       KFD_SMI_EVENT_PAGE_FAULT_END, ktime_get_boottime_ns(),
+		       duration, pid, address, dev->id, migration ? 'M' : 'm');
+
+	add_event_to_kfifo(pid, dev, KFD_SMI_EVENT_PAGE_FAULT_END, fifo_in,
+			   len);
+}
+
 int kfd_smi_event_open(struct kfd_dev *dev, uint32_t *fd)
 {
 	struct kfd_smi_client *client;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
index bffd0c32b060..7f70db914d2c 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
@@ -28,5 +28,10 @@ void kfd_smi_event_update_vmfault(struct kfd_dev *dev, uint16_t pasid);
 void kfd_smi_event_update_thermal_throttling(struct kfd_dev *dev,
 					     uint64_t throttle_bitmask);
 void kfd_smi_event_update_gpu_reset(struct kfd_dev *dev, bool post_reset);
-
+void kfd_smi_event_page_fault_start(struct kfd_dev *dev, pid_t pid,
+				    unsigned long address, bool write_fault,
+				    uint64_t ts);
+void kfd_smi_event_page_fault_end(struct kfd_dev *dev, pid_t pid,
+				  unsigned long address, bool migration,
+				  uint64_t ts);
 #endif
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
index 2d2cae05dbea..08b21f9759ea 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
@@ -32,6 +32,7 @@
 #include "kfd_priv.h"
 #include "kfd_svm.h"
 #include "kfd_migrate.h"
+#include "kfd_smi_events.h"
 
 #ifdef dev_fmt
 #undef dev_fmt
@@ -1596,7 +1597,7 @@ static int svm_range_validate_and_map(struct mm_struct *mm,
 	svm_range_unreserve_bos(&ctx);
 
 	if (!r)
-		prange->validate_timestamp = ktime_to_us(ktime_get());
+		prange->validate_timestamp = ktime_get_boottime_ns();
 
 	return r;
 }
@@ -2665,11 +2666,12 @@ svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid,
 	struct svm_range_list *svms;
 	struct svm_range *prange;
 	struct kfd_process *p;
-	uint64_t timestamp;
+	uint64_t timestamp = ktime_get_boottime_ns();
 	int32_t best_loc;
 	int32_t gpuidx = MAX_GPU_INSTANCE;
 	bool write_locked = false;
 	struct vm_area_struct *vma;
+	bool migration = false;
 	int r = 0;
 
 	if (!KFD_IS_SVM_API_SUPPORTED(adev->kfd.dev)) {
@@ -2745,9 +2747,9 @@ svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid,
 		goto out_unlock_range;
 	}
 
-	timestamp = ktime_to_us(ktime_get()) - prange->validate_timestamp;
 	/* skip duplicate vm fault on different pages of same range */
-	if (timestamp < AMDGPU_SVM_RANGE_RETRY_FAULT_PENDING) {
+	if (div_u64(timestamp -  prange->validate_timestamp, 1000000) <
+	    AMDGPU_SVM_RANGE_RETRY_FAULT_PENDING) {
 		pr_debug("svms 0x%p [0x%lx %lx] already restored\n",
 			 svms, prange->start, prange->last);
 		r = 0;
@@ -2783,7 +2785,11 @@ svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid,
 		 svms, prange->start, prange->last, best_loc,
 		 prange->actual_loc);
 
+	kfd_smi_event_page_fault_start(adev->kfd.dev, p->lead_thread->pid, addr,
+				       write_fault, timestamp);
+
 	if (prange->actual_loc != best_loc) {
+		migration = true;
 		if (best_loc) {
 			r = svm_migrate_to_vram(prange, best_loc, mm);
 			if (r) {
@@ -2812,6 +2818,9 @@ svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid,
 		pr_debug("failed %d to map svms 0x%p [0x%lx 0x%lx] to gpus\n",
 			 r, svms, prange->start, prange->last);
 
+	kfd_smi_event_page_fault_end(adev->kfd.dev, p->lead_thread->pid, addr,
+				     migration, timestamp);
+
 out_unlock_range:
 	mutex_unlock(&prange->migrate_mutex);
 out_unlock_svms:
-- 
2.17.1



More information about the amd-gfx mailing list