<html><head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
</head>
<body>
<p><br>
</p>
<div class="moz-cite-prefix">On 2022-06-30 10:19, Felix Kuehling
wrote:<br>
</div>
<blockquote type="cite" cite="mid:7c8b25a1-2937-18b6-c329-e901673ae691@amd.com">
<br>
Am 2022-06-28 um 10:50 schrieb Philip Yang:
<br>
<blockquote type="cite">Use ktime_get_boottime_ns() as timestamp
to correlate with other
<br>
APIs. Output timestamp when GPU recoverable fault starts and
ends to
<br>
recover the fault, if migration happened or only GPU page table
is
<br>
updated to recover, fault address, if read or write fault.
<br>
<br>
Signed-off-by: Philip Yang <a class="moz-txt-link-rfc2396E" href="mailto:Philip.Yang@amd.com">&lt;Philip.Yang@amd.com&gt;</a>
<br>
---
<br>
drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c | 17
+++++++++++++++++
<br>
drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h | 6 +++++-
<br>
drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 17
+++++++++++++----
<br>
drivers/gpu/drm/amd/amdkfd/kfd_svm.h | 2 +-
<br>
4 files changed, 36 insertions(+), 6 deletions(-)
<br>
<br>
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
<br>
index 55ed026435e2..b7e68283925f 100644
<br>
--- a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
<br>
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
<br>
@@ -244,6 +244,23 @@ void kfd_smi_event_update_vmfault(struct
kfd_dev *dev, uint16_t pasid)
<br>
task_info.pid, task_info.task_name);
<br>
}
<br>
+void kfd_smi_event_page_fault_start(struct kfd_dev *dev,
pid_t pid,
<br>
+ unsigned long address, bool write_fault,
<br>
+ ktime_t ts)
<br>
+{
<br>
+ kfd_smi_event_add(pid, dev, KFD_SMI_EVENT_PAGE_FAULT_START,
<br>
+ "%lld -%d @%lx(%x) %c\n", ktime_to_ns(ts), pid,
<br>
+ address, dev->id, write_fault ? 'W' : 'R');
<br>
+}
<br>
+
<br>
+void kfd_smi_event_page_fault_end(struct kfd_dev *dev, pid_t
pid,
<br>
+ unsigned long address, bool migration)
<br>
+{
<br>
+ kfd_smi_event_add(pid, dev, KFD_SMI_EVENT_PAGE_FAULT_END,
<br>
+ "%lld -%d @%lx(%x) %c\n",
ktime_get_boottime_ns(),
<br>
+ pid, address, dev->id, migration ? 'M' : 'U');
<br>
+}
<br>
+
<br>
int kfd_smi_event_open(struct kfd_dev *dev, uint32_t *fd)
<br>
{
<br>
struct kfd_smi_client *client;
<br>
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
<br>
index dfe101c21166..7903718cd9eb 100644
<br>
--- a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
<br>
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
<br>
@@ -29,5 +29,9 @@ void kfd_smi_event_update_vmfault(struct
kfd_dev *dev, uint16_t pasid);
<br>
void kfd_smi_event_update_thermal_throttling(struct kfd_dev
*dev,
<br>
uint64_t throttle_bitmask);
<br>
void kfd_smi_event_update_gpu_reset(struct kfd_dev *dev, bool
post_reset);
<br>
-
<br>
+void kfd_smi_event_page_fault_start(struct kfd_dev *dev, pid_t
pid,
<br>
+ unsigned long address, bool write_fault,
<br>
+ ktime_t ts);
<br>
+void kfd_smi_event_page_fault_end(struct kfd_dev *dev, pid_t
pid,
<br>
+ unsigned long address, bool migration);
<br>
#endif
<br>
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
<br>
index d6fc00d51c8c..2ad08a1f38dd 100644
<br>
--- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
<br>
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
<br>
@@ -32,6 +32,7 @@
<br>
#include "kfd_priv.h"
<br>
#include "kfd_svm.h"
<br>
#include "kfd_migrate.h"
<br>
+#include "kfd_smi_events.h"
<br>
#ifdef dev_fmt
<br>
#undef dev_fmt
<br>
@@ -1617,7 +1618,7 @@ static int
svm_range_validate_and_map(struct mm_struct *mm,
<br>
svm_range_unreserve_bos(&ctx);
<br>
if (!r)
<br>
- prange->validate_timestamp =
ktime_to_us(ktime_get());
<br>
+ prange->validate_timestamp = ktime_get_boottime();
<br>
return r;
<br>
}
<br>
@@ -2694,11 +2695,12 @@ svm_range_restore_pages(struct
amdgpu_device *adev, unsigned int pasid,
<br>
struct svm_range_list *svms;
<br>
struct svm_range *prange;
<br>
struct kfd_process *p;
<br>
- uint64_t timestamp;
<br>
+ ktime_t timestamp = ktime_get_boottime();
<br>
int32_t best_loc;
<br>
int32_t gpuidx = MAX_GPU_INSTANCE;
<br>
bool write_locked = false;
<br>
struct vm_area_struct *vma;
<br>
+ bool migration = false;
<br>
int r = 0;
<br>
if (!KFD_IS_SVM_API_SUPPORTED(adev->kfd.dev)) {
<br>
@@ -2775,9 +2777,9 @@ svm_range_restore_pages(struct
amdgpu_device *adev, unsigned int pasid,
<br>
goto out_unlock_range;
<br>
}
<br>
- timestamp = ktime_to_us(ktime_get()) -
prange->validate_timestamp;
<br>
/* skip duplicate vm fault on different pages of same
range */
<br>
- if (timestamp &lt; AMDGPU_SVM_RANGE_RETRY_FAULT_PENDING) {
<br>
+ if (ktime_before(timestamp,
ktime_add_ns(prange->validate_timestamp,
<br>
+ AMDGPU_SVM_RANGE_RETRY_FAULT_PENDING))) {
<br>
</blockquote>
<br>
You changed the timestamp units from us to ns. I think you'll need
to update AMDGPU_SVM_RANGE_RETRY_FAULT_PENDING (multiply by
1000) to account for that.
<br>
</blockquote>
<p>Thanks for catching this. The change was in v4, but I removed it by
mistake when updating comments.</p>
<p>Philip<br>
</p>
<blockquote type="cite" cite="mid:7c8b25a1-2937-18b6-c329-e901673ae691@amd.com">
<br>
Other than that, this patch is
<br>
<br>
Reviewed-by: Felix Kuehling <a class="moz-txt-link-rfc2396E" href="mailto:Felix.Kuehling@amd.com">&lt;Felix.Kuehling@amd.com&gt;</a>
<br>
<br>
<br>
<blockquote type="cite"> pr_debug("svms 0x%p [0x%lx %lx]
already restored\n",
<br>
svms, prange->start, prange->last);
<br>
r = 0;
<br>
@@ -2813,7 +2815,11 @@ svm_range_restore_pages(struct
amdgpu_device *adev, unsigned int pasid,
<br>
svms, prange->start, prange->last, best_loc,
<br>
prange->actual_loc);
<br>
+ kfd_smi_event_page_fault_start(adev->kfd.dev,
p->lead_thread->pid, addr,
<br>
+ write_fault, timestamp);
<br>
+
<br>
if (prange->actual_loc != best_loc) {
<br>
+ migration = true;
<br>
if (best_loc) {
<br>
r = svm_migrate_to_vram(prange, best_loc, mm);
<br>
if (r) {
<br>
@@ -2842,6 +2848,9 @@ svm_range_restore_pages(struct
amdgpu_device *adev, unsigned int pasid,
<br>
pr_debug("failed %d to map svms 0x%p [0x%lx 0x%lx] to
gpus\n",
<br>
r, svms, prange->start, prange->last);
<br>
+ kfd_smi_event_page_fault_end(adev->kfd.dev,
p->lead_thread->pid, addr,
<br>
+ migration);
<br>
+
<br>
out_unlock_range:
<br>
mutex_unlock(&prange->migrate_mutex);
<br>
out_unlock_svms:
<br>
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.h
b/drivers/gpu/drm/amd/amdkfd/kfd_svm.h
<br>
index 2d54147b4dda..eab7f6d3b13c 100644
<br>
--- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.h
<br>
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.h
<br>
@@ -125,7 +125,7 @@ struct svm_range {
<br>
uint32_t actual_loc;
<br>
uint8_t granularity;
<br>
atomic_t invalid;
<br>
- uint64_t validate_timestamp;
<br>
+ ktime_t validate_timestamp;
<br>
struct mmu_interval_notifier notifier;
<br>
struct svm_work_list_item work_item;
<br>
struct list_head deferred_list;
<br>
</blockquote>
</blockquote>
</body>
</html>