[PATCH v1 2/7] drm/amdgpu: don't report stale vm_fault info in devcoredump
Pierre-Eric Pelloux-Prayer
pierre-eric.pelloux-prayer at amd.com
Wed May 21 09:49:04 UTC 2025
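The coredump needs to contain accurate data, so reporting a page
fault left over from a previous issue is incorrect.

Snapshot the VM fault info into the coredump when it is created,
only print the page fault section when a fault status was actually
recorded, and clear the fault status and address in the VM manager
once the job timeout has been handled so that the same fault is not
reported again.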
Signed-off-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer at amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c | 13 ++++++++-----
drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.h | 1 +
drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 5 +++++
3 files changed, 14 insertions(+), 5 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c
index de70747a099d..6fa53e070b50 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c
@@ -273,11 +273,13 @@ __amdgpu_devcoredump_read(char *buffer, size_t count, struct amdgpu_coredump_inf
}
/* Add page fault information */
- fault_info = &coredump->adev->vm_manager.fault_info;
- drm_printf(&p, "\n[%s] Page fault observed\n",
- fault_info->vmhub ? "mmhub" : "gfxhub");
- drm_printf(&p, "Faulty page starting at address: 0x%016llx\n", fault_info->addr);
- drm_printf(&p, "Protection fault status register: 0x%x\n\n", fault_info->status);
+ fault_info = &coredump->fault_info;
+ if (fault_info->status != 0) {
+ drm_printf(&p, "\n[%s] Page fault observed\n",
+ fault_info->vmhub ? "mmhub" : "gfxhub");
+ drm_printf(&p, "Faulty page starting at address: 0x%016llx\n", fault_info->addr);
+ drm_printf(&p, "Protection fault status register: 0x%x\n\n", fault_info->status);
+ }
/* dump the ip state for each ip */
drm_printf(&p, "IP Dump\n");
@@ -377,6 +379,7 @@ void amdgpu_coredump(struct amdgpu_device *adev, bool skip_vram_check,
coredump->skip_vram_check = skip_vram_check;
coredump->reset_vram_lost = vram_lost;
+ coredump->fault_info = adev->vm_manager.fault_info;
if (job && job->pasid) {
struct amdgpu_task_info *ti;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.h
index 33f2f6fdfcf7..38ccdd3d6213 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.h
@@ -37,6 +37,7 @@ struct amdgpu_coredump_info {
struct timespec64 reset_time;
bool skip_vram_check;
bool reset_vram_lost;
+ struct amdgpu_vm_fault_info fault_info;
struct amdgpu_ring *ring;
/* Readable form of coredevdump, generate once to speed up
* reading it (see drm_coredump_printer's documentation).
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
index acb21fc8b3ce..5ee9d2cd74e5 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
@@ -199,6 +199,11 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
exit:
drm_dev_exit(idx);
+
+ /* Clear fault info to avoid reporting the same fault. */
+ adev->vm_manager.fault_info.status = 0;
+ adev->vm_manager.fault_info.addr = 0;
+
return DRM_GPU_SCHED_STAT_NOMINAL;
}
--
2.43.0
More information about the amd-gfx
mailing list