[PATCH v1 2/7] drm/amdgpu: don't report stale vm_fault info in devcoredump

Pierre-Eric Pelloux-Prayer pierre-eric.pelloux-prayer at amd.com
Wed May 21 09:49:04 UTC 2025


The coredump needs to contain accurate data and reporting a page
fault from a previous issue is incorrect.

Signed-off-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer at amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c | 13 ++++++++-----
 drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.h |  1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_job.c          |  5 +++++
 3 files changed, 14 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c
index de70747a099d..6fa53e070b50 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c
@@ -273,11 +273,13 @@ __amdgpu_devcoredump_read(char *buffer, size_t count, struct amdgpu_coredump_inf
 	}
 
 	/* Add page fault information */
-	fault_info = &coredump->adev->vm_manager.fault_info;
-	drm_printf(&p, "\n[%s] Page fault observed\n",
-		   fault_info->vmhub ? "mmhub" : "gfxhub");
-	drm_printf(&p, "Faulty page starting at address: 0x%016llx\n", fault_info->addr);
-	drm_printf(&p, "Protection fault status register: 0x%x\n\n", fault_info->status);
+	fault_info = &coredump->fault_info;
+	if (fault_info->status != 0) {
+		drm_printf(&p, "\n[%s] Page fault observed\n",
+			   fault_info->vmhub ? "mmhub" : "gfxhub");
+		drm_printf(&p, "Faulty page starting at address: 0x%016llx\n", fault_info->addr);
+		drm_printf(&p, "Protection fault status register: 0x%x\n\n", fault_info->status);
+	}
 
 	/* dump the ip state for each ip */
 	drm_printf(&p, "IP Dump\n");
@@ -377,6 +379,7 @@ void amdgpu_coredump(struct amdgpu_device *adev, bool skip_vram_check,
 
 	coredump->skip_vram_check = skip_vram_check;
 	coredump->reset_vram_lost = vram_lost;
+	coredump->fault_info = adev->vm_manager.fault_info;
 
 	if (job && job->pasid) {
 		struct amdgpu_task_info *ti;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.h
index 33f2f6fdfcf7..38ccdd3d6213 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.h
@@ -37,6 +37,7 @@ struct amdgpu_coredump_info {
 	struct timespec64               reset_time;
 	bool                            skip_vram_check;
 	bool                            reset_vram_lost;
+	struct amdgpu_vm_fault_info	fault_info;
 	struct amdgpu_ring              *ring;
 	/* Readable form of coredevdump, generate once to speed up
 	 * reading it (see drm_coredump_printer's documentation).
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
index acb21fc8b3ce..5ee9d2cd74e5 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
@@ -199,6 +199,11 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
 
 exit:
 	drm_dev_exit(idx);
+
+	/* Clear fault info to avoid reporting the same fault. */
+	adev->vm_manager.fault_info.status = 0;
+	adev->vm_manager.fault_info.addr = 0;
+
 	return DRM_GPU_SCHED_STAT_NOMINAL;
 }
 
-- 
2.43.0



More information about the amd-gfx mailing list