[PATCH v1 2/7] drm/amdgpu: don't report stale vm_fault info in devcoredump
Christian König
christian.koenig at amd.com
Wed May 21 11:44:47 UTC 2025
On 5/21/25 11:49, Pierre-Eric Pelloux-Prayer wrote:
> The coredump needs to contain accurate data and reporting a page
> fault from a previous issue is incorrect.
>
> Signed-off-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer at amd.com>
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c | 13 ++++++++-----
> drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.h | 1 +
> drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 5 +++++
> 3 files changed, 14 insertions(+), 5 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c
> index de70747a099d..6fa53e070b50 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c
> @@ -273,11 +273,13 @@ __amdgpu_devcoredump_read(char *buffer, size_t count, struct amdgpu_coredump_inf
> }
>
> /* Add page fault information */
> - fault_info = &coredump->adev->vm_manager.fault_info;
> - drm_printf(&p, "\n[%s] Page fault observed\n",
> - fault_info->vmhub ? "mmhub" : "gfxhub");
> - drm_printf(&p, "Faulty page starting at address: 0x%016llx\n", fault_info->addr);
> - drm_printf(&p, "Protection fault status register: 0x%x\n\n", fault_info->status);
> + fault_info = &coredump->fault_info;
> + if (fault_info->status != 0) {
> + drm_printf(&p, "\n[%s] Page fault observed\n",
> + fault_info->vmhub ? "mmhub" : "gfxhub");
> + drm_printf(&p, "Faulty page starting at address: 0x%016llx\n", fault_info->addr);
> + drm_printf(&p, "Protection fault status register: 0x%x\n\n", fault_info->status);
> + }
>
> /* dump the ip state for each ip */
> drm_printf(&p, "IP Dump\n");
> @@ -377,6 +379,7 @@ void amdgpu_coredump(struct amdgpu_device *adev, bool skip_vram_check,
>
> coredump->skip_vram_check = skip_vram_check;
> coredump->reset_vram_lost = vram_lost;
> + coredump->fault_info = adev->vm_manager.fault_info;
>
> if (job && job->pasid) {
> struct amdgpu_task_info *ti;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.h
> index 33f2f6fdfcf7..38ccdd3d6213 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.h
> @@ -37,6 +37,7 @@ struct amdgpu_coredump_info {
> struct timespec64 reset_time;
> bool skip_vram_check;
> bool reset_vram_lost;
> + struct amdgpu_vm_fault_info fault_info;
> struct amdgpu_ring *ring;
> /* Readable form of coredevdump, generate once to speed up
> * reading it (see drm_coredump_printer's documentation).
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> index acb21fc8b3ce..5ee9d2cd74e5 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> @@ -199,6 +199,11 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
>
> exit:
> drm_dev_exit(idx);
> +
> + /* Clear fault info to avoid reporting the same fault. */
> + adev->vm_manager.fault_info.status = 0;
> + adev->vm_manager.fault_info.addr = 0;
> +
That needs to come much earlier and prefereable while holding a lock.
Apart from that looks good to me.
Christian.
> return DRM_GPU_SCHED_STAT_NOMINAL;
> }
>
More information about the amd-gfx
mailing list