[PATCH 3/4] drm/amdgpu: skip printing vram_lost if needed

Alex Deucher alexdeucher at gmail.com
Fri Aug 16 13:52:13 UTC 2024


On Fri, Aug 16, 2024 at 3:55 AM <Trigger.Huang at amd.com> wrote:
>
> From: Trigger Huang <Trigger.Huang at amd.com>
>
> The VRAM lost status can only be obtained after a GPU reset occurs, but
> sometimes a dev core dump can happen before the GPU reset. So a new
> argument is added to tell the dev core dump implementation whether to
> skip printing the vram_lost status in the dump.
> This patch also decouples the core dump function from the GPU reset
> function by replacing the amdgpu_reset_context argument with an
> amdgpu_job argument that specifies the context for the core dump.
>
> Signed-off-by: Trigger Huang <Trigger.Huang at amd.com>
> Suggested-by: Alex Deucher <alexander.deucher at amd.com>
> ---
>  .../gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c  | 19 ++++++++++---------
>  .../gpu/drm/amd/amdgpu/amdgpu_dev_coredump.h  |  6 +++---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c    |  2 +-
>  3 files changed, 14 insertions(+), 13 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c
> index cf2b4dd4d865..a860f52d8bb0 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c
> @@ -28,8 +28,9 @@
>  #include "atom.h"
>
>  #ifndef CONFIG_DEV_COREDUMP
> -void amdgpu_coredump(struct amdgpu_device *adev, bool vram_lost,
> -                    struct amdgpu_reset_context *reset_context)
> +void amdgpu_coredump(struct amdgpu_device *adev, bool skip_vram_check,
> +                    bool vram_lost, struct amdgpu_job *job)
> +
>  {
>  }
>  #else
> @@ -315,7 +316,7 @@ amdgpu_devcoredump_read(char *buffer, loff_t offset, size_t count,
>                 }
>         }
>
> -       if (coredump->reset_vram_lost)
> +       if (!(coredump->skip_vram_check) && coredump->reset_vram_lost)
>                 drm_printf(&p, "VRAM is lost due to GPU reset!\n");

You might want to print something like:
drm_printf(&p, "VRAM lost status skipped\n");
in the skip case, so that it is clear the check was skipped and users
don't assume VRAM wasn't lost.

Alex

>
>         return count - iter.remain;
> @@ -326,12 +327,11 @@ static void amdgpu_devcoredump_free(void *data)
>         kfree(data);
>  }
>
> -void amdgpu_coredump(struct amdgpu_device *adev, bool vram_lost,
> -                    struct amdgpu_reset_context *reset_context)
> +void amdgpu_coredump(struct amdgpu_device *adev, bool skip_vram_check,
> +                    bool vram_lost, struct amdgpu_job *job)
>  {
> -       struct amdgpu_coredump_info *coredump;
>         struct drm_device *dev = adev_to_drm(adev);
> -       struct amdgpu_job *job = reset_context->job;
> +       struct amdgpu_coredump_info *coredump;
>         struct drm_sched_job *s_job;
>
>         coredump = kzalloc(sizeof(*coredump), GFP_NOWAIT);
> @@ -341,11 +341,12 @@ void amdgpu_coredump(struct amdgpu_device *adev, bool vram_lost,
>                 return;
>         }
>
> +       coredump->skip_vram_check = skip_vram_check;
>         coredump->reset_vram_lost = vram_lost;
>
> -       if (reset_context->job && reset_context->job->vm) {
> +       if (job && job->vm) {
>                 struct amdgpu_task_info *ti;
> -               struct amdgpu_vm *vm = reset_context->job->vm;
> +               struct amdgpu_vm *vm = job->vm;
>
>                 ti = amdgpu_vm_get_task_info_vm(vm);
>                 if (ti) {
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.h
> index 52459512cb2b..c4e522e49251 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.h
> @@ -26,7 +26,6 @@
>  #define __AMDGPU_DEV_COREDUMP_H__
>
>  #include "amdgpu.h"
> -#include "amdgpu_reset.h"
>
>  #ifdef CONFIG_DEV_COREDUMP
>
> @@ -36,12 +35,13 @@ struct amdgpu_coredump_info {
>         struct amdgpu_device            *adev;
>         struct amdgpu_task_info         reset_task_info;
>         struct timespec64               reset_time;
> +       bool                            skip_vram_check;
>         bool                            reset_vram_lost;
>         struct amdgpu_ring              *ring;
>  };
>  #endif
>
> -void amdgpu_coredump(struct amdgpu_device *adev, bool vram_lost,
> -                    struct amdgpu_reset_context *reset_context);
> +void amdgpu_coredump(struct amdgpu_device *adev, bool skip_vram_check,
> +                    bool vram_lost, struct amdgpu_job *job);
>
>  #endif
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 9885d0606b0a..825cc62cd75d 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -5445,7 +5445,7 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle,
>                                 vram_lost = amdgpu_device_check_vram_lost(tmp_adev);
>
>                                 if (amdgpu_gpu_coredump && (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags)))
> -                                       amdgpu_coredump(tmp_adev, vram_lost, reset_context);
> +                                       amdgpu_coredump(tmp_adev, false, vram_lost, reset_context->job);
>
>                                 if (vram_lost) {
>                                         DRM_INFO("VRAM is lost due to GPU reset!\n");
> --
> 2.34.1
>


More information about the amd-gfx mailing list