[PATCH 2/2] drm/amdgpu: trigger ip dump before suspend of IP's
Alex Deucher
alexdeucher at gmail.com
Fri Jul 26 13:12:04 UTC 2024
On Fri, Jul 26, 2024 at 8:48 AM Sunil Khatri <sunil.khatri at amd.com> wrote:
>
> Problem:
> The IP dump is currently done after all IPs have been
> suspended. For some IPs, suspend can change the power
> state and the software state, which we do not want
> reflected in the dump because it may no longer match
> the state at the time of the hang.
>
> Solution:
> The IP state should be dumped as close as possible to the
> HW state at the time the GPU hung, without trying to
> reinitialize any resource, so trigger the IP dump before
> the IPs are suspended.
>
> Signed-off-by: Sunil Khatri <sunil.khatri at amd.com>
Acked-by: Alex Deucher <alexander.deucher at amd.com>
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 60 +++++++++++-----------
> 1 file changed, 30 insertions(+), 30 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 730dae77570c..74f6f15e73b5 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -5277,11 +5277,29 @@ int amdgpu_device_mode1_reset(struct amdgpu_device *adev)
> return ret;
> }
>
> +static int amdgpu_reset_reg_dumps(struct amdgpu_device *adev)
> +{
> + int i;
> +
> + lockdep_assert_held(&adev->reset_domain->sem);
> +
> + for (i = 0; i < adev->reset_info.num_regs; i++) {
> + adev->reset_info.reset_dump_reg_value[i] =
> + RREG32(adev->reset_info.reset_dump_reg_list[i]);
> +
> + trace_amdgpu_reset_reg_dumps(adev->reset_info.reset_dump_reg_list[i],
> + adev->reset_info.reset_dump_reg_value[i]);
> + }
> +
> + return 0;
> +}
> +
> int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
> struct amdgpu_reset_context *reset_context)
> {
> int i, r = 0;
> struct amdgpu_job *job = NULL;
> + struct amdgpu_device *tmp_adev = reset_context->reset_req_dev;
> bool need_full_reset =
> test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
>
> @@ -5340,6 +5358,18 @@ int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
> }
> }
>
> + if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags)) {
> + amdgpu_reset_reg_dumps(tmp_adev);
> +
> + dev_info(tmp_adev->dev, "Dumping IP State\n");
> + /* Trigger ip dump before we reset the asic */
> + for (i = 0; i < tmp_adev->num_ip_blocks; i++)
> + if (tmp_adev->ip_blocks[i].version->funcs->dump_ip_state)
> + tmp_adev->ip_blocks[i].version->funcs->dump_ip_state(
> + (void *)tmp_adev);
> + dev_info(tmp_adev->dev, "Dumping IP State Completed\n");
> + }
> +
> if (need_full_reset)
> r = amdgpu_device_ip_suspend(adev);
> if (need_full_reset)
> @@ -5352,47 +5382,17 @@ int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
> return r;
> }
>
> -static int amdgpu_reset_reg_dumps(struct amdgpu_device *adev)
> -{
> - int i;
> -
> - lockdep_assert_held(&adev->reset_domain->sem);
> -
> - for (i = 0; i < adev->reset_info.num_regs; i++) {
> - adev->reset_info.reset_dump_reg_value[i] =
> - RREG32(adev->reset_info.reset_dump_reg_list[i]);
> -
> - trace_amdgpu_reset_reg_dumps(adev->reset_info.reset_dump_reg_list[i],
> - adev->reset_info.reset_dump_reg_value[i]);
> - }
> -
> - return 0;
> -}
> -
> int amdgpu_do_asic_reset(struct list_head *device_list_handle,
> struct amdgpu_reset_context *reset_context)
> {
> struct amdgpu_device *tmp_adev = NULL;
> bool need_full_reset, skip_hw_reset, vram_lost = false;
> int r = 0;
> - uint32_t i;
>
> /* Try reset handler method first */
> tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
> reset_list);
>
> - if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags)) {
> - amdgpu_reset_reg_dumps(tmp_adev);
> -
> - dev_info(tmp_adev->dev, "Dumping IP State\n");
> - /* Trigger ip dump before we reset the asic */
> - for (i = 0; i < tmp_adev->num_ip_blocks; i++)
> - if (tmp_adev->ip_blocks[i].version->funcs->dump_ip_state)
> - tmp_adev->ip_blocks[i].version->funcs
> - ->dump_ip_state((void *)tmp_adev);
> - dev_info(tmp_adev->dev, "Dumping IP State Completed\n");
> - }
> -
> reset_context->reset_device_list = device_list_handle;
> r = amdgpu_reset_perform_reset(tmp_adev, reset_context);
> /* If reset handler not implemented, continue; otherwise return */
> --
> 2.34.1
>
More information about the amd-gfx
mailing list