[PATCH 1/2] drm/amdgpu: Add init level for post reset reinit
Zhou1, Tao
Tao.Zhou1 at amd.com
Tue Nov 19 06:49:39 UTC 2024
[AMD Official Use Only - AMD Internal Distribution Only]
The series is: Acked-by: Tao Zhou <tao.zhou1 at amd.com>
> -----Original Message-----
> From: Lazar, Lijo <Lijo.Lazar at amd.com>
> Sent: Friday, November 15, 2024 4:04 PM
> To: amd-gfx at lists.freedesktop.org
> Cc: Zhang, Hawking <Hawking.Zhang at amd.com>; Deucher, Alexander
> <Alexander.Deucher at amd.com>; Zhou1, Tao <Tao.Zhou1 at amd.com>
> Subject: [PATCH 1/2] drm/amdgpu: Add init level for post reset reinit
>
> When a device needs to be reset before initialization, it's not required for all IPs to be
> initialized before the reset. In such cases, the driver needs to identify whether an IP/feature
> is being initialized for the first time or reinitialized after a reset.
>
> Add a RESET_RECOVERY init level to identify the post-reset reinitialization phase. This
> only provides device-level identification; IPs/features may also choose to track their
> state independently.
>
> Signed-off-by: Lijo Lazar <lijo.lazar at amd.com>
> ---
> drivers/gpu/drm/amd/amdgpu/aldebaran.c | 4 ++++
> drivers/gpu/drm/amd/amdgpu/amdgpu.h | 1 +
> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 24 ++++++++++++++++++---
> drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c | 5 +++++
> drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h | 2 ++
> drivers/gpu/drm/amd/amdgpu/sienna_cichlid.c | 2 ++
> drivers/gpu/drm/amd/amdgpu/smu_v13_0_10.c | 2 ++
> 7 files changed, 37 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/aldebaran.c
> b/drivers/gpu/drm/amd/amdgpu/aldebaran.c
> index 6a2fd9e4f470..57c1ca055388 100644
> --- a/drivers/gpu/drm/amd/amdgpu/aldebaran.c
> +++ b/drivers/gpu/drm/amd/amdgpu/aldebaran.c
> @@ -330,6 +330,8 @@ aldebaran_mode2_restore_hwcontext(struct
> amdgpu_reset_control *reset_ctl,
> }
>
> list_for_each_entry(tmp_adev, reset_device_list, reset_list) {
> + amdgpu_set_init_level(tmp_adev,
> + AMDGPU_INIT_LEVEL_RESET_RECOVERY);
> dev_info(tmp_adev->dev,
> "GPU reset succeeded, trying to resume\n");
> /*TBD: Ideally should clear only GFX, SDMA blocks*/ @@ -377,6
> +379,8 @@ aldebaran_mode2_restore_hwcontext(struct amdgpu_reset_control
> *reset_ctl,
> tmp_adev);
>
> if (!r) {
> + amdgpu_set_init_level(tmp_adev,
> + AMDGPU_INIT_LEVEL_DEFAULT);
> amdgpu_irq_gpu_reset_resume_helper(tmp_adev);
>
> r = amdgpu_ib_ring_tests(tmp_adev);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index 4f72ad4e843f..b8ef89d64704 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -846,6 +846,7 @@ struct amdgpu_mqd {
> enum amdgpu_init_lvl_id {
> AMDGPU_INIT_LEVEL_DEFAULT,
> AMDGPU_INIT_LEVEL_MINIMAL_XGMI,
> + AMDGPU_INIT_LEVEL_RESET_RECOVERY,
> };
>
> struct amdgpu_init_level {
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 0419b37e75a8..415c469c2d80 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -155,6 +155,11 @@ struct amdgpu_init_level amdgpu_init_default = {
> .hwini_ip_block_mask = AMDGPU_IP_BLK_MASK_ALL, };
>
> +struct amdgpu_init_level amdgpu_init_recovery = {
> + .level = AMDGPU_INIT_LEVEL_RESET_RECOVERY,
> + .hwini_ip_block_mask = AMDGPU_IP_BLK_MASK_ALL, };
> +
> /*
> * Minimal blocks needed to be initialized before a XGMI hive can be reset. This
> * is used for cases like reset on initialization where the entire hive needs to @@ -
> 181,6 +186,9 @@ void amdgpu_set_init_level(struct amdgpu_device *adev,
> case AMDGPU_INIT_LEVEL_MINIMAL_XGMI:
> adev->init_lvl = &amdgpu_init_minimal_xgmi;
> break;
> + case AMDGPU_INIT_LEVEL_RESET_RECOVERY:
> + adev->init_lvl = &amdgpu_init_recovery;
> + break;
> case AMDGPU_INIT_LEVEL_DEFAULT:
> fallthrough;
> default:
> @@ -5445,7 +5453,7 @@ int amdgpu_device_reinit_after_reset(struct
> amdgpu_reset_context *reset_context)
> struct list_head *device_list_handle;
> bool full_reset, vram_lost = false;
> struct amdgpu_device *tmp_adev;
> - int r;
> + int r, init_level;
>
> device_list_handle = reset_context->reset_device_list;
>
> @@ -5454,10 +5462,17 @@ int amdgpu_device_reinit_after_reset(struct
> amdgpu_reset_context *reset_context)
>
> full_reset = test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
>
> + /**
> + * If it's reset on init, it's default init level, otherwise keep level
> + * as recovery level.
> + */
> + if (reset_context->method == AMD_RESET_METHOD_ON_INIT)
> + init_level = AMDGPU_INIT_LEVEL_DEFAULT;
> + else
> + init_level = AMDGPU_INIT_LEVEL_RESET_RECOVERY;
> r = 0;
> list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
> - /* After reset, it's default init level */
> - amdgpu_set_init_level(tmp_adev,
> AMDGPU_INIT_LEVEL_DEFAULT);
> + amdgpu_set_init_level(tmp_adev, init_level);
> if (full_reset) {
> /* post card */
> amdgpu_ras_clear_err_state(tmp_adev);
> @@ -5544,6 +5559,9 @@ int amdgpu_device_reinit_after_reset(struct
> amdgpu_reset_context *reset_context)
>
> out:
> if (!r) {
> + /* IP init is complete now, set level as default */
> + amdgpu_set_init_level(tmp_adev,
> + AMDGPU_INIT_LEVEL_DEFAULT);
> amdgpu_irq_gpu_reset_resume_helper(tmp_adev);
> r = amdgpu_ib_ring_tests(tmp_adev);
> if (r) {
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
> index 4fc0ee01d56b..59a29fa12db3 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
> @@ -343,3 +343,8 @@ void amdgpu_reset_get_desc(struct amdgpu_reset_context
> *rst_ctxt, char *buf,
> strscpy(buf, "unknown", len);
> }
> }
> +
> +bool amdgpu_reset_in_recovery(struct amdgpu_device *adev) {
> + return (adev->init_lvl->level ==
> AMDGPU_INIT_LEVEL_RESET_RECOVERY);
> +}
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
> index f8628bc898df..4d9b9701139b 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
> @@ -158,4 +158,6 @@ extern struct amdgpu_reset_handler
> xgmi_reset_on_init_handler; int amdgpu_reset_do_xgmi_reset_on_init(
> struct amdgpu_reset_context *reset_context);
>
> +bool amdgpu_reset_in_recovery(struct amdgpu_device *adev);
> +
> #endif
> diff --git a/drivers/gpu/drm/amd/amdgpu/sienna_cichlid.c
> b/drivers/gpu/drm/amd/amdgpu/sienna_cichlid.c
> index 9b01e074af47..2594467bdd87 100644
> --- a/drivers/gpu/drm/amd/amdgpu/sienna_cichlid.c
> +++ b/drivers/gpu/drm/amd/amdgpu/sienna_cichlid.c
> @@ -220,6 +220,7 @@ sienna_cichlid_mode2_restore_hwcontext(struct
> amdgpu_reset_control *reset_ctl,
> int r;
> struct amdgpu_device *tmp_adev = (struct amdgpu_device *)reset_ctl-
> >handle;
>
> + amdgpu_set_init_level(tmp_adev,
> AMDGPU_INIT_LEVEL_RESET_RECOVERY);
> dev_info(tmp_adev->dev,
> "GPU reset succeeded, trying to resume\n");
> r = sienna_cichlid_mode2_restore_ip(tmp_adev);
> @@ -237,6 +238,7 @@ sienna_cichlid_mode2_restore_hwcontext(struct
> amdgpu_reset_control *reset_ctl,
>
> amdgpu_irq_gpu_reset_resume_helper(tmp_adev);
>
> + amdgpu_set_init_level(tmp_adev, AMDGPU_INIT_LEVEL_DEFAULT);
> r = amdgpu_ib_ring_tests(tmp_adev);
> if (r) {
> dev_err(tmp_adev->dev,
> diff --git a/drivers/gpu/drm/amd/amdgpu/smu_v13_0_10.c
> b/drivers/gpu/drm/amd/amdgpu/smu_v13_0_10.c
> index e70ebad3f9fa..70569ea906bc 100644
> --- a/drivers/gpu/drm/amd/amdgpu/smu_v13_0_10.c
> +++ b/drivers/gpu/drm/amd/amdgpu/smu_v13_0_10.c
> @@ -221,6 +221,7 @@ smu_v13_0_10_mode2_restore_hwcontext(struct
> amdgpu_reset_control *reset_ctl,
> int r;
> struct amdgpu_device *tmp_adev = (struct amdgpu_device *)reset_ctl-
> >handle;
>
> + amdgpu_set_init_level(tmp_adev,
> AMDGPU_INIT_LEVEL_RESET_RECOVERY);
> dev_info(tmp_adev->dev,
> "GPU reset succeeded, trying to resume\n");
> r = smu_v13_0_10_mode2_restore_ip(tmp_adev);
> @@ -234,6 +235,7 @@ smu_v13_0_10_mode2_restore_hwcontext(struct
> amdgpu_reset_control *reset_ctl,
>
> amdgpu_irq_gpu_reset_resume_helper(tmp_adev);
>
> + amdgpu_set_init_level(tmp_adev, AMDGPU_INIT_LEVEL_DEFAULT);
> r = amdgpu_ib_ring_tests(tmp_adev);
> if (r) {
> dev_err(tmp_adev->dev,
> --
> 2.25.1
More information about the amd-gfx
mailing list