[PATCH 05/10] drm/amdgpu: Add helper to initialize badpage info
Alex Deucher
alexdeucher at gmail.com
Mon Sep 9 21:29:40 UTC 2024
On Mon, Sep 2, 2024 at 3:53 AM Lijo Lazar <lijo.lazar at amd.com> wrote:
>
> Add a separate function to read badpage data during initialization.
> Reading bad pages needs hardware access and cannot be done during
> reset. Hence, in cases where the device needs a full reset during
> init itself, attempting to read them will cause a deadlock.
>
> Signed-off-by: Lijo Lazar <lijo.lazar at amd.com>
Reviewed-by: Alex Deucher <alexander.deucher at amd.com>
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 2 +-
> drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 56 +++++++++++++++-------
> drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 4 +-
> 3 files changed, 41 insertions(+), 21 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index e28227869307..468c4f590183 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -2945,7 +2945,7 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev)
> * Note: theoretically, this should be called before all vram allocations
> * to protect retired page from abusing
> */
> - r = amdgpu_ras_recovery_init(adev);
> + r = amdgpu_ras_recovery_init(adev, true);
> if (r)
> goto init_failed;
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index 2076f157cb6a..65c891b6b999 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -3146,7 +3146,42 @@ static int amdgpu_ras_page_retirement_thread(void *param)
> return 0;
> }
>
> -int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
> +int amdgpu_ras_init_badpage_info(struct amdgpu_device *adev)
> +{
> + struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
> + int ret;
> +
> + if (!con || amdgpu_sriov_vf(adev))
> + return 0;
> +
> + ret = amdgpu_ras_eeprom_init(&con->eeprom_control);
> +
> + if (ret)
> + return ret;
> +
> + /* HW not usable */
> + if (amdgpu_ras_is_rma(adev))
> + return -EHWPOISON;
> +
> + if (con->eeprom_control.ras_num_recs) {
> + ret = amdgpu_ras_load_bad_pages(adev);
> + if (ret)
> + return ret;
> +
> + amdgpu_dpm_send_hbm_bad_pages_num(
> + adev, con->eeprom_control.ras_num_recs);
> +
> + if (con->update_channel_flag == true) {
> + amdgpu_dpm_send_hbm_bad_channel_flag(
> + adev, con->eeprom_control.bad_channel_bitmap);
> + con->update_channel_flag = false;
> + }
> + }
> +
> + return ret;
> +}
> +
> +int amdgpu_ras_recovery_init(struct amdgpu_device *adev, bool init_bp_info)
> {
> struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
> struct ras_err_handler_data **data;
> @@ -3187,25 +3222,10 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
> */
> if (adev->init_lvl->level == AMDGPU_INIT_LEVEL_MINIMAL)
> return 0;
> - ret = amdgpu_ras_eeprom_init(&con->eeprom_control);
> - /*
> - * This calling fails when is_rma is true or
> - * ret != 0.
> - */
> - if (amdgpu_ras_is_rma(adev) || ret)
> - goto free;
> -
> - if (con->eeprom_control.ras_num_recs) {
> - ret = amdgpu_ras_load_bad_pages(adev);
> + if (init_bp_info) {
> + ret = amdgpu_ras_init_badpage_info(adev);
> if (ret)
> goto free;
> -
> - amdgpu_dpm_send_hbm_bad_pages_num(adev, con->eeprom_control.ras_num_recs);
> -
> - if (con->update_channel_flag == true) {
> - amdgpu_dpm_send_hbm_bad_channel_flag(adev, con->eeprom_control.bad_channel_bitmap);
> - con->update_channel_flag = false;
> - }
> }
>
> mutex_init(&con->page_rsv_lock);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> index 669720a9c60a..871b2d6278e0 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> @@ -736,8 +736,8 @@ struct amdgpu_ras_block_hw_ops {
> * 8: feature disable
> */
>
> -
> -int amdgpu_ras_recovery_init(struct amdgpu_device *adev);
> +int amdgpu_ras_init_badpage_info(struct amdgpu_device *adev);
> +int amdgpu_ras_recovery_init(struct amdgpu_device *adev, bool init_bp_info);
>
> void amdgpu_ras_resume(struct amdgpu_device *adev);
> void amdgpu_ras_suspend(struct amdgpu_device *adev);
> --
> 2.25.1
>
More information about the amd-gfx mailing list