[PATCH v2] drm/amdgpu: Fix error handling in amdgpu_ras_add_bad_pages

Thu Jan 2 06:01:12 UTC 2025

Ping!?

On 12/17/2024 3:08 PM, Srinivasan Shanmugam wrote:
> Ensure that amdgpu_ras_add_bad_pages() returns an appropriate error code
> when an error condition is detected.
>
> Fixes the following warnings:
> drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c:2849 amdgpu_ras_add_bad_pages() warn: missing error code here? 'amdgpu_umc_pages_in_a_row()' failed.
> drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c:2884 amdgpu_ras_add_bad_pages() warn: missing error code here? 'amdgpu_ras_mca2pa()' failed.
>
> Fixes: 9fe61c21405a ("drm/amdgpu: parse legacy RAS bad page mixed with new data in various NPS modes")
> Reported-by: Dan Carpenter <dan.carpenter at linaro.org>
> Cc: YiPeng Chai <yipeng.chai at amd.com>
> Cc: Tao Zhou <tao.zhou1 at amd.com>
> Cc: Hawking Zhang <Hawking.Zhang at amd.com>
> Cc: Christian König <christian.koenig at amd.com>
> Cc: Alex Deucher <alexander.deucher at amd.com>
> Signed-off-by: Srinivasan Shanmugam <srinivasan.shanmugam at amd.com>
> ---
> v2:
>   - s/-EIO/-EINVAL, retained the use of -EINVAL from
>     amdgpu_umc_pages_in_a_row and amdgpu_ras_mca2pa_by_idx, when the
>     RAS context is not initialized or the convert_ras_err_addr function is
>     unavailable. (Thomas)
>
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 21 ++++++++++++++++-----
>   1 file changed, 16 insertions(+), 5 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index 01c947066a2e..f1371d1f8421 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -2832,8 +2832,10 @@ int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
>   
>   	mutex_lock(&con->recovery_lock);
>   	data = con->eh_data;
> -	if (!data)
> +	if (!data) {
> +		ret = -EINVAL;
>   		goto free;
> +	}
>   
>   	for (i = 0; i < pages; i++) {
>   		if (from_rom &&
> @@ -2845,26 +2847,34 @@ int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
>   						 * one row
>   						 */
>   						if (amdgpu_umc_pages_in_a_row(adev, &err_data,
> -								bps[i].retired_page << AMDGPU_GPU_PAGE_SHIFT))
> +									      bps[i].retired_page <<
> +									      AMDGPU_GPU_PAGE_SHIFT)) {
> +							ret = -EINVAL;
>   							goto free;
> -						else
> +						} else {
>   							find_pages_per_pa = true;
> +						}
>   					} else {
>   						/* unsupported cases */
> +						ret = -EOPNOTSUPP;
>   						goto free;
>   					}
>   				}
>   			} else {
>   				if (amdgpu_umc_pages_in_a_row(adev, &err_data,
> -						bps[i].retired_page << AMDGPU_GPU_PAGE_SHIFT))
> +						bps[i].retired_page << AMDGPU_GPU_PAGE_SHIFT)) {
> +					ret = -EINVAL;
>   					goto free;
> +				}
>   			}
>   		} else {
>   			if (from_rom && !find_pages_per_pa) {
>   				if (bps[i].retired_page & UMC_CHANNEL_IDX_V2) {
>   					/* bad page in any NPS mode in eeprom */
> -					if (amdgpu_ras_mca2pa_by_idx(adev, &bps[i], &err_data))
> +					if (amdgpu_ras_mca2pa_by_idx(adev, &bps[i], &err_data)) {
> +						ret = -EINVAL;
>   						goto free;
> +					}
>   				} else {
>   					/* legacy bad page in eeprom, generated only in
>   					 * NPS1 mode
> @@ -2881,6 +2891,7 @@ int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
>   							/* non-nps1 mode, old RAS TA
>   							 * can't support it
>   							 */
> +							ret = -EOPNOTSUPP;
>   							goto free;
>   						}
>   					}