[PATCH 1/3] drm/amdgpu: implement RAS interrupt handler for poison creation

Lazar, Lijo lijo.lazar at amd.com
Wed Apr 20 04:09:55 UTC 2022



On 4/20/2022 9:23 AM, Tao Zhou wrote:
> Prepare for the implementation of poison consumption handler.
> 
> Signed-off-by: Tao Zhou <tao.zhou1 at amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 62 ++++++++++++++-----------
>   1 file changed, 35 insertions(+), 27 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index 4bbed76b79c8..2d066cff70ea 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -1515,12 +1515,44 @@ static int amdgpu_ras_fs_fini(struct amdgpu_device *adev)
>   /* ras fs end */
>   
>   /* ih begin */
> +static void ras_interrupt_poison_creation_handler(struct ras_manager *obj,
> +				struct amdgpu_iv_entry *entry)
> +{
> +	struct ras_ih_data *data = &obj->ih_data;
> +	struct ras_err_data err_data = {0, 0, 0, NULL};
> +	int ret;
> +
> +	if (!data->cb)
> +		return;
> +
> +	if (amdgpu_ras_is_poison_mode_supported(obj->adev) &&
> +	    obj->head.block == AMDGPU_RAS_BLOCK__UMC)
> +		dev_info(obj->adev->dev,
> +			"Poison is created, no user action is needed.\n");
> +	else {

It is confusing to handle RAS interrupts under poison_creation_handler 
in non-poison mode.

Thanks,
Lijo

> +		/* Let IP handle its data, maybe we need get the output
> +		 * from the callback to update the error type/count, etc
> +		 */
> +		ret = data->cb(obj->adev, &err_data, entry);
> +		/* ue will trigger an interrupt, and in that case
> +		 * we need do a reset to recovery the whole system.
> +		 * But leave IP do that recovery, here we just dispatch
> +		 * the error.
> +		 */
> +		if (ret == AMDGPU_RAS_SUCCESS) {
> +			/* these counts could be left as 0 if
> +			 * some blocks do not count error number
> +			 */
> +			obj->err_data.ue_count += err_data.ue_count;
> +			obj->err_data.ce_count += err_data.ce_count;
> +		}
> +	}
> +}
> +
>   static void amdgpu_ras_interrupt_handler(struct ras_manager *obj)
>   {
>   	struct ras_ih_data *data = &obj->ih_data;
>   	struct amdgpu_iv_entry entry;
> -	int ret;
> -	struct ras_err_data err_data = {0, 0, 0, NULL};
>   
>   	while (data->rptr != data->wptr) {
>   		rmb();
> @@ -1531,31 +1563,7 @@ static void amdgpu_ras_interrupt_handler(struct ras_manager *obj)
>   		data->rptr = (data->aligned_element_size +
>   				data->rptr) % data->ring_size;
>   
> -		if (data->cb) {
> -			if (amdgpu_ras_is_poison_mode_supported(obj->adev) &&
> -			    obj->head.block == AMDGPU_RAS_BLOCK__UMC)
> -				dev_info(obj->adev->dev,
> -						"Poison is created, no user action is needed.\n");
> -			else {
> -				/* Let IP handle its data, maybe we need get the output
> -				 * from the callback to udpate the error type/count, etc
> -				 */
> -				memset(&err_data, 0, sizeof(err_data));
> -				ret = data->cb(obj->adev, &err_data, &entry);
> -				/* ue will trigger an interrupt, and in that case
> -				 * we need do a reset to recovery the whole system.
> -				 * But leave IP do that recovery, here we just dispatch
> -				 * the error.
> -				 */
> -				if (ret == AMDGPU_RAS_SUCCESS) {
> -					/* these counts could be left as 0 if
> -					 * some blocks do not count error number
> -					 */
> -					obj->err_data.ue_count += err_data.ue_count;
> -					obj->err_data.ce_count += err_data.ce_count;
> -				}
> -			}
> -		}
> +		ras_interrupt_poison_creation_handler(obj, &entry);
>   	}
>   }
>   
> 


More information about the amd-gfx mailing list