[PATCH] drm/amdkfd: Select reset method for poison handling

Lazar, Lijo lijo.lazar at amd.com
Fri Sep 6 09:47:24 UTC 2024



On 9/6/2024 1:42 PM, Hawking Zhang wrote:
> Driver mode-2 is only supported by relative new
> smc firmware.
> 
> Signed-off-by: Hawking Zhang <Hawking.Zhang at amd.com>
> ---
>  .../gpu/drm/amd/amdkfd/kfd_int_process_v9.c   | 40 +++++++++++++++----
>  1 file changed, 32 insertions(+), 8 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
> index fecdbbab9894..d46a13156ee9 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
> @@ -167,11 +167,23 @@ static void event_interrupt_poison_consumption_v9(struct kfd_node *dev,
>  	case SOC15_IH_CLIENTID_SE3SH:
>  	case SOC15_IH_CLIENTID_UTCL2:
>  		block = AMDGPU_RAS_BLOCK__GFX;
> -		if (amdgpu_ip_version(dev->adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) ||
> -			amdgpu_ip_version(dev->adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4))
> -			reset = AMDGPU_RAS_GPU_RESET_MODE1_RESET;
> -		else
> +		if (amdgpu_ip_version(dev->adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3)) {
> +			/* driver mode-2 for gfx poison is only supported by
> +			 * pmfw 0x00557300 and onwards */
> +			if (dev->adev->pm.fw_version < 0x00557300)
> +				reset = AMDGPU_RAS_GPU_RESET_MODE1_RESET;
> +			else
> +				reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET;
> +		} else if (amdgpu_ip_version(dev->adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4)) {
> +			/* driver mode-2 for gfx poison is only supported by
> +			 * pmfw 0x05550C00 and onwards */
> +			if (dev->adev->pm.fw_version < 0x05550C00)
> +				reset = AMDGPU_RAS_GPU_RESET_MODE1_RESET;
> +			else
> +				reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET;
> +		} else {
>  			reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET;
> +		}

I think it's better to handle this inside amdgpu_ras_do_recovery rather
than here.

Something like -
	int amdgpu_ras_reset_method_quirk(adev) which returns the right reset
method when (ras->gpu_reset_flags & AMDGPU_RAS_GPU_RESET_MODE2_RESET) is
set. Or add a few more flags like RAS_SDMA_POISON/RAS_GFX_POISON and
decide the method in amdgpu_ras handling.

Thanks,
Lijo


>  		break;
>  	case SOC15_IH_CLIENTID_VMC:
>  	case SOC15_IH_CLIENTID_VMC1:
> @@ -184,11 +196,23 @@ static void event_interrupt_poison_consumption_v9(struct kfd_node *dev,
>  	case SOC15_IH_CLIENTID_SDMA3:
>  	case SOC15_IH_CLIENTID_SDMA4:
>  		block = AMDGPU_RAS_BLOCK__SDMA;
> -		if (amdgpu_ip_version(dev->adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) ||
> -			amdgpu_ip_version(dev->adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4))
> -			reset = AMDGPU_RAS_GPU_RESET_MODE1_RESET;
> -		else
> +		if (amdgpu_ip_version(dev->adev, SDMA0_HWIP, 0) == IP_VERSION(4, 4, 2)) {
> +			/* driver mode-2 for gfx poison is only supported by
> +			 * pmfw 0x00557300 and onwards */
> +			if (dev->adev->pm.fw_version < 0x00557300)
> +				reset = AMDGPU_RAS_GPU_RESET_MODE1_RESET;
> +			else
> +				reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET;
> +		} else if (amdgpu_ip_version(dev->adev, SDMA0_HWIP, 0) == IP_VERSION(4, 4, 5)) {
> +			/* driver mode-2 for gfx poison is only supported by
> +			 * pmfw 0x05550C00 and onwards */
> +			if (dev->adev->pm.fw_version < 0x05550C00)
> +				reset = AMDGPU_RAS_GPU_RESET_MODE1_RESET;
> +			else
> +				reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET;
> +		} else {
>  			reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET;
> +		}
>  		break;
>  	default:
>  		dev_warn(dev->adev->dev,


More information about the amd-gfx mailing list