[PATCH 5/5] drm/amdgpu: skip GFX FED error in page fault handling

Tue Feb 20 14:20:31 UTC 2024

Am 19.02.24 um 09:15 schrieb Tao Zhou:
> Let the kfd interrupt handler process it.
>
> Signed-off-by: Tao Zhou <tao.zhou1 at amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 10 +++++++++-
>   1 file changed, 9 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> index 773725a92cf1..70defc394b7b 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> @@ -552,7 +552,7 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device *adev,
>   {
>   	bool retry_fault = !!(entry->src_data[1] & 0x80);
>   	bool write_fault = !!(entry->src_data[1] & 0x20);
> -	uint32_t status = 0, cid = 0, rw = 0;
> +	uint32_t status = 0, cid = 0, rw = 0, fed = 0;
>   	struct amdgpu_task_info task_info;
>   	struct amdgpu_vmhub *hub;
>   	const char *mmhub_cid;
> @@ -663,6 +663,14 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device *adev,
>   	status = RREG32(hub->vm_l2_pro_fault_status);
>   	cid = REG_GET_FIELD(status, VM_L2_PROTECTION_FAULT_STATUS, CID);
>   	rw = REG_GET_FIELD(status, VM_L2_PROTECTION_FAULT_STATUS, RW);
> +	fed = REG_GET_FIELD(status, VM_L2_PROTECTION_FAULT_STATUS, FED);
> +
> +	/* for gfx fed error, kfd will handle it, return directly */
> +	if (fed && amdgpu_ras_is_poison_mode_supported(adev) &&
> +	    amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(9, 4, 2) &&
> +	    !strcmp(hub_name, "gfxhub0"))

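Please never ever use strcmp() to make a decision like that,
*especially* not in an interrupt handler. The hub is already identified
numerically (e.g. the vmhub value passed to
amdgpu_vm_update_fault_cache() further down), so compare that instead
of the hub name string.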

Regards,
Christian.

> +		return 1;
> +
>   	WREG32_P(hub->vm_l2_pro_fault_cntl, 1, ~1);
>   #ifdef HAVE_STRUCT_XARRAY
>   	amdgpu_vm_update_fault_cache(adev, entry->pasid, addr, status, vmhub);