[PATCH 1/4] drm/amdgpu: check if vram is lost v2

Christian König deathsimple at vodafone.de
Tue May 16 10:49:06 UTC 2017


Am 16.05.2017 um 11:25 schrieb Chunming Zhou:
> backup the first 64 bytes of the gart table as reset magic, check if the magic is the same
> after gpu hw reset.
> v2: use memcmp instead of a manual implementation.
>
> Change-Id: I9a73720da4084ea8677c3031dfb62e8157ee5704
> Signed-off-by: Chunming Zhou <David1.Zhou at amd.com>

Patch #1-#3 are Reviewed-by: Christian König <christian.koenig at amd.com>

Patch #4:

You need to add the new enum on line 591 or otherwise you will get an 
"unsupported operation" error.

Line 604 should be changed as well or otherwise we need a BO for this 
operation.

A libdrm test case to just call this IOCTL would probably be a good idea.

In addition to that, I would ping Marek (Mesa) and Michel (DDX) for their 
opinion on this. It could be that this is completely superfluous and the 
UMDs need something else.

Regards,
Christian.

> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu.h        |  2 ++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 20 +++++++++++++++++++-
>   2 files changed, 21 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index de08ff0..f9da215 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -1502,6 +1502,7 @@ struct amdgpu_ssg {
>   #endif
>   };
>   
> +#define AMDGPU_RESET_MAGIC_NUM 64
>   struct amdgpu_device {
>   	struct device			*dev;
>   	struct drm_device		*ddev;
> @@ -1705,6 +1706,7 @@ struct amdgpu_device {
>   
>   	/* record hw reset is performed */
>   	bool has_hw_reset;
> +	u8				reset_magic[AMDGPU_RESET_MAGIC_NUM];
>   
>   };
>   
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 0a31fb1..c56ae4a 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -1685,6 +1685,17 @@ static int amdgpu_init(struct amdgpu_device *adev)
>   	return 0;
>   }
>   
> +static void amdgpu_fill_reset_magic(struct amdgpu_device *adev)
> +{
> +	memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM);
> +}
> +
> +static bool amdgpu_check_vram_lost(struct amdgpu_device *adev)
> +{
> +	return !!memcmp(adev->gart.ptr, adev->reset_magic,
> +			AMDGPU_RESET_MAGIC_NUM);
> +}
> +
>   static int amdgpu_late_init(struct amdgpu_device *adev)
>   {
>   	int i = 0, r;
> @@ -1715,6 +1726,8 @@ static int amdgpu_late_init(struct amdgpu_device *adev)
>   		}
>   	}
>   
> +	amdgpu_fill_reset_magic(adev);
> +
>   	return 0;
>   }
>   
> @@ -2830,7 +2843,7 @@ int amdgpu_gpu_reset(struct amdgpu_device *adev)
>   	struct drm_atomic_state *state = NULL;
>   	int i, r;
>   	int resched;
> -	bool need_full_reset;
> +	bool need_full_reset, vram_lost = false;
>   
>   	if (amdgpu_sriov_vf(adev))
>   		return amdgpu_sriov_gpu_reset(adev, true);
> @@ -2899,12 +2912,17 @@ int amdgpu_gpu_reset(struct amdgpu_device *adev)
>   			r = amdgpu_resume_phase1(adev);
>   			if (r)
>   				goto out;
> +			vram_lost = amdgpu_check_vram_lost(adev);
> +			if (vram_lost)
> +				DRM_ERROR("VRAM is lost!\n");
>   			r = amdgpu_ttm_recover_gart(adev);
>   			if (r)
>   				goto out;
>   			r = amdgpu_resume_phase2(adev);
>   			if (r)
>   				goto out;
> +			if (vram_lost)
> +				amdgpu_fill_reset_magic(adev);
>   		}
>   	}
>   out:




More information about the amd-gfx mailing list