[PATCH 2/3] drm/amdgpu: return -ENODEV to user space when vram is lost

Christian König deathsimple at vodafone.de
Tue May 16 08:13:13 UTC 2017


Am 16.05.2017 um 08:44 schrieb Chunming Zhou:
> below ioctl will return -ENODEV:
> amdgpu_cs_ioctl
> amdgpu_cs_wait_ioctl
> amdgpu_cs_wait_fences_ioctl
> amdgpu_gem_va_ioctl
> amdgpu_info_ioctl
>
> Change-Id: I8970cde3301b7cfeb4263cc0f0e54aece215c98e
> Signed-off-by: Chunming Zhou <David1.Zhou at amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu.h        |  4 ++++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c     |  9 +++++++++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  4 +++-
>   drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c    |  2 ++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c    | 10 ++++++++++
>   5 files changed, 28 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index f9da215..dcd6203 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -855,6 +855,7 @@ struct amdgpu_fpriv {
>   	struct amdgpu_ctx_mgr	ctx_mgr;
>   	spinlock_t		sem_handles_lock;
>   	struct idr		sem_handles;
> +	u32			vram_lost_counter;
>   };
>   
>   /*
> @@ -1607,6 +1608,7 @@ struct amdgpu_device {
>   	atomic64_t			num_bytes_moved;
>   	atomic64_t			num_evictions;
>   	atomic_t			gpu_reset_counter;
> +	atomic_t			vram_lost_counter;
>   
>   	/* data for buffer migration throttling */
>   	struct {
> @@ -2005,6 +2007,8 @@ static inline void amdgpu_unregister_atpx_handler(void) {}
>   extern const struct drm_ioctl_desc amdgpu_ioctls_kms[];
>   extern const int amdgpu_max_kms_ioctl;
>   
> +bool amdgpu_kms_vram_lost(struct amdgpu_device *adev,
> +			  struct amdgpu_fpriv *fpriv);
>   int amdgpu_driver_load_kms(struct drm_device *dev, unsigned long flags);
>   int amdgpu_driver_unload_kms(struct drm_device *dev);
>   void amdgpu_driver_lastclose_kms(struct drm_device *dev);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> index b803412..911aa02 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> @@ -1097,6 +1097,7 @@ static int amdgpu_cs_submit(struct amdgpu_cs_parser *p,
>   int amdgpu_cs_ioctl(struct drm_device *dev, void *data, struct drm_file *filp)
>   {
>   	struct amdgpu_device *adev = dev->dev_private;
> +	struct amdgpu_fpriv *fpriv = filp->driver_priv;
>   	union drm_amdgpu_cs *cs = data;
>   	struct amdgpu_cs_parser parser = {};
>   	bool reserved_buffers = false;
> @@ -1104,6 +1105,8 @@ int amdgpu_cs_ioctl(struct drm_device *dev, void *data, struct drm_file *filp)
>   
>   	if (!adev->accel_working)
>   		return -EBUSY;
> +	if (amdgpu_kms_vram_lost(adev, fpriv))
> +		return -ENODEV;
>   
>   	parser.adev = adev;
>   	parser.filp = filp;
> @@ -1165,12 +1168,15 @@ int amdgpu_cs_wait_ioctl(struct drm_device *dev, void *data,
>   {
>   	union drm_amdgpu_wait_cs *wait = data;
>   	struct amdgpu_device *adev = dev->dev_private;
> +	struct amdgpu_fpriv *fpriv = filp->driver_priv;
>   	unsigned long timeout = amdgpu_gem_timeout(wait->in.timeout);
>   	struct amdgpu_ring *ring = NULL;
>   	struct amdgpu_ctx *ctx;
>   	struct fence *fence;
>   	long r;
>   
> +	if (amdgpu_kms_vram_lost(adev, fpriv))
> +		return -ENODEV;
>   	r = amdgpu_cs_get_ring(adev, wait->in.ip_type, wait->in.ip_instance,
>   			       wait->in.ring, &ring);
>   	if (r)
> @@ -1344,12 +1350,15 @@ int amdgpu_cs_wait_fences_ioctl(struct drm_device *dev, void *data,
>   				struct drm_file *filp)
>   {
>   	struct amdgpu_device *adev = dev->dev_private;
> +	struct amdgpu_fpriv *fpriv = filp->driver_priv;
>   	union drm_amdgpu_wait_fences *wait = data;
>   	uint32_t fence_count = wait->in.fence_count;
>   	struct drm_amdgpu_fence *fences_user;
>   	struct drm_amdgpu_fence *fences;
>   	int r;
>   
> +	if (amdgpu_kms_vram_lost(adev, fpriv))
> +		return -ENODEV;
>   	/* Get the fences from userspace */
>   	fences = kmalloc_array(fence_count, sizeof(struct drm_amdgpu_fence),
>   			GFP_KERNEL);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 58af9ea..417b8f8 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -2919,8 +2919,10 @@ int amdgpu_gpu_reset(struct amdgpu_device *adev)
>   			if (r)
>   				goto out;
>   			vram_lost = amdgpu_check_vram_lost(adev);
> -			if (vram_lost)
> +			if (vram_lost) {
>   				DRM_ERROR("VRAM is lost!\n");
> +				atomic_inc(&adev->vram_lost_counter);
> +			}
>   			r = amdgpu_ttm_recover_gart(adev);
>   			if (r)
>   				goto out;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
> index d8275ef..0f0b736 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
> @@ -776,6 +776,8 @@ int amdgpu_gem_va_ioctl(struct drm_device *dev, void *data,
>   
>   	if (!adev->vm_manager.enabled)
>   		return -ENOTTY;
> +	if (amdgpu_kms_vram_lost(adev, fpriv))
> +		return -ENODEV;

We should only block AMDGPU_VA_OP_MAP and AMDGPU_VA_OP_REPLACE here and 
still allow AMDGPU_VA_OP_UNMAP and AMDGPU_VA_OP_CLEAR.

BTW: How should the UMD recover from that situation? Completely close 
the fd and recreate it?

That might be tricky for processes like X or the Compositor. Should we 
have an IOCTL to reset the vram_lost counter for an fd?

Christian.

>   
>   	if (args->va_address < AMDGPU_VA_RESERVED_SIZE) {
>   		dev_err(&dev->pdev->dev,
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
> index 368829a..a231aa1 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
> @@ -235,6 +235,7 @@ static int amdgpu_firmware_info(struct drm_amdgpu_info_firmware *fw_info,
>   static int amdgpu_info_ioctl(struct drm_device *dev, void *data, struct drm_file *filp)
>   {
>   	struct amdgpu_device *adev = dev->dev_private;
> +	struct amdgpu_fpriv *fpriv = filp->driver_priv;
>   	struct drm_amdgpu_info *info = data;
>   	struct amdgpu_mode_info *minfo = &adev->mode_info;
>   	void __user *out = (void __user *)(uintptr_t)info->return_pointer;
> @@ -247,6 +248,8 @@ static int amdgpu_info_ioctl(struct drm_device *dev, void *data, struct drm_file
>   
>   	if (!info->return_size || !info->return_pointer)
>   		return -EINVAL;
> +	if (amdgpu_kms_vram_lost(adev, fpriv))
> +		return -ENODEV;
>   
>   	switch (info->query) {
>   	case AMDGPU_INFO_VIRTUAL_RANGE: {
> @@ -779,6 +782,12 @@ void amdgpu_driver_lastclose_kms(struct drm_device *dev)
>   	vga_switcheroo_process_delayed_switch();
>   }
>   
> +bool amdgpu_kms_vram_lost(struct amdgpu_device *adev,
> +			  struct amdgpu_fpriv *fpriv)
> +{
> +	return fpriv->vram_lost_counter != atomic_read(&adev->vram_lost_counter);
> +}
> +
>   /**
>    * amdgpu_driver_open_kms - drm callback for open
>    *
> @@ -833,6 +842,7 @@ int amdgpu_driver_open_kms(struct drm_device *dev, struct drm_file *file_priv)
>   
>   	amdgpu_ctx_mgr_init(&fpriv->ctx_mgr);
>   
> +	fpriv->vram_lost_counter = atomic_read(&adev->vram_lost_counter);
>   	file_priv->driver_priv = fpriv;
>   
>   out_suspend:




More information about the amd-gfx mailing list