[PATCH] drm/amdgpu: revert VRAM lost handling

Nicolai Hähnle nhaehnle at gmail.com
Mon Oct 9 15:16:15 UTC 2017


On 09.10.2017 10:16, Christian König wrote:
> From: Christian König <christian.koenig at amd.com>
> 
> Revert "drm/amdgpu: skip all jobs of guilty vm" and
> "drm/amdgpu: return -ENODEV to user space when vram is lost v2"
> 
> Forcing userspace to restart without a chance to recover in case of a GPU reset
> doesn't make much sense and just completely breaks GPU reset handling and makes
> the system unusable after a reset.
> 
> Signed-off-by: Christian König <christian.koenig at amd.com>

Acked-by: Nicolai Hähnle <nicolai.haehnle at amd.com>

> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu.h        |  4 ----
>   drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c     | 14 --------------
>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  4 +---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c    |  5 -----
>   drivers/gpu/drm/amd/amdgpu/amdgpu_job.c    | 15 ++++-----------
>   drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c    | 10 ----------
>   6 files changed, 5 insertions(+), 47 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index 71e971f..81dd5ef 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -772,7 +772,6 @@ struct amdgpu_fpriv {
>   	struct mutex		bo_list_lock;
>   	struct idr		bo_list_handles;
>   	struct amdgpu_ctx_mgr	ctx_mgr;
> -	u32			vram_lost_counter;
>   };
>   
>   /*
> @@ -1501,7 +1500,6 @@ struct amdgpu_device {
>   	atomic64_t			num_evictions;
>   	atomic64_t			num_vram_cpu_page_faults;
>   	atomic_t			gpu_reset_counter;
> -	atomic_t			vram_lost_counter;
>   
>   	/* data for buffer migration throttling */
>   	struct {
> @@ -1845,8 +1843,6 @@ static inline bool amdgpu_has_atpx(void) { return false; }
>   extern const struct drm_ioctl_desc amdgpu_ioctls_kms[];
>   extern const int amdgpu_max_kms_ioctl;
>   
> -bool amdgpu_kms_vram_lost(struct amdgpu_device *adev,
> -			  struct amdgpu_fpriv *fpriv);
>   int amdgpu_driver_load_kms(struct drm_device *dev, unsigned long flags);
>   void amdgpu_driver_unload_kms(struct drm_device *dev);
>   void amdgpu_driver_lastclose_kms(struct drm_device *dev);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> index ab83dfc..adb0c1c 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> @@ -1189,7 +1189,6 @@ static int amdgpu_cs_submit(struct amdgpu_cs_parser *p,
>   int amdgpu_cs_ioctl(struct drm_device *dev, void *data, struct drm_file *filp)
>   {
>   	struct amdgpu_device *adev = dev->dev_private;
> -	struct amdgpu_fpriv *fpriv = filp->driver_priv;
>   	union drm_amdgpu_cs *cs = data;
>   	struct amdgpu_cs_parser parser = {};
>   	bool reserved_buffers = false;
> @@ -1197,8 +1196,6 @@ int amdgpu_cs_ioctl(struct drm_device *dev, void *data, struct drm_file *filp)
>   
>   	if (!adev->accel_working)
>   		return -EBUSY;
> -	if (amdgpu_kms_vram_lost(adev, fpriv))
> -		return -ENODEV;
>   
>   	parser.adev = adev;
>   	parser.filp = filp;
> @@ -1257,16 +1254,12 @@ int amdgpu_cs_wait_ioctl(struct drm_device *dev, void *data,
>   {
>   	union drm_amdgpu_wait_cs *wait = data;
>   	struct amdgpu_device *adev = dev->dev_private;
> -	struct amdgpu_fpriv *fpriv = filp->driver_priv;
>   	unsigned long timeout = amdgpu_gem_timeout(wait->in.timeout);
>   	struct amdgpu_ring *ring = NULL;
>   	struct amdgpu_ctx *ctx;
>   	struct dma_fence *fence;
>   	long r;
>   
> -	if (amdgpu_kms_vram_lost(adev, fpriv))
> -		return -ENODEV;
> -
>   	ctx = amdgpu_ctx_get(filp->driver_priv, wait->in.ctx_id);
>   	if (ctx == NULL)
>   		return -EINVAL;
> @@ -1335,16 +1328,12 @@ int amdgpu_cs_fence_to_handle_ioctl(struct drm_device *dev, void *data,
>   				    struct drm_file *filp)
>   {
>   	struct amdgpu_device *adev = dev->dev_private;
> -	struct amdgpu_fpriv *fpriv = filp->driver_priv;
>   	union drm_amdgpu_fence_to_handle *info = data;
>   	struct dma_fence *fence;
>   	struct drm_syncobj *syncobj;
>   	struct sync_file *sync_file;
>   	int fd, r;
>   
> -	if (amdgpu_kms_vram_lost(adev, fpriv))
> -		return -ENODEV;
> -
>   	fence = amdgpu_cs_get_fence(adev, filp, &info->in.fence);
>   	if (IS_ERR(fence))
>   		return PTR_ERR(fence);
> @@ -1506,15 +1495,12 @@ int amdgpu_cs_wait_fences_ioctl(struct drm_device *dev, void *data,
>   				struct drm_file *filp)
>   {
>   	struct amdgpu_device *adev = dev->dev_private;
> -	struct amdgpu_fpriv *fpriv = filp->driver_priv;
>   	union drm_amdgpu_wait_fences *wait = data;
>   	uint32_t fence_count = wait->in.fence_count;
>   	struct drm_amdgpu_fence *fences_user;
>   	struct drm_amdgpu_fence *fences;
>   	int r;
>   
> -	if (amdgpu_kms_vram_lost(adev, fpriv))
> -		return -ENODEV;
>   	/* Get the fences from userspace */
>   	fences = kmalloc_array(fence_count, sizeof(struct drm_amdgpu_fence),
>   			GFP_KERNEL);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 24f6e3c..6d641e0 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -2951,10 +2951,8 @@ int amdgpu_gpu_reset(struct amdgpu_device *adev)
>   			if (r)
>   				goto out;
>   			vram_lost = amdgpu_check_vram_lost(adev);
> -			if (vram_lost) {
> +			if (vram_lost)
>   				DRM_ERROR("VRAM is lost!\n");
> -				atomic_inc(&adev->vram_lost_counter);
> -			}
>   			r = amdgpu_ttm_recover_gart(adev);
>   			if (r)
>   				goto out;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
> index b0d45c8..c69048c 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
> @@ -577,11 +577,6 @@ int amdgpu_gem_va_ioctl(struct drm_device *dev, void *data,
>   			args->operation);
>   		return -EINVAL;
>   	}
> -	if ((args->operation == AMDGPU_VA_OP_MAP) ||
> -	    (args->operation == AMDGPU_VA_OP_REPLACE)) {
> -		if (amdgpu_kms_vram_lost(adev, fpriv))
> -			return -ENODEV;
> -	}
>   
>   	INIT_LIST_HEAD(&list);
>   	INIT_LIST_HEAD(&duplicates);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> index 4510627..63bd372 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> @@ -178,7 +178,6 @@ static struct dma_fence *amdgpu_job_run(struct amd_sched_job *sched_job)
>   {
>   	struct dma_fence *fence = NULL;
>   	struct amdgpu_job *job;
> -	struct amdgpu_fpriv *fpriv = NULL;
>   	int r;
>   
>   	if (!sched_job) {
> @@ -190,16 +189,10 @@ static struct dma_fence *amdgpu_job_run(struct amd_sched_job *sched_job)
>   	BUG_ON(amdgpu_sync_peek_fence(&job->sync, NULL));
>   
>   	trace_amdgpu_sched_run_job(job);
> -	if (job->vm)
> -		fpriv = container_of(job->vm, struct amdgpu_fpriv, vm);
> -	/* skip ib schedule when vram is lost */
> -	if (fpriv && amdgpu_kms_vram_lost(job->adev, fpriv))
> -		DRM_ERROR("Skip scheduling IBs!\n");
> -	else {
> -		r = amdgpu_ib_schedule(job->ring, job->num_ibs, job->ibs, job, &fence);
> -		if (r)
> -			DRM_ERROR("Error scheduling IBs (%d)\n", r);
> -	}
> +	r = amdgpu_ib_schedule(job->ring, job->num_ibs, job->ibs, job, &fence);
> +	if (r)
> +		DRM_ERROR("Error scheduling IBs (%d)\n", r);
> +
>   	/* if gpu reset, hw fence will be replaced here */
>   	dma_fence_put(job->fence);
>   	job->fence = dma_fence_get(fence);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
> index 8c6fd56..0fc36b2 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
> @@ -269,7 +269,6 @@ static int amdgpu_firmware_info(struct drm_amdgpu_info_firmware *fw_info,
>   static int amdgpu_info_ioctl(struct drm_device *dev, void *data, struct drm_file *filp)
>   {
>   	struct amdgpu_device *adev = dev->dev_private;
> -	struct amdgpu_fpriv *fpriv = filp->driver_priv;
>   	struct drm_amdgpu_info *info = data;
>   	struct amdgpu_mode_info *minfo = &adev->mode_info;
>   	void __user *out = (void __user *)(uintptr_t)info->return_pointer;
> @@ -282,8 +281,6 @@ static int amdgpu_info_ioctl(struct drm_device *dev, void *data, struct drm_file
>   
>   	if (!info->return_size || !info->return_pointer)
>   		return -EINVAL;
> -	if (amdgpu_kms_vram_lost(adev, fpriv))
> -		return -ENODEV;
>   
>   	switch (info->query) {
>   	case AMDGPU_INFO_ACCEL_WORKING:
> @@ -791,12 +788,6 @@ void amdgpu_driver_lastclose_kms(struct drm_device *dev)
>   	vga_switcheroo_process_delayed_switch();
>   }
>   
> -bool amdgpu_kms_vram_lost(struct amdgpu_device *adev,
> -			  struct amdgpu_fpriv *fpriv)
> -{
> -	return fpriv->vram_lost_counter != atomic_read(&adev->vram_lost_counter);
> -}
> -
>   /**
>    * amdgpu_driver_open_kms - drm callback for open
>    *
> @@ -853,7 +844,6 @@ int amdgpu_driver_open_kms(struct drm_device *dev, struct drm_file *file_priv)
>   
>   	amdgpu_ctx_mgr_init(&fpriv->ctx_mgr);
>   
> -	fpriv->vram_lost_counter = atomic_read(&adev->vram_lost_counter);
>   	file_priv->driver_priv = fpriv;
>   
>   out_suspend:
> 


-- 
Lerne, wie die Welt wirklich ist,
Aber vergiss niemals, wie sie sein sollte.


More information about the amd-gfx mailing list