[PATCH] drm/amdgpu: revert VRAM lost handling
Nicolai Hähnle
nhaehnle at gmail.com
Mon Oct 9 15:16:15 UTC 2017
On 09.10.2017 10:16, Christian König wrote:
> From: Christian König <christian.koenig at amd.com>
>
> Revert "drm/amdgpu: skip all jobs of guilty vm" and
> "drm/amdgpu: return -ENODEV to user space when vram is lost v2"
>
> Forcing userspace to restart without a chance to recover in case of a GPU reset
> doesn't make much sense and just completely breaks GPU reset handling and makes
> the system unusable after a reset.
>
> Signed-off-by: Christian König <christian.koenig at amd.com>
Acked-by: Nicolai Hähnle <nicolai.haehnle at amd.com>
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu.h | 4 ----
> drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c | 14 --------------
> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 4 +---
> drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c | 5 -----
> drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 15 ++++-----------
> drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c | 10 ----------
> 6 files changed, 5 insertions(+), 47 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index 71e971f..81dd5ef 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -772,7 +772,6 @@ struct amdgpu_fpriv {
> struct mutex bo_list_lock;
> struct idr bo_list_handles;
> struct amdgpu_ctx_mgr ctx_mgr;
> - u32 vram_lost_counter;
> };
>
> /*
> @@ -1501,7 +1500,6 @@ struct amdgpu_device {
> atomic64_t num_evictions;
> atomic64_t num_vram_cpu_page_faults;
> atomic_t gpu_reset_counter;
> - atomic_t vram_lost_counter;
>
> /* data for buffer migration throttling */
> struct {
> @@ -1845,8 +1843,6 @@ static inline bool amdgpu_has_atpx(void) { return false; }
> extern const struct drm_ioctl_desc amdgpu_ioctls_kms[];
> extern const int amdgpu_max_kms_ioctl;
>
> -bool amdgpu_kms_vram_lost(struct amdgpu_device *adev,
> - struct amdgpu_fpriv *fpriv);
> int amdgpu_driver_load_kms(struct drm_device *dev, unsigned long flags);
> void amdgpu_driver_unload_kms(struct drm_device *dev);
> void amdgpu_driver_lastclose_kms(struct drm_device *dev);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> index ab83dfc..adb0c1c 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> @@ -1189,7 +1189,6 @@ static int amdgpu_cs_submit(struct amdgpu_cs_parser *p,
> int amdgpu_cs_ioctl(struct drm_device *dev, void *data, struct drm_file *filp)
> {
> struct amdgpu_device *adev = dev->dev_private;
> - struct amdgpu_fpriv *fpriv = filp->driver_priv;
> union drm_amdgpu_cs *cs = data;
> struct amdgpu_cs_parser parser = {};
> bool reserved_buffers = false;
> @@ -1197,8 +1196,6 @@ int amdgpu_cs_ioctl(struct drm_device *dev, void *data, struct drm_file *filp)
>
> if (!adev->accel_working)
> return -EBUSY;
> - if (amdgpu_kms_vram_lost(adev, fpriv))
> - return -ENODEV;
>
> parser.adev = adev;
> parser.filp = filp;
> @@ -1257,16 +1254,12 @@ int amdgpu_cs_wait_ioctl(struct drm_device *dev, void *data,
> {
> union drm_amdgpu_wait_cs *wait = data;
> struct amdgpu_device *adev = dev->dev_private;
> - struct amdgpu_fpriv *fpriv = filp->driver_priv;
> unsigned long timeout = amdgpu_gem_timeout(wait->in.timeout);
> struct amdgpu_ring *ring = NULL;
> struct amdgpu_ctx *ctx;
> struct dma_fence *fence;
> long r;
>
> - if (amdgpu_kms_vram_lost(adev, fpriv))
> - return -ENODEV;
> -
> ctx = amdgpu_ctx_get(filp->driver_priv, wait->in.ctx_id);
> if (ctx == NULL)
> return -EINVAL;
> @@ -1335,16 +1328,12 @@ int amdgpu_cs_fence_to_handle_ioctl(struct drm_device *dev, void *data,
> struct drm_file *filp)
> {
> struct amdgpu_device *adev = dev->dev_private;
> - struct amdgpu_fpriv *fpriv = filp->driver_priv;
> union drm_amdgpu_fence_to_handle *info = data;
> struct dma_fence *fence;
> struct drm_syncobj *syncobj;
> struct sync_file *sync_file;
> int fd, r;
>
> - if (amdgpu_kms_vram_lost(adev, fpriv))
> - return -ENODEV;
> -
> fence = amdgpu_cs_get_fence(adev, filp, &info->in.fence);
> if (IS_ERR(fence))
> return PTR_ERR(fence);
> @@ -1506,15 +1495,12 @@ int amdgpu_cs_wait_fences_ioctl(struct drm_device *dev, void *data,
> struct drm_file *filp)
> {
> struct amdgpu_device *adev = dev->dev_private;
> - struct amdgpu_fpriv *fpriv = filp->driver_priv;
> union drm_amdgpu_wait_fences *wait = data;
> uint32_t fence_count = wait->in.fence_count;
> struct drm_amdgpu_fence *fences_user;
> struct drm_amdgpu_fence *fences;
> int r;
>
> - if (amdgpu_kms_vram_lost(adev, fpriv))
> - return -ENODEV;
> /* Get the fences from userspace */
> fences = kmalloc_array(fence_count, sizeof(struct drm_amdgpu_fence),
> GFP_KERNEL);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 24f6e3c..6d641e0 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -2951,10 +2951,8 @@ int amdgpu_gpu_reset(struct amdgpu_device *adev)
> if (r)
> goto out;
> vram_lost = amdgpu_check_vram_lost(adev);
> - if (vram_lost) {
> + if (vram_lost)
> DRM_ERROR("VRAM is lost!\n");
> - atomic_inc(&adev->vram_lost_counter);
> - }
> r = amdgpu_ttm_recover_gart(adev);
> if (r)
> goto out;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
> index b0d45c8..c69048c 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
> @@ -577,11 +577,6 @@ int amdgpu_gem_va_ioctl(struct drm_device *dev, void *data,
> args->operation);
> return -EINVAL;
> }
> - if ((args->operation == AMDGPU_VA_OP_MAP) ||
> - (args->operation == AMDGPU_VA_OP_REPLACE)) {
> - if (amdgpu_kms_vram_lost(adev, fpriv))
> - return -ENODEV;
> - }
>
> INIT_LIST_HEAD(&list);
> INIT_LIST_HEAD(&duplicates);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> index 4510627..63bd372 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> @@ -178,7 +178,6 @@ static struct dma_fence *amdgpu_job_run(struct amd_sched_job *sched_job)
> {
> struct dma_fence *fence = NULL;
> struct amdgpu_job *job;
> - struct amdgpu_fpriv *fpriv = NULL;
> int r;
>
> if (!sched_job) {
> @@ -190,16 +189,10 @@ static struct dma_fence *amdgpu_job_run(struct amd_sched_job *sched_job)
> BUG_ON(amdgpu_sync_peek_fence(&job->sync, NULL));
>
> trace_amdgpu_sched_run_job(job);
> - if (job->vm)
> - fpriv = container_of(job->vm, struct amdgpu_fpriv, vm);
> - /* skip ib schedule when vram is lost */
> - if (fpriv && amdgpu_kms_vram_lost(job->adev, fpriv))
> - DRM_ERROR("Skip scheduling IBs!\n");
> - else {
> - r = amdgpu_ib_schedule(job->ring, job->num_ibs, job->ibs, job, &fence);
> - if (r)
> - DRM_ERROR("Error scheduling IBs (%d)\n", r);
> - }
> + r = amdgpu_ib_schedule(job->ring, job->num_ibs, job->ibs, job, &fence);
> + if (r)
> + DRM_ERROR("Error scheduling IBs (%d)\n", r);
> +
> /* if gpu reset, hw fence will be replaced here */
> dma_fence_put(job->fence);
> job->fence = dma_fence_get(fence);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
> index 8c6fd56..0fc36b2 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
> @@ -269,7 +269,6 @@ static int amdgpu_firmware_info(struct drm_amdgpu_info_firmware *fw_info,
> static int amdgpu_info_ioctl(struct drm_device *dev, void *data, struct drm_file *filp)
> {
> struct amdgpu_device *adev = dev->dev_private;
> - struct amdgpu_fpriv *fpriv = filp->driver_priv;
> struct drm_amdgpu_info *info = data;
> struct amdgpu_mode_info *minfo = &adev->mode_info;
> void __user *out = (void __user *)(uintptr_t)info->return_pointer;
> @@ -282,8 +281,6 @@ static int amdgpu_info_ioctl(struct drm_device *dev, void *data, struct drm_file
>
> if (!info->return_size || !info->return_pointer)
> return -EINVAL;
> - if (amdgpu_kms_vram_lost(adev, fpriv))
> - return -ENODEV;
>
> switch (info->query) {
> case AMDGPU_INFO_ACCEL_WORKING:
> @@ -791,12 +788,6 @@ void amdgpu_driver_lastclose_kms(struct drm_device *dev)
> vga_switcheroo_process_delayed_switch();
> }
>
> -bool amdgpu_kms_vram_lost(struct amdgpu_device *adev,
> - struct amdgpu_fpriv *fpriv)
> -{
> - return fpriv->vram_lost_counter != atomic_read(&adev->vram_lost_counter);
> -}
> -
> /**
> * amdgpu_driver_open_kms - drm callback for open
> *
> @@ -853,7 +844,6 @@ int amdgpu_driver_open_kms(struct drm_device *dev, struct drm_file *file_priv)
>
> amdgpu_ctx_mgr_init(&fpriv->ctx_mgr);
>
> - fpriv->vram_lost_counter = atomic_read(&adev->vram_lost_counter);
> file_priv->driver_priv = fpriv;
>
> out_suspend:
>
--
Lerne, wie die Welt wirklich ist,
Aber vergiss niemals, wie sie sein sollte.
More information about the amd-gfx
mailing list