[PATCH 3/3] drm/amdgpu: fix gpu reset issue

He, Hongbo Hongbo.He at amd.com
Fri Apr 21 09:00:11 UTC 2017


Hi Christian:

During GPU reset, IOCTL still  submit command to GPU.
That will result in lots of strange problems when we test this feature.

I have no better idea to handle this case. At that time, I have tried with return error directly rather than waiting GPU reset completion. 
But that doesn't work and it lead to other problems.

Thanks
Roger(Hongbo.He)

-----Original Message-----
From: Christian König [mailto:deathsimple at vodafone.de] 
Sent: Friday, April 21, 2017 4:27 PM
To: He, Hongbo; amd-gfx at lists.freedesktop.org
Cc: Zhou, David(ChunMing)
Subject: Re: [PATCH 3/3] drm/amdgpu: fix gpu reset issue

NAK, that is exactly what we wanted to avoid.

We used to have an exclusive lock for this and it cause a whole bunch of problems.

Please elaborate why that should be necessary.

Regards,
Christian.

Am 21.04.2017 um 09:08 schrieb Roger.He:
> Change-Id: Ib77d33a09f348ebf2e3a9d7861411f4b951ebf7c
> Signed-off-by: Roger.He <Hongbo.He at amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu.h        |  6 ++++++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 16 +++++++++++++++-
>   drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c    |  4 ++++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ioc32.c  |  7 ++++++-
>   4 files changed, 31 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index 71364f5..ab0ffa8 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -1526,6 +1526,7 @@ struct amdgpu_device {
>   	atomic64_t			num_bytes_moved;
>   	atomic64_t			num_evictions;
>   	atomic_t			gpu_reset_counter;
> +	atomic_t			gpu_reset_state;
>   
>   	/* data for buffer migration throttling */
>   	struct {
> @@ -1851,6 +1852,11 @@ amdgpu_get_sdma_instance(struct amdgpu_ring *ring)
>   #define amdgpu_psp_check_fw_loading_status(adev, i) 
> (adev)->firmware.funcs->check_fw_loading_status((adev), (i))
>   
>   /* Common functions */
> +static inline int amdgpu_device_is_reset(struct amdgpu_device *adev) 
> +{
> +	return atomic_read(&adev->gpu_reset_state);
> +}
> +
>   int amdgpu_gpu_reset(struct amdgpu_device *adev);
>   bool amdgpu_need_backup(struct amdgpu_device *adev);
>   void amdgpu_pci_config_reset(struct amdgpu_device *adev); diff --git 
> a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index f882496..0fb4716 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -1894,6 +1894,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
>   	mutex_init(&adev->grbm_idx_mutex);
>   	mutex_init(&adev->mn_lock);
>   	hash_init(adev->mn_hash);
> +	atomic_set(&adev->gpu_reset_state, 0);
>   
>   	amdgpu_check_arguments(adev);
>   
> @@ -2655,6 +2656,18 @@ int amdgpu_sriov_gpu_reset(struct amdgpu_device *adev, bool voluntary)
>   }
>   
>   /**
> + * amdgpu_device_set_reset_state - set gpu reset state
> + *
> + * @adev: amdgpu device pointer
> + * @state: true when start to reset gpu; false: reset done  */ static 
> +inline void amdgpu_device_set_reset_state(struct amdgpu_device *adev,
> +							bool state)
> +{
> +	atomic_set(&adev->gpu_reset_state, state); }
> +
> +/**
>    * amdgpu_gpu_reset - reset the asic
>    *
>    * @adev: amdgpu device pointer
> @@ -2678,7 +2691,7 @@ int amdgpu_gpu_reset(struct amdgpu_device *adev)
>   	}
>   
>   	atomic_inc(&adev->gpu_reset_counter);
> -
> +	amdgpu_device_set_reset_state(adev, true);
>   	/* block TTM */
>   	resched = ttm_bo_lock_delayed_workqueue(&adev->mman.bdev);
>   	/* store modesetting */
> @@ -2811,6 +2824,7 @@ int amdgpu_gpu_reset(struct amdgpu_device *adev)
>   		dev_info(adev->dev, "GPU reset failed\n");
>   	}
>   
> +	amdgpu_device_set_reset_state(adev, false);
>   	return r;
>   }
>   
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> index ead00d7..8cc14af 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> @@ -679,11 +679,15 @@ long amdgpu_drm_ioctl(struct file *filp,
>   	struct drm_file *file_priv = filp->private_data;
>   	struct drm_device *dev;
>   	long ret;
> +
>   	dev = file_priv->minor->dev;
>   	ret = pm_runtime_get_sync(dev->dev);
>   	if (ret < 0)
>   		return ret;
>   
> +	while (amdgpu_device_is_reset(dev->dev_private))
> +		msleep(100);
> +
>   	ret = drm_ioctl(filp, cmd, arg);
>   
>   	pm_runtime_mark_last_busy(dev->dev);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ioc32.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ioc32.c
> index 2648291..22b8059 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ioc32.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ioc32.c
> @@ -36,10 +36,15 @@
>   long amdgpu_kms_compat_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
>   {
>   	unsigned int nr = DRM_IOCTL_NR(cmd);
> +	struct drm_file *file_priv = filp->private_data;
> +	struct amdgpu_device *adev = file_priv->minor->dev->dev_private;
>   	int ret;
>   
> -	if (nr < DRM_COMMAND_BASE)
> +	if (nr < DRM_COMMAND_BASE) {
> +		while (amdgpu_device_is_reset(adev))
> +			msleep(100);
>   		return drm_compat_ioctl(filp, cmd, arg);
> +	}
>   
>   	ret = amdgpu_drm_ioctl(filp, cmd, arg);
>   




More information about the amd-gfx mailing list