[PATCH 3/3] drm/amdgpu: fix gpu reset issue
Liu, Monk
Monk.Liu at amd.com
Thu May 4 10:42:18 UTC 2017
First of all, Your patch will impact on SR-IOV case, because "gpu_reset_state" will be referenced by two place:
1) IOCTL; 2)GPU-RESET ,
SRIOV use srio_gpu_reset, but SRIOV use the same IOCTL routines, that way the gpu_reset_state is not consistent for SR-IOV case
I suggest you firstly unify this gpu_reset_state for both SRIOV and bare-metal case
For sriov, we have "adev->gfx.in_reset " and you now have "adev->gpu_reset_state",
I suggest you replace all "adev->gfx.in_reset" with "adev->gpu_reset_state",
BR Monk
-----Original Message-----
From: amd-gfx [mailto:amd-gfx-bounces at lists.freedesktop.org] On Behalf Of zhoucm1
Sent: Friday, April 21, 2017 3:18 PM
To: He, Hongbo <Hongbo.He at amd.com>; amd-gfx at lists.freedesktop.org
Subject: Re: [PATCH 3/3] drm/amdgpu: fix gpu reset issue
On 2017年04月21日 15:08, Roger.He wrote:
> Change-Id: Ib77d33a09f348ebf2e3a9d7861411f4b951ebf7c
> Signed-off-by: Roger.He <Hongbo.He at amd.com>
Please add more explaination in comment, like "prevent ioctl during gpu reset, otherwise many un-expected behaviour happen."
Regards,
David Zhou
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu.h | 6 ++++++
> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 16 +++++++++++++++-
> drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 4 ++++
> drivers/gpu/drm/amd/amdgpu/amdgpu_ioc32.c | 7 ++++++-
> 4 files changed, 31 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index 71364f5..ab0ffa8 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -1526,6 +1526,7 @@ struct amdgpu_device {
> atomic64_t num_bytes_moved;
> atomic64_t num_evictions;
> atomic_t gpu_reset_counter;
> + atomic_t gpu_reset_state;
>
> /* data for buffer migration throttling */
> struct {
> @@ -1851,6 +1852,11 @@ amdgpu_get_sdma_instance(struct amdgpu_ring *ring)
> #define amdgpu_psp_check_fw_loading_status(adev, i)
> (adev)->firmware.funcs->check_fw_loading_status((adev), (i))
>
> /* Common functions */
> +static inline int amdgpu_device_is_reset(struct amdgpu_device *adev)
> +{
> + return atomic_read(&adev->gpu_reset_state);
> +}
> +
> int amdgpu_gpu_reset(struct amdgpu_device *adev);
> bool amdgpu_need_backup(struct amdgpu_device *adev);
> void amdgpu_pci_config_reset(struct amdgpu_device *adev); diff --git
> a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index f882496..0fb4716 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -1894,6 +1894,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
> mutex_init(&adev->grbm_idx_mutex);
> mutex_init(&adev->mn_lock);
> hash_init(adev->mn_hash);
> + atomic_set(&adev->gpu_reset_state, 0);
>
> amdgpu_check_arguments(adev);
>
> @@ -2655,6 +2656,18 @@ int amdgpu_sriov_gpu_reset(struct amdgpu_device *adev, bool voluntary)
> }
>
> /**
> + * amdgpu_device_set_reset_state - set gpu reset state
> + *
> + * @adev: amdgpu device pointer
> + * @state: true when start to reset gpu; false: reset done */ static
> +inline void amdgpu_device_set_reset_state(struct amdgpu_device *adev,
> + bool state)
> +{
> + atomic_set(&adev->gpu_reset_state, state); }
> +
> +/**
> * amdgpu_gpu_reset - reset the asic
> *
> * @adev: amdgpu device pointer
> @@ -2678,7 +2691,7 @@ int amdgpu_gpu_reset(struct amdgpu_device *adev)
> }
>
> atomic_inc(&adev->gpu_reset_counter);
> -
> + amdgpu_device_set_reset_state(adev, true);
> /* block TTM */
> resched = ttm_bo_lock_delayed_workqueue(&adev->mman.bdev);
> /* store modesetting */
> @@ -2811,6 +2824,7 @@ int amdgpu_gpu_reset(struct amdgpu_device *adev)
> dev_info(adev->dev, "GPU reset failed\n");
> }
>
> + amdgpu_device_set_reset_state(adev, false);
> return r;
> }
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> index ead00d7..8cc14af 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> @@ -679,11 +679,15 @@ long amdgpu_drm_ioctl(struct file *filp,
> struct drm_file *file_priv = filp->private_data;
> struct drm_device *dev;
> long ret;
> +
> dev = file_priv->minor->dev;
> ret = pm_runtime_get_sync(dev->dev);
> if (ret < 0)
> return ret;
>
> + while (amdgpu_device_is_reset(dev->dev_private))
> + msleep(100);
> +
> ret = drm_ioctl(filp, cmd, arg);
>
> pm_runtime_mark_last_busy(dev->dev);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ioc32.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ioc32.c
> index 2648291..22b8059 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ioc32.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ioc32.c
> @@ -36,10 +36,15 @@
> long amdgpu_kms_compat_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
> {
> unsigned int nr = DRM_IOCTL_NR(cmd);
> + struct drm_file *file_priv = filp->private_data;
> + struct amdgpu_device *adev = file_priv->minor->dev->dev_private;
> int ret;
>
> - if (nr < DRM_COMMAND_BASE)
> + if (nr < DRM_COMMAND_BASE) {
> + while (amdgpu_device_is_reset(adev))
> + msleep(100);
> return drm_compat_ioctl(filp, cmd, arg);
> + }
>
> ret = amdgpu_drm_ioctl(filp, cmd, arg);
>
_______________________________________________
amd-gfx mailing list
amd-gfx at lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx
More information about the amd-gfx
mailing list