[PATCH] drm/amdgpu: Support AMDGPU RAS debugfs poll interface

Tue Mar 29 07:49:09 UTC 2022

Am 29.03.22 um 09:38 schrieb yipechai:
> Some AMDGPU RAS debugfs operations like UE injection
> can cause gpu reset. Before doing the next debugfs
> operation, the application should call poll to check
> if the gpu has finished recovering.

Well, NAK. debugfs files are designed to be used from the command line
and can be opened and read/written at any time.

If we need to prevent issuing the next operation before the previous one
has finished, we need to use locks for that.

Especially if not doing so results in a hard system lockup.

Regards,
Christian.

>
> Signed-off-by: yipechai <YiPeng.Chai at amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 38 ++++++++++++++++++++++++-
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h |  6 ++++
>   2 files changed, 43 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index 4bbed76b79c8..337e3e247a45 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -452,6 +452,12 @@ static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f,
>   
>   		/* data.inject.address is offset instead of absolute gpu address */
>   		ret = amdgpu_ras_error_inject(adev, &data.inject);
> +
> +		if (!ret && (data.head.type == AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE)) {
> +			struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
> +
> +			con->ras_ue_injected = 1;
> +		}
>   		break;
>   	default:
>   		ret = -EINVAL;
> @@ -464,6 +470,30 @@ static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f,
>   	return size;
>   }
>   
> +/**
> + * DOC: Support AMDGPU RAS debugfs poll interface
> + *
> + * Some AMDGPU RAS debugfs operations like UE injection
> + * can cause gpu reset. Before doing the next debugfs
> + * operation, the application should call poll to check
> + * if gpu is in recovering status.
> + */
> +static __poll_t amdgpu_ras_debugfs_ctrl_poll(struct file *f, struct poll_table_struct *wait)
> +{
> +	struct amdgpu_device *adev = (struct amdgpu_device *)file_inode(f)->i_private;
> +	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
> +	__poll_t mask = 0;
> +
> +	/* For UE injection, wait for gpu to finish recovery */
> +	if (con->ras_ue_injected)
> +		poll_wait(f, &con->gpu_ready_wait_wq, wait);
> +
> +	if (!atomic_read(&con->in_recovery))
> +		mask = EPOLLIN | EPOLLRDNORM;
> +
> +	return mask;
> +}
> +
>   /**
>    * DOC: AMDGPU RAS debugfs EEPROM table reset interface
>    *
> @@ -503,6 +533,7 @@ static ssize_t amdgpu_ras_debugfs_eeprom_write(struct file *f,
>   
>   static const struct file_operations amdgpu_ras_debugfs_ctrl_ops = {
>   	.owner = THIS_MODULE,
> +	.poll = amdgpu_ras_debugfs_ctrl_poll,
>   	.read = NULL,
>   	.write = amdgpu_ras_debugfs_ctrl_write,
>   	.llseek = default_llseek
> @@ -1837,6 +1868,11 @@ static void amdgpu_ras_do_recovery(struct work_struct *work)
>   	if (amdgpu_device_should_recover_gpu(ras->adev))
>   		amdgpu_device_gpu_recover(ras->adev, NULL);
>   	atomic_set(&ras->in_recovery, 0);
> +
> +	if (ras->ras_ue_injected) {
> +		ras->ras_ue_injected = 0;
> +		wake_up_all(&ras->gpu_ready_wait_wq);
> +	}
>   }
>   
>   /* alloc/realloc bps array */
> @@ -2279,7 +2315,7 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
>   	INIT_DELAYED_WORK(&con->ras_counte_delay_work, amdgpu_ras_counte_dw);
>   	atomic_set(&con->ras_ce_count, 0);
>   	atomic_set(&con->ras_ue_count, 0);
> -
> +	init_waitqueue_head(&con->gpu_ready_wait_wq);
>   	con->objs = (struct ras_manager *)(con + 1);
>   
>   	amdgpu_ras_set_context(adev, con);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> index 606df8869b89..aea6bbb71501 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> @@ -379,6 +379,12 @@ struct amdgpu_ras {
>   
>   	/* Indicates smu whether need update bad channel info */
>   	bool update_channel_flag;
> +
> +	/* UE injection flag */
> +	uint32_t  ras_ue_injected;
> +
> +	/* Waiting for gpu ready work queue */
> +	wait_queue_head_t gpu_ready_wait_wq;
>   };
>   
>   struct ras_fs_data {