[PATCH 2/2] drm/amdgpu: Poll of RAS errors asynchronously

Christian König ckoenig.leichtzumerken at gmail.com
Thu May 13 08:00:33 UTC 2021


Am 12.05.21 um 19:03 schrieb Luben Tuikov:
> When using Vega 20 with RAS support and RAS is
> enabled, the system interactivity is extremely
> slow, to the point of being unusable. After
> debugging, it was determined that this is due to
> the polling loop performed for
> AMDGPU_CTX_OP_QUERY_STATE2 under
> amdgpu_ctx_ioctl(), which seems to be executed on
> every ioctl from X/Mesa.
>
> The latter seems to call amdgpu_ctx_query2() which
> calls amdgpu_ras_query_error_count() twice, once
> for each of ue_count and ce_count. This is
> unnecessarily wasteful since
> amdgpu_ras_query_error_count() calculates both,
> but with the current interface it returns one or
> the other, depending on its Boolean input, when it
> can in fact return both, in a single invocation,
> if it had a better interface.
>
> Further down, the query_ras_error_count() callback
> is called, which could be quite a large polling
> loop, and very time consuming. For instance,
> gfx_v9_0_query_ras_error_count() is at least
> O(n^3). A similar situation is seen with
> umc_v6_1_query_ras_error_count().
>
> This commit implements asynchronous RAS error
> count polling to that of the ioctl. A kernel
> thread polls the RAS error counters once in a
> while. The ioctl reads the values
> asynchronously. The poll frequency is a module
> parameter, with range [500, 10000] milliseconds,
> default 3000.
>
> v2: Enable setting the polling interval to 0,
>      which disables the thread.

Please drop the module parameter, we already have way to many module 
parameters and that doesn't add much value.

Then I would really prefer to implement this as a delayed work item instead.

If you call schedule_delayed_work() from the amdgpu_ctx_query2() the 
work item will only be started every X jiffies when the function is 
actually used.

Regards,
Christian.

>
> Cc: Alexander Deucher <Alexander.Deucher at amd.com>
> Cc: John Clements <john.clements at amd.com>
> Cc: Hawking Zhang <Hawking.Zhang at amd.com>
> Signed-off-by: Luben Tuikov <luben.tuikov at amd.com>
> Reviewed-by: John Clements <john.clements at amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu.h     |  1 +
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c | 36 +++++++-------
>   drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 10 ++++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 66 ++++++++++++++++++++++---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h |  8 +--
>   5 files changed, 93 insertions(+), 28 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index 3147c1c935c8..a269f778194f 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -197,6 +197,7 @@ extern struct amdgpu_mgpu_info mgpu_info;
>   extern int amdgpu_ras_enable;
>   extern uint amdgpu_ras_mask;
>   extern int amdgpu_bad_page_threshold;
> +extern uint amdgpu_ras_thread_poll_ms;
>   extern struct amdgpu_watchdog_timer amdgpu_watchdog_timer;
>   extern int amdgpu_async_gfx_ring;
>   extern int amdgpu_mcbp;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
> index d481a33f4eaf..558e887e99b6 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
> @@ -332,12 +332,12 @@ static int amdgpu_ctx_query(struct amdgpu_device *adev,
>   }
>   
>   static int amdgpu_ctx_query2(struct amdgpu_device *adev,
> -	struct amdgpu_fpriv *fpriv, uint32_t id,
> -	union drm_amdgpu_ctx_out *out)
> +			     struct amdgpu_fpriv *fpriv, uint32_t id,
> +			     union drm_amdgpu_ctx_out *out)
>   {
> +	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
>   	struct amdgpu_ctx *ctx;
>   	struct amdgpu_ctx_mgr *mgr;
> -	unsigned long ras_counter;
>   
>   	if (!fpriv)
>   		return -EINVAL;
> @@ -362,20 +362,22 @@ static int amdgpu_ctx_query2(struct amdgpu_device *adev,
>   	if (atomic_read(&ctx->guilty))
>   		out->state.flags |= AMDGPU_CTX_QUERY2_FLAGS_GUILTY;
>   
> -	/*query ue count*/
> -	/* ras_counter = amdgpu_ras_query_error_count(adev, false); */
> -	/* /\*ras counter is monotonic increasing*\/ */
> -	/* if (ras_counter != ctx->ras_counter_ue) { */
> -	/* 	out->state.flags |= AMDGPU_CTX_QUERY2_FLAGS_RAS_UE; */
> -	/* 	ctx->ras_counter_ue = ras_counter; */
> -	/* } */
> -
> -	/* /\*query ce count*\/ */
> -	/* ras_counter = amdgpu_ras_query_error_count(adev, true); */
> -	/* if (ras_counter != ctx->ras_counter_ce) { */
> -	/* 	out->state.flags |= AMDGPU_CTX_QUERY2_FLAGS_RAS_CE; */
> -	/* 	ctx->ras_counter_ce = ras_counter; */
> -	/* } */
> +	if (con) {
> +		int ce_count, ue_count;
> +
> +		ce_count = atomic_read(&con->ras_ce_count);
> +		ue_count = atomic_read(&con->ras_ue_count);
> +
> +		if (ce_count != ctx->ras_counter_ce) {
> +			out->state.flags |= AMDGPU_CTX_QUERY2_FLAGS_RAS_CE;
> +			ctx->ras_counter_ce = ce_count;
> +		}
> +
> +		if (ue_count != ctx->ras_counter_ue) {
> +			out->state.flags |= AMDGPU_CTX_QUERY2_FLAGS_RAS_UE;
> +			ctx->ras_counter_ue = ue_count;
> +		}
> +	}
>   
>   	mutex_unlock(&mgr->lock);
>   	return 0;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> index 877469d288f8..641c374b8525 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> @@ -183,6 +183,7 @@ struct amdgpu_mgpu_info mgpu_info = {
>   int amdgpu_ras_enable = -1;
>   uint amdgpu_ras_mask = 0xffffffff;
>   int amdgpu_bad_page_threshold = -1;
> +uint amdgpu_ras_thread_poll_ms = 3000;
>   struct amdgpu_watchdog_timer amdgpu_watchdog_timer = {
>   	.timeout_fatal_disable = false,
>   	.period = 0x0, /* default to 0x0 (timeout disable) */
> @@ -534,6 +535,15 @@ module_param_named(emu_mode, amdgpu_emu_mode, int, 0444);
>   MODULE_PARM_DESC(ras_enable, "Enable RAS features on the GPU (0 = disable, 1 = enable, -1 = auto (default))");
>   module_param_named(ras_enable, amdgpu_ras_enable, int, 0444);
>   
> +/**
> + * DOC: ras_thread_poll (uint)
> + * Number of milliseconds between RAS poll for errors.
> + * Valid range 0 and [500,10000]. Set to 0 to disable the thread.
> + * Default: 3000.
> + */
> +MODULE_PARM_DESC(ras_thread_poll, "Number of milliseconds between RAS poll for errors. Valid range 0 and [500,10000]. Set to 0 to disable the thread. Default: 3000.");
> +module_param_named(ras_thread_poll, amdgpu_ras_thread_poll_ms, uint, 0444);
> +
>   /**
>    * DOC: ras_mask (uint)
>    * Mask of RAS features to enable (default 0xffffffff), only valid when ras_enable == 1
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index b1c57a5b6e89..30bec289e9ad 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -1043,15 +1043,17 @@ int amdgpu_ras_error_inject(struct amdgpu_device *adev,
>   }
>   
>   /* get the total error counts on all IPs */
> -unsigned long amdgpu_ras_query_error_count(struct amdgpu_device *adev,
> -		bool is_ce)
> +static void amdgpu_ras_query_error_count(struct amdgpu_device *adev)
>   {
>   	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
>   	struct ras_manager *obj;
> -	struct ras_err_data data = {0, 0};
> +	int ce_count, ue_count;
>   
>   	if (!adev->ras_enabled || !con)
> -		return 0;
> +		return;
> +
> +	ce_count = 0;
> +	ue_count = 0;
>   
>   	list_for_each_entry(obj, &con->head, node) {
>   		struct ras_query_if info = {
> @@ -1059,13 +1061,14 @@ unsigned long amdgpu_ras_query_error_count(struct amdgpu_device *adev,
>   		};
>   
>   		if (amdgpu_ras_query_error_status(adev, &info))
> -			return 0;
> +			return;
>   
> -		data.ce_count += info.ce_count;
> -		data.ue_count += info.ue_count;
> +		ce_count += info.ce_count;
> +		ue_count += info.ue_count;
>   	}
>   
> -	return is_ce ? data.ce_count : data.ue_count;
> +	atomic_set(&con->ras_ce_count, ce_count);
> +	atomic_set(&con->ras_ue_count, ue_count);
>   }
>   /* query/inject/cure end */
>   
> @@ -2109,6 +2112,49 @@ static void amdgpu_ras_check_supported(struct amdgpu_device *adev)
>   		adev->ras_hw_enabled & amdgpu_ras_mask;
>   }
>   
> +static int amdgpu_ras_thread(void *data)
> +{
> +	struct amdgpu_device *adev = data;
> +	struct amdgpu_ras    *con  = amdgpu_ras_get_context(adev);
> +
> +	con->ras_thread_poll_ms = amdgpu_ras_thread_poll_ms;
> +	if (con->ras_thread_poll_ms == 0) {
> +		atomic_set(&con->ras_ce_count, 0);
> +		atomic_set(&con->ras_ue_count, 0);
> +		return 0;
> +	} else if (con->ras_thread_poll_ms < 500 ||
> +		   con->ras_thread_poll_ms > 10000) {
> +		con->ras_thread_poll_ms = 3000;
> +	}
> +
> +	while (1) {
> +		if (kthread_should_stop())
> +			break;
> +		if (kthread_should_park())
> +			kthread_parkme();
> +
> +		amdgpu_ras_query_error_count(adev);
> +		msleep_interruptible(con->ras_thread_poll_ms);
> +	}
> +
> +	return 0;
> +}
> +
> +static int amdgpu_ras_thread_start(struct amdgpu_device *adev)
> +{
> +	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
> +	struct task_struct *kt;
> +
> +	kt = kthread_run(amdgpu_ras_thread, adev,
> +			 "amdras:%s", pci_name(adev->pdev));
> +	if (IS_ERR(kt))
> +		return PTR_ERR(kt);
> +
> +	con->ras_thread = kt;
> +
> +	return 0;
> +}
> +
>   int amdgpu_ras_init(struct amdgpu_device *adev)
>   {
>   	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
> @@ -2182,6 +2228,10 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
>   		goto release_con;
>   	}
>   
> +	r = amdgpu_ras_thread_start(adev);
> +	if (r)
> +		goto release_con;
> +
>   	dev_info(adev->dev, "RAS INFO: ras initialized successfully, "
>   		 "hardware ability[%x] ras_mask[%x]\n",
>   		 adev->ras_hw_enabled, adev->ras_enabled);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> index 201fbdee1d09..fb9e4c7ab054 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> @@ -340,6 +340,11 @@ struct amdgpu_ras {
>   
>   	/* disable ras error count harvest in recovery */
>   	bool disable_ras_err_cnt_harvest;
> +
> +	struct task_struct *ras_thread;
> +	unsigned int        ras_thread_poll_ms;
> +	atomic_t            ras_ue_count;
> +	atomic_t            ras_ce_count;
>   };
>   
>   struct ras_fs_data {
> @@ -485,9 +490,6 @@ int amdgpu_ras_request_reset_on_boot(struct amdgpu_device *adev,
>   void amdgpu_ras_resume(struct amdgpu_device *adev);
>   void amdgpu_ras_suspend(struct amdgpu_device *adev);
>   
> -unsigned long amdgpu_ras_query_error_count(struct amdgpu_device *adev,
> -		bool is_ce);
> -
>   /* error handling functions */
>   int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
>   		struct eeprom_table_record *bps, int pages);



More information about the amd-gfx mailing list