[PATCH 2/2] drm/amdgpu: Poll of RAS errors asynchronously

Thu May 13 19:34:59 UTC 2021

On 2021-05-13 4:00 a.m., Christian König wrote:
> Am 12.05.21 um 19:03 schrieb Luben Tuikov:
>> When using Vega 20 with RAS support and RAS is
>> enabled, the system interactivity is extremely
>> slow, to the point of being unusable. After
>> debugging, it was determined that this is due to
>> the polling loop performed for
>> AMDGPU_CTX_OP_QUERY_STATE2 under
>> amdgpu_ctx_ioctl(), which seems to be executed on
>> every ioctl from X/Mesa.
>>
>> The latter seems to call amdgpu_ctx_query2() which
>> calls amdgpu_ras_query_error_count() twice, once
>> for each of ue_count and ce_count. This is
>> unnecessarily wasteful since
>> amdgpu_ras_query_error_count() calculates both,
>> but with the current interface it returns one or
>> the other, depending on its Boolean input, when it
>> can in fact return both, in a single invocation,
>> if it had a better interface.
>>
>> Further down, the query_ras_error_count() callback
>> is called, which could be quite a large polling
>> loop, and very time consuming. For instance,
>> gfx_v9_0_query_ras_error_count() is at least
>> O(n^3). A similar situation is seen with
>> umc_v6_1_query_ras_error_count().
>>
>> This commit implements asynchronous RAS error
>> count polling to that of the ioctl. A kernel
>> thread polls the RAS error counters once in a
>> while. The ioctl reads the values
>> asynchronously. The poll frequency is a module
>> parameter, with range [500, 10000] milliseconds,
>> default 3000.
>>
>> v2: Enable setting the polling interval to 0,
>>      which disables the thread.
> Please drop the module parameter, we already have way to many module 
> parameters and that doesn't add much value.

No problem.

So three seconds then?

>
> Then I would really prefer to implement this as a delayed work item instead.
>
> If you call schedule_delayed_work() from the amdgpu_ctx_query2() the 
> work item will only be started every X jiffies when the function is 
> actually used.

Yes, you mentioned this in our meeting and I did reply to this then:

I'd rather keep it a thread, because it shows intention, it shows that
we've committed that this is work which we need done, periodically
and that it is something we want to do--using a thread makes it
visible, known, intentional.

I don't mind using "delayed work", for "work" items which are one-off,
or something intermittent, non-regular type of work, non-periodic type of work.

Regards,
Luben

>
> Regards,
> Christian.
>
>> Cc: Alexander Deucher <Alexander.Deucher at amd.com>
>> Cc: John Clements <john.clements at amd.com>
>> Cc: Hawking Zhang <Hawking.Zhang at amd.com>
>> Signed-off-by: Luben Tuikov <luben.tuikov at amd.com>
>> Reviewed-by: John Clements <john.clements at amd.com>
>> ---
>>   drivers/gpu/drm/amd/amdgpu/amdgpu.h     |  1 +
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c | 36 +++++++-------
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 10 ++++
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 66 ++++++++++++++++++++++---
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h |  8 +--
>>   5 files changed, 93 insertions(+), 28 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> index 3147c1c935c8..a269f778194f 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> @@ -197,6 +197,7 @@ extern struct amdgpu_mgpu_info mgpu_info;
>>   extern int amdgpu_ras_enable;
>>   extern uint amdgpu_ras_mask;
>>   extern int amdgpu_bad_page_threshold;
>> +extern uint amdgpu_ras_thread_poll_ms;
>>   extern struct amdgpu_watchdog_timer amdgpu_watchdog_timer;
>>   extern int amdgpu_async_gfx_ring;
>>   extern int amdgpu_mcbp;
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
>> index d481a33f4eaf..558e887e99b6 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
>> @@ -332,12 +332,12 @@ static int amdgpu_ctx_query(struct amdgpu_device *adev,
>>   }
>>   
>>   static int amdgpu_ctx_query2(struct amdgpu_device *adev,
>> -	struct amdgpu_fpriv *fpriv, uint32_t id,
>> -	union drm_amdgpu_ctx_out *out)
>> +			     struct amdgpu_fpriv *fpriv, uint32_t id,
>> +			     union drm_amdgpu_ctx_out *out)
>>   {
>> +	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
>>   	struct amdgpu_ctx *ctx;
>>   	struct amdgpu_ctx_mgr *mgr;
>> -	unsigned long ras_counter;
>>   
>>   	if (!fpriv)
>>   		return -EINVAL;
>> @@ -362,20 +362,22 @@ static int amdgpu_ctx_query2(struct amdgpu_device *adev,
>>   	if (atomic_read(&ctx->guilty))
>>   		out->state.flags |= AMDGPU_CTX_QUERY2_FLAGS_GUILTY;
>>   
>> -	/*query ue count*/
>> -	/* ras_counter = amdgpu_ras_query_error_count(adev, false); */
>> -	/* /\*ras counter is monotonic increasing*\/ */
>> -	/* if (ras_counter != ctx->ras_counter_ue) { */
>> -	/* 	out->state.flags |= AMDGPU_CTX_QUERY2_FLAGS_RAS_UE; */
>> -	/* 	ctx->ras_counter_ue = ras_counter; */
>> -	/* } */
>> -
>> -	/* /\*query ce count*\/ */
>> -	/* ras_counter = amdgpu_ras_query_error_count(adev, true); */
>> -	/* if (ras_counter != ctx->ras_counter_ce) { */
>> -	/* 	out->state.flags |= AMDGPU_CTX_QUERY2_FLAGS_RAS_CE; */
>> -	/* 	ctx->ras_counter_ce = ras_counter; */
>> -	/* } */
>> +	if (con) {
>> +		int ce_count, ue_count;
>> +
>> +		ce_count = atomic_read(&con->ras_ce_count);
>> +		ue_count = atomic_read(&con->ras_ue_count);
>> +
>> +		if (ce_count != ctx->ras_counter_ce) {
>> +			out->state.flags |= AMDGPU_CTX_QUERY2_FLAGS_RAS_CE;
>> +			ctx->ras_counter_ce = ce_count;
>> +		}
>> +
>> +		if (ue_count != ctx->ras_counter_ue) {
>> +			out->state.flags |= AMDGPU_CTX_QUERY2_FLAGS_RAS_UE;
>> +			ctx->ras_counter_ue = ue_count;
>> +		}
>> +	}
>>   
>>   	mutex_unlock(&mgr->lock);
>>   	return 0;
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>> index 877469d288f8..641c374b8525 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>> @@ -183,6 +183,7 @@ struct amdgpu_mgpu_info mgpu_info = {
>>   int amdgpu_ras_enable = -1;
>>   uint amdgpu_ras_mask = 0xffffffff;
>>   int amdgpu_bad_page_threshold = -1;
>> +uint amdgpu_ras_thread_poll_ms = 3000;
>>   struct amdgpu_watchdog_timer amdgpu_watchdog_timer = {
>>   	.timeout_fatal_disable = false,
>>   	.period = 0x0, /* default to 0x0 (timeout disable) */
>> @@ -534,6 +535,15 @@ module_param_named(emu_mode, amdgpu_emu_mode, int, 0444);
>>   MODULE_PARM_DESC(ras_enable, "Enable RAS features on the GPU (0 = disable, 1 = enable, -1 = auto (default))");
>>   module_param_named(ras_enable, amdgpu_ras_enable, int, 0444);
>>   
>> +/**
>> + * DOC: ras_thread_poll (uint)
>> + * Number of milliseconds between RAS poll for errors.
>> + * Valid range 0 and [500,10000]. Set to 0 to disable the thread.
>> + * Default: 3000.
>> + */
>> +MODULE_PARM_DESC(ras_thread_poll, "Number of milliseconds between RAS poll for errors. Valid range 0 and [500,10000]. Set to 0 to disable the thread. Default: 3000.");
>> +module_param_named(ras_thread_poll, amdgpu_ras_thread_poll_ms, uint, 0444);
>> +
>>   /**
>>    * DOC: ras_mask (uint)
>>    * Mask of RAS features to enable (default 0xffffffff), only valid when ras_enable == 1
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
>> index b1c57a5b6e89..30bec289e9ad 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
>> @@ -1043,15 +1043,17 @@ int amdgpu_ras_error_inject(struct amdgpu_device *adev,
>>   }
>>   
>>   /* get the total error counts on all IPs */
>> -unsigned long amdgpu_ras_query_error_count(struct amdgpu_device *adev,
>> -		bool is_ce)
>> +static void amdgpu_ras_query_error_count(struct amdgpu_device *adev)
>>   {
>>   	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
>>   	struct ras_manager *obj;
>> -	struct ras_err_data data = {0, 0};
>> +	int ce_count, ue_count;
>>   
>>   	if (!adev->ras_enabled || !con)
>> -		return 0;
>> +		return;
>> +
>> +	ce_count = 0;
>> +	ue_count = 0;
>>   
>>   	list_for_each_entry(obj, &con->head, node) {
>>   		struct ras_query_if info = {
>> @@ -1059,13 +1061,14 @@ unsigned long amdgpu_ras_query_error_count(struct amdgpu_device *adev,
>>   		};
>>   
>>   		if (amdgpu_ras_query_error_status(adev, &info))
>> -			return 0;
>> +			return;
>>   
>> -		data.ce_count += info.ce_count;
>> -		data.ue_count += info.ue_count;
>> +		ce_count += info.ce_count;
>> +		ue_count += info.ue_count;
>>   	}
>>   
>> -	return is_ce ? data.ce_count : data.ue_count;
>> +	atomic_set(&con->ras_ce_count, ce_count);
>> +	atomic_set(&con->ras_ue_count, ue_count);
>>   }
>>   /* query/inject/cure end */
>>   
>> @@ -2109,6 +2112,49 @@ static void amdgpu_ras_check_supported(struct amdgpu_device *adev)
>>   		adev->ras_hw_enabled & amdgpu_ras_mask;
>>   }
>>   
>> +static int amdgpu_ras_thread(void *data)
>> +{
>> +	struct amdgpu_device *adev = data;
>> +	struct amdgpu_ras    *con  = amdgpu_ras_get_context(adev);
>> +
>> +	con->ras_thread_poll_ms = amdgpu_ras_thread_poll_ms;
>> +	if (con->ras_thread_poll_ms == 0) {
>> +		atomic_set(&con->ras_ce_count, 0);
>> +		atomic_set(&con->ras_ue_count, 0);
>> +		return 0;
>> +	} else if (con->ras_thread_poll_ms < 500 ||
>> +		   con->ras_thread_poll_ms > 10000) {
>> +		con->ras_thread_poll_ms = 3000;
>> +	}
>> +
>> +	while (1) {
>> +		if (kthread_should_stop())
>> +			break;
>> +		if (kthread_should_park())
>> +			kthread_parkme();
>> +
>> +		amdgpu_ras_query_error_count(adev);
>> +		msleep_interruptible(con->ras_thread_poll_ms);
>> +	}
>> +
>> +	return 0;
>> +}
>> +
>> +static int amdgpu_ras_thread_start(struct amdgpu_device *adev)
>> +{
>> +	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
>> +	struct task_struct *kt;
>> +
>> +	kt = kthread_run(amdgpu_ras_thread, adev,
>> +			 "amdras:%s", pci_name(adev->pdev));
>> +	if (IS_ERR(kt))
>> +		return PTR_ERR(kt);
>> +
>> +	con->ras_thread = kt;
>> +
>> +	return 0;
>> +}
>> +
>>   int amdgpu_ras_init(struct amdgpu_device *adev)
>>   {
>>   	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
>> @@ -2182,6 +2228,10 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
>>   		goto release_con;
>>   	}
>>   
>> +	r = amdgpu_ras_thread_start(adev);
>> +	if (r)
>> +		goto release_con;
>> +
>>   	dev_info(adev->dev, "RAS INFO: ras initialized successfully, "
>>   		 "hardware ability[%x] ras_mask[%x]\n",
>>   		 adev->ras_hw_enabled, adev->ras_enabled);
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
>> index 201fbdee1d09..fb9e4c7ab054 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
>> @@ -340,6 +340,11 @@ struct amdgpu_ras {
>>   
>>   	/* disable ras error count harvest in recovery */
>>   	bool disable_ras_err_cnt_harvest;
>> +
>> +	struct task_struct *ras_thread;
>> +	unsigned int        ras_thread_poll_ms;
>> +	atomic_t            ras_ue_count;
>> +	atomic_t            ras_ce_count;
>>   };
>>   
>>   struct ras_fs_data {
>> @@ -485,9 +490,6 @@ int amdgpu_ras_request_reset_on_boot(struct amdgpu_device *adev,
>>   void amdgpu_ras_resume(struct amdgpu_device *adev);
>>   void amdgpu_ras_suspend(struct amdgpu_device *adev);
>>   
>> -unsigned long amdgpu_ras_query_error_count(struct amdgpu_device *adev,
>> -		bool is_ce);
>> -
>>   /* error handling functions */
>>   int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
>>   		struct eeprom_table_record *bps, int pages);