[PATCH 2/2] drm/amdgpu: Poll of RAS errors asynchronously
Luben Tuikov
luben.tuikov at amd.com
Thu May 13 19:34:59 UTC 2021
On 2021-05-13 4:00 a.m., Christian König wrote:
> Am 12.05.21 um 19:03 schrieb Luben Tuikov:
>> When using Vega 20 with RAS support and RAS is
>> enabled, the system interactivity is extremely
>> slow, to the point of being unusable. After
>> debugging, it was determined that this is due to
>> the polling loop performed for
>> AMDGPU_CTX_OP_QUERY_STATE2 under
>> amdgpu_ctx_ioctl(), which seems to be executed on
>> every ioctl from X/Mesa.
>>
>> The latter seems to call amdgpu_ctx_query2() which
>> calls amdgpu_ras_query_error_count() twice, once
>> for each of ue_count and ce_count. This is
>> unnecessarily wasteful since
>> amdgpu_ras_query_error_count() calculates both,
>> but with the current interface it returns one or
>> the other, depending on its Boolean input, when it
>> can in fact return both, in a single invocation,
>> if it had a better interface.
>>
>> Further down, the query_ras_error_count() callback
>> is called, which could be quite a large polling
>> loop, and very time consuming. For instance,
>> gfx_v9_0_query_ras_error_count() is at least
>> O(n^3). A similar situation is seen with
>> umc_v6_1_query_ras_error_count().
>>
>> This commit implements asynchronous RAS error
>> count polling to that of the ioctl. A kernel
>> thread polls the RAS error counters once in a
>> while. The ioctl reads the values
>> asynchronously. The poll frequency is a module
>> parameter, with range [500, 10000] milliseconds,
>> default 3000.
>>
>> v2: Enable setting the polling interval to 0,
>> which disables the thread.
> Please drop the module parameter, we already have way to many module
> parameters and that doesn't add much value.
No problem.
So three seconds then?
>
> Then I would really prefer to implement this as a delayed work item instead.
>
> If you call schedule_delayed_work() from the amdgpu_ctx_query2() the
> work item will only be started every X jiffies when the function is
> actually used.
Yes, you mentioned this in our meeting and I did reply to this then:
I'd rather keep it a thread, because it shows intention, it shows that
we've committed that this is work which we need done, periodically
and that it is something we want to do--using a thread makes it
visible, known, intentional.
I don't mind using "delayed work", for "work" items which are one-off,
or something intermittent, non-regular type of work, non-periodic type of work.
Regards,
Luben
>
> Regards,
> Christian.
>
>> Cc: Alexander Deucher <Alexander.Deucher at amd.com>
>> Cc: John Clements <john.clements at amd.com>
>> Cc: Hawking Zhang <Hawking.Zhang at amd.com>
>> Signed-off-by: Luben Tuikov <luben.tuikov at amd.com>
>> Reviewed-by: John Clements <john.clements at amd.com>
>> ---
>> drivers/gpu/drm/amd/amdgpu/amdgpu.h | 1 +
>> drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c | 36 +++++++-------
>> drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 10 ++++
>> drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 66 ++++++++++++++++++++++---
>> drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 8 +--
>> 5 files changed, 93 insertions(+), 28 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> index 3147c1c935c8..a269f778194f 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> @@ -197,6 +197,7 @@ extern struct amdgpu_mgpu_info mgpu_info;
>> extern int amdgpu_ras_enable;
>> extern uint amdgpu_ras_mask;
>> extern int amdgpu_bad_page_threshold;
>> +extern uint amdgpu_ras_thread_poll_ms;
>> extern struct amdgpu_watchdog_timer amdgpu_watchdog_timer;
>> extern int amdgpu_async_gfx_ring;
>> extern int amdgpu_mcbp;
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
>> index d481a33f4eaf..558e887e99b6 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
>> @@ -332,12 +332,12 @@ static int amdgpu_ctx_query(struct amdgpu_device *adev,
>> }
>>
>> static int amdgpu_ctx_query2(struct amdgpu_device *adev,
>> - struct amdgpu_fpriv *fpriv, uint32_t id,
>> - union drm_amdgpu_ctx_out *out)
>> + struct amdgpu_fpriv *fpriv, uint32_t id,
>> + union drm_amdgpu_ctx_out *out)
>> {
>> + struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
>> struct amdgpu_ctx *ctx;
>> struct amdgpu_ctx_mgr *mgr;
>> - unsigned long ras_counter;
>>
>> if (!fpriv)
>> return -EINVAL;
>> @@ -362,20 +362,22 @@ static int amdgpu_ctx_query2(struct amdgpu_device *adev,
>> if (atomic_read(&ctx->guilty))
>> out->state.flags |= AMDGPU_CTX_QUERY2_FLAGS_GUILTY;
>>
>> - /*query ue count*/
>> - /* ras_counter = amdgpu_ras_query_error_count(adev, false); */
>> - /* /\*ras counter is monotonic increasing*\/ */
>> - /* if (ras_counter != ctx->ras_counter_ue) { */
>> - /* out->state.flags |= AMDGPU_CTX_QUERY2_FLAGS_RAS_UE; */
>> - /* ctx->ras_counter_ue = ras_counter; */
>> - /* } */
>> -
>> - /* /\*query ce count*\/ */
>> - /* ras_counter = amdgpu_ras_query_error_count(adev, true); */
>> - /* if (ras_counter != ctx->ras_counter_ce) { */
>> - /* out->state.flags |= AMDGPU_CTX_QUERY2_FLAGS_RAS_CE; */
>> - /* ctx->ras_counter_ce = ras_counter; */
>> - /* } */
>> + if (con) {
>> + int ce_count, ue_count;
>> +
>> + ce_count = atomic_read(&con->ras_ce_count);
>> + ue_count = atomic_read(&con->ras_ue_count);
>> +
>> + if (ce_count != ctx->ras_counter_ce) {
>> + out->state.flags |= AMDGPU_CTX_QUERY2_FLAGS_RAS_CE;
>> + ctx->ras_counter_ce = ce_count;
>> + }
>> +
>> + if (ue_count != ctx->ras_counter_ue) {
>> + out->state.flags |= AMDGPU_CTX_QUERY2_FLAGS_RAS_UE;
>> + ctx->ras_counter_ue = ue_count;
>> + }
>> + }
>>
>> mutex_unlock(&mgr->lock);
>> return 0;
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>> index 877469d288f8..641c374b8525 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>> @@ -183,6 +183,7 @@ struct amdgpu_mgpu_info mgpu_info = {
>> int amdgpu_ras_enable = -1;
>> uint amdgpu_ras_mask = 0xffffffff;
>> int amdgpu_bad_page_threshold = -1;
>> +uint amdgpu_ras_thread_poll_ms = 3000;
>> struct amdgpu_watchdog_timer amdgpu_watchdog_timer = {
>> .timeout_fatal_disable = false,
>> .period = 0x0, /* default to 0x0 (timeout disable) */
>> @@ -534,6 +535,15 @@ module_param_named(emu_mode, amdgpu_emu_mode, int, 0444);
>> MODULE_PARM_DESC(ras_enable, "Enable RAS features on the GPU (0 = disable, 1 = enable, -1 = auto (default))");
>> module_param_named(ras_enable, amdgpu_ras_enable, int, 0444);
>>
>> +/**
>> + * DOC: ras_thread_poll (uint)
>> + * Number of milliseconds between RAS poll for errors.
>> + * Valid range 0 and [500,10000]. Set to 0 to disable the thread.
>> + * Default: 3000.
>> + */
>> +MODULE_PARM_DESC(ras_thread_poll, "Number of milliseconds between RAS poll for errors. Valid range 0 and [500,10000]. Set to 0 to disable the thread. Default: 3000.");
>> +module_param_named(ras_thread_poll, amdgpu_ras_thread_poll_ms, uint, 0444);
>> +
>> /**
>> * DOC: ras_mask (uint)
>> * Mask of RAS features to enable (default 0xffffffff), only valid when ras_enable == 1
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
>> index b1c57a5b6e89..30bec289e9ad 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
>> @@ -1043,15 +1043,17 @@ int amdgpu_ras_error_inject(struct amdgpu_device *adev,
>> }
>>
>> /* get the total error counts on all IPs */
>> -unsigned long amdgpu_ras_query_error_count(struct amdgpu_device *adev,
>> - bool is_ce)
>> +static void amdgpu_ras_query_error_count(struct amdgpu_device *adev)
>> {
>> struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
>> struct ras_manager *obj;
>> - struct ras_err_data data = {0, 0};
>> + int ce_count, ue_count;
>>
>> if (!adev->ras_enabled || !con)
>> - return 0;
>> + return;
>> +
>> + ce_count = 0;
>> + ue_count = 0;
>>
>> list_for_each_entry(obj, &con->head, node) {
>> struct ras_query_if info = {
>> @@ -1059,13 +1061,14 @@ unsigned long amdgpu_ras_query_error_count(struct amdgpu_device *adev,
>> };
>>
>> if (amdgpu_ras_query_error_status(adev, &info))
>> - return 0;
>> + return;
>>
>> - data.ce_count += info.ce_count;
>> - data.ue_count += info.ue_count;
>> + ce_count += info.ce_count;
>> + ue_count += info.ue_count;
>> }
>>
>> - return is_ce ? data.ce_count : data.ue_count;
>> + atomic_set(&con->ras_ce_count, ce_count);
>> + atomic_set(&con->ras_ue_count, ue_count);
>> }
>> /* query/inject/cure end */
>>
>> @@ -2109,6 +2112,49 @@ static void amdgpu_ras_check_supported(struct amdgpu_device *adev)
>> adev->ras_hw_enabled & amdgpu_ras_mask;
>> }
>>
>> +static int amdgpu_ras_thread(void *data)
>> +{
>> + struct amdgpu_device *adev = data;
>> + struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
>> +
>> + con->ras_thread_poll_ms = amdgpu_ras_thread_poll_ms;
>> + if (con->ras_thread_poll_ms == 0) {
>> + atomic_set(&con->ras_ce_count, 0);
>> + atomic_set(&con->ras_ue_count, 0);
>> + return 0;
>> + } else if (con->ras_thread_poll_ms < 500 ||
>> + con->ras_thread_poll_ms > 10000) {
>> + con->ras_thread_poll_ms = 3000;
>> + }
>> +
>> + while (1) {
>> + if (kthread_should_stop())
>> + break;
>> + if (kthread_should_park())
>> + kthread_parkme();
>> +
>> + amdgpu_ras_query_error_count(adev);
>> + msleep_interruptible(con->ras_thread_poll_ms);
>> + }
>> +
>> + return 0;
>> +}
>> +
>> +static int amdgpu_ras_thread_start(struct amdgpu_device *adev)
>> +{
>> + struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
>> + struct task_struct *kt;
>> +
>> + kt = kthread_run(amdgpu_ras_thread, adev,
>> + "amdras:%s", pci_name(adev->pdev));
>> + if (IS_ERR(kt))
>> + return PTR_ERR(kt);
>> +
>> + con->ras_thread = kt;
>> +
>> + return 0;
>> +}
>> +
>> int amdgpu_ras_init(struct amdgpu_device *adev)
>> {
>> struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
>> @@ -2182,6 +2228,10 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
>> goto release_con;
>> }
>>
>> + r = amdgpu_ras_thread_start(adev);
>> + if (r)
>> + goto release_con;
>> +
>> dev_info(adev->dev, "RAS INFO: ras initialized successfully, "
>> "hardware ability[%x] ras_mask[%x]\n",
>> adev->ras_hw_enabled, adev->ras_enabled);
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
>> index 201fbdee1d09..fb9e4c7ab054 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
>> @@ -340,6 +340,11 @@ struct amdgpu_ras {
>>
>> /* disable ras error count harvest in recovery */
>> bool disable_ras_err_cnt_harvest;
>> +
>> + struct task_struct *ras_thread;
>> + unsigned int ras_thread_poll_ms;
>> + atomic_t ras_ue_count;
>> + atomic_t ras_ce_count;
>> };
>>
>> struct ras_fs_data {
>> @@ -485,9 +490,6 @@ int amdgpu_ras_request_reset_on_boot(struct amdgpu_device *adev,
>> void amdgpu_ras_resume(struct amdgpu_device *adev);
>> void amdgpu_ras_suspend(struct amdgpu_device *adev);
>>
>> -unsigned long amdgpu_ras_query_error_count(struct amdgpu_device *adev,
>> - bool is_ce);
>> -
>> /* error handling functions */
>> int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
>> struct eeprom_table_record *bps, int pages);
More information about the amd-gfx
mailing list