[PATCH 3/3] drm/amdgpu: Use delayed work to collect RAS error counters

Tue May 25 23:56:48 UTC 2021

On 2021-05-25 6:03 p.m., Alex Deucher wrote:
> On Fri, May 21, 2021 at 5:19 PM Luben Tuikov <luben.tuikov at amd.com> wrote:
>> On Context Query2 IOCTL return the correctable and
>> uncorrectable errors in O(1) fashion, from cached
>> values, and schedule a delayed work function to
>> calculate and cache them for the next such IOCTL.
> Patches 1, 2, are:
> Reviewed-by: Alex Deucher <alexander.deucher at amd.com>
>
> For patch 3, I think we need to cancel any outstanding delayed work in
> ras_fini().  Other than that, it looks good to me.
Ah, yes, good point--I missed that. I'll add it and resubmit.

Regards,
Luben

>
> Alex
>
>> Cc: Alexander Deucher <Alexander.Deucher at amd.com>
>> Cc: Christian König <christian.koenig at amd.com>
>> Cc: John Clements <john.clements at amd.com>
>> Cc: Hawking Zhang <Hawking.Zhang at amd.com>
>> Signed-off-by: Luben Tuikov <luben.tuikov at amd.com>
>> ---
>>  drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c | 32 +++++++++++++++++++--
>>  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 38 +++++++++++++++++++++++++
>>  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h |  5 ++++
>>  3 files changed, 73 insertions(+), 2 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
>> index bb0cfe871aba..4e95d255960b 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
>> @@ -331,10 +331,13 @@ static int amdgpu_ctx_query(struct amdgpu_device *adev,
>>         return 0;
>>  }
>>
>> +#define AMDGPU_RAS_COUNTE_DELAY_MS 3000
>> +
>>  static int amdgpu_ctx_query2(struct amdgpu_device *adev,
>> -       struct amdgpu_fpriv *fpriv, uint32_t id,
>> -       union drm_amdgpu_ctx_out *out)
>> +                            struct amdgpu_fpriv *fpriv, uint32_t id,
>> +                            union drm_amdgpu_ctx_out *out)
>>  {
>> +       struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
>>         struct amdgpu_ctx *ctx;
>>         struct amdgpu_ctx_mgr *mgr;
>>
>> @@ -361,6 +364,31 @@ static int amdgpu_ctx_query2(struct amdgpu_device *adev,
>>         if (atomic_read(&ctx->guilty))
>>                 out->state.flags |= AMDGPU_CTX_QUERY2_FLAGS_GUILTY;
>>
>> +       if (adev->ras_enabled && con) {
>> +               /* Return the cached values in O(1),
>> +                * and schedule delayed work to cache
>> +                * new values.
>> +                */
>> +               int ce_count, ue_count;
>> +
>> +               ce_count = atomic_read(&con->ras_ce_count);
>> +               ue_count = atomic_read(&con->ras_ue_count);
>> +
>> +               if (ce_count != ctx->ras_counter_ce) {
>> +                       ctx->ras_counter_ce = ce_count;
>> +                       out->state.flags |= AMDGPU_CTX_QUERY2_FLAGS_RAS_CE;
>> +               }
>> +
>> +               if (ue_count != ctx->ras_counter_ue) {
>> +                       ctx->ras_counter_ue = ue_count;
>> +                       out->state.flags |= AMDGPU_CTX_QUERY2_FLAGS_RAS_UE;
>> +               }
>> +
>> +               if (!delayed_work_pending(&con->ras_counte_delay_work))
>> +                       schedule_delayed_work(&con->ras_counte_delay_work,
>> +                                 msecs_to_jiffies(AMDGPU_RAS_COUNTE_DELAY_MS));
>> +       }
>> +
>>         mutex_unlock(&mgr->lock);
>>         return 0;
>>  }
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
>> index ed3c43e8b0b5..80f576098318 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
>> @@ -27,6 +27,7 @@
>>  #include <linux/uaccess.h>
>>  #include <linux/reboot.h>
>>  #include <linux/syscalls.h>
>> +#include <linux/pm_runtime.h>
>>
>>  #include "amdgpu.h"
>>  #include "amdgpu_ras.h"
>> @@ -2116,6 +2117,30 @@ static void amdgpu_ras_check_supported(struct amdgpu_device *adev)
>>                 adev->ras_hw_enabled & amdgpu_ras_mask;
>>  }
>>
>> +static void amdgpu_ras_counte_dw(struct work_struct *work)
>> +{
>> +       struct amdgpu_ras *con = container_of(work, struct amdgpu_ras,
>> +                                             ras_counte_delay_work.work);
>> +       struct amdgpu_device *adev = con->adev;
>> +       struct drm_device *dev = &adev->ddev;
>> +       unsigned long ce_count, ue_count;
>> +       int res;
>> +
>> +       res = pm_runtime_get_sync(dev->dev);
>> +       if (res < 0)
>> +               goto Out;
>> +
>> +       /* Cache new values.
>> +        */
>> +       amdgpu_ras_query_error_count(adev, &ce_count, &ue_count);
>> +       atomic_set(&con->ras_ce_count, ce_count);
>> +       atomic_set(&con->ras_ue_count, ue_count);
>> +
>> +       pm_runtime_mark_last_busy(dev->dev);
>> +Out:
>> +       pm_runtime_put_autosuspend(dev->dev);
>> +}
>> +
>>  int amdgpu_ras_init(struct amdgpu_device *adev)
>>  {
>>         struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
>> @@ -2130,6 +2155,11 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
>>         if (!con)
>>                 return -ENOMEM;
>>
>> +       con->adev = adev;
>> +       INIT_DELAYED_WORK(&con->ras_counte_delay_work, amdgpu_ras_counte_dw);
>> +       atomic_set(&con->ras_ce_count, 0);
>> +       atomic_set(&con->ras_ue_count, 0);
>> +
>>         con->objs = (struct ras_manager *)(con + 1);
>>
>>         amdgpu_ras_set_context(adev, con);
>> @@ -2233,6 +2263,8 @@ int amdgpu_ras_late_init(struct amdgpu_device *adev,
>>                          struct ras_fs_if *fs_info,
>>                          struct ras_ih_if *ih_info)
>>  {
>> +       struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
>> +       unsigned long ue_count, ce_count;
>>         int r;
>>
>>         /* disable RAS feature per IP block if it is not supported */
>> @@ -2273,6 +2305,12 @@ int amdgpu_ras_late_init(struct amdgpu_device *adev,
>>         if (r)
>>                 goto sysfs;
>>
>> +       /* Those are the cached values at init.
>> +        */
>> +       amdgpu_ras_query_error_count(adev, &ce_count, &ue_count);
>> +       atomic_set(&con->ras_ce_count, ce_count);
>> +       atomic_set(&con->ras_ue_count, ue_count);
>> +
>>         return 0;
>>  cleanup:
>>         amdgpu_ras_sysfs_remove(adev, ras_block);
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
>> index 10fca0393106..256cea5d34f2 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
>> @@ -340,6 +340,11 @@ struct amdgpu_ras {
>>
>>         /* disable ras error count harvest in recovery */
>>         bool disable_ras_err_cnt_harvest;
>> +
>> +       /* RAS count errors delayed work */
>> +       struct delayed_work ras_counte_delay_work;
>> +       atomic_t ras_ue_count;
>> +       atomic_t ras_ce_count;
>>  };
>>
>>  struct ras_fs_data {
>> --
>> 2.31.1.527.g2d677e5b15
>>
>> _______________________________________________
>> amd-gfx mailing list
>> amd-gfx at lists.freedesktop.org
>> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flists.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfx&data=04%7C01%7Cluben.tuikov%40amd.com%7C16860b04040649fe81d208d91fc8fb00%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637575770340862619%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C1000&sdata=4KIyoPOrvCCC9ljQQhJlxKPhjONiFK%2FEAHNXEc30BtQ%3D&reserved=0