[Intel-xe] [PATCH V9 1/1] drm/xe: Introduce and update counter for low level driver errors
Michal Wajdeczko
michal.wajdeczko at intel.com
Fri Oct 27 13:38:49 UTC 2023
On 27.10.2023 15:02, Upadhyay, Tejas wrote:
>
>
>> -----Original Message-----
>> From: Wajdeczko, Michal <Michal.Wajdeczko at intel.com>
>> Sent: Thursday, October 26, 2023 3:02 AM
>> To: Upadhyay, Tejas <tejas.upadhyay at intel.com>; intel-
>> xe at lists.freedesktop.org
>> Subject: Re: [Intel-xe] [PATCH V9 1/1] drm/xe: Introduce and update counter
>> for low level driver errors
>>
>>
>>
>> On 20.10.2023 12:03, Tejas Upadhyay wrote:
>>> we added a low level driver error counter and incrementing on
>>
>> nit: "added" - stale comment ?
>>
>>> each occurrence. Focus is on errors that are not functionally
>>> affecting the system and might otherwise go unnoticed and cause
>>> power/performance regressions, so checking for the error counters
>>> should help.
>>>
>>> Importantly the intention is not to go adding new error checks, but to
>>> make sure the existing important error conditions are propagated in
>>> terms of counter under respective categories like below :
>>> - GT
>>> - GUC COMMUNICATION
>>> - ENGINE OTHER
>>> - GT OTHER
>>>
>>> - Tile
>>> - GTT
>>> - INTERRUPT
>>>
>>> Currently this is just a counting of errors, later these counters will
>>> be reported through netlink interface when it is implemented and
>>> ready.
>>>
>>> V9:
>>> - Make one patch for API and counter update - Michal
>>> - Remove counter from places where driver load will fail - Michal
>>> - Remove extra \n from logging
>>> - Improve commit message - Aravind/Michal
>>> V8:
>>> - Correct missed ret value handling
>>> V7:
>>> - removed double couting of err - Michal
>>> V6:
>>> - move drm_err to gt and tile specific err API - Aravind
>>> - Use GTT naming instead of GGTT - Aravind/Niranjana
>>> V5:
>>> - Dump err_type in string format
>>> V4:
>>> - dump err_type in drm_err log - Himal
>>> V2:
>>> - Use modified APIs
>>>
>>> Signed-off-by: Tejas Upadhyay <tejas.upadhyay at intel.com>
>>> ---
>>> drivers/gpu/drm/xe/xe_device_types.h | 15 +++++++
>>> drivers/gpu/drm/xe/xe_gt.c | 41 +++++++++++++++++++
>>> drivers/gpu/drm/xe/xe_gt.h | 4 ++
>>> drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c | 13 +++---
>>> drivers/gpu/drm/xe/xe_gt_types.h | 17 ++++++++
>>> drivers/gpu/drm/xe/xe_guc.c | 9 ++---
>>> drivers/gpu/drm/xe/xe_guc_ct.c | 44 +++++++++++----------
>>> drivers/gpu/drm/xe/xe_guc_pc.c | 12 ++++--
>>> drivers/gpu/drm/xe/xe_guc_submit.c | 5 ++-
>>> drivers/gpu/drm/xe/xe_irq.c | 6 ++-
>>> drivers/gpu/drm/xe/xe_reg_sr.c | 22 +++++------
>>> drivers/gpu/drm/xe/xe_tile.c | 41 +++++++++++++++++++
>>> drivers/gpu/drm/xe/xe_tile.h | 3 ++
>>> 13 files changed, 183 insertions(+), 49 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/xe/xe_device_types.h
>>> b/drivers/gpu/drm/xe/xe_device_types.h
>>> index 44d622d4cc3a..da88e69beeaf 100644
>>> --- a/drivers/gpu/drm/xe/xe_device_types.h
>>> +++ b/drivers/gpu/drm/xe/xe_device_types.h
>>> @@ -61,6 +61,18 @@ struct xe_pat_ops;
>>> const struct xe_tile * : (const struct xe_device *)((tile__)->xe),
>> \
>>> struct xe_tile * : (tile__)->xe)
>>>
>>> +/** @enum xe_tile_drv_err_type
>>
>> this doesn't look like a well formatted kernel-doc same with enum values
>> below
>>
>>> + * enum for different types of tile level errors in driver */ enum
>>> +xe_tile_drv_err_type {
>>> + /* Error type for all PPGTT and GTT errors */
>>> + XE_TILE_DRV_ERR_GTT,
>>> + /* Interrupt errors */
>>> + XE_TILE_DRV_ERR_INTR,
>>> + /* Max defined error types, keep this last */
>>
>> nit: better:
>>
>> /* private: number of defined error types, keep this last */
>>
>>> + __XE_TILE_DRV_ERR_MAX
>>> +};
>>> +
>>> /**
>>> * struct xe_mem_region - memory region structure
>>> * This is used to describe a memory region in xe @@ -190,6 +202,9 @@
>>> struct xe_tile {
>>>
>>> /** @sysfs: sysfs' kobj used by xe_tile_sysfs */
>>> struct kobject *sysfs;
>>> +
>>> + /** @drv_err_cnt: driver error counter for this tile */
>>> + u32 drv_err_cnt[__XE_TILE_DRV_ERR_MAX];
>>> };
>>>
>>> /**
>>> diff --git a/drivers/gpu/drm/xe/xe_gt.c b/drivers/gpu/drm/xe/xe_gt.c
>>> index 74e1f47bd401..993f2a9240ae 100644
>>> --- a/drivers/gpu/drm/xe/xe_gt.c
>>> +++ b/drivers/gpu/drm/xe/xe_gt.c
>>> @@ -48,6 +48,47 @@
>>> #include "xe_wa.h"
>>> #include "xe_wopcm.h"
>>>
>>> +static const char *const xe_gt_drv_err_to_str[] = {
>>> + [XE_GT_DRV_ERR_GUC_COMM] = "GUC COMMUNICATION",
>>> + [XE_GT_DRV_ERR_ENGINE] = "ENGINE OTHER",
>>> + [XE_GT_DRV_ERR_OTHERS] = "GT OTHER"
>>> +};
>>> +
>>> +/**
>>> + * xe_gt_report_driver_error - Count driver error for gt
>>
>> s/gt/GT
>>
>>> + * @gt: GT to count error for
>>> + * @err: enum error type
>>> + * @fmt: debug message format to print error
>>> + * @...: variable args to print error
>>> + *
>>> + * Increment the driver error counter in respective error
>>> + * category for this GT.
>>> + *
>>> + * Return: void.
>>> + */
>>> +void xe_gt_report_driver_error(struct xe_gt *gt,
>>> + const enum xe_gt_drv_err_type err,
>>> + const char *fmt, ...)
>>> +{
>>> + struct va_format vaf;
>>> + va_list args;
>>> +
>>> + BUILD_BUG_ON(ARRAY_SIZE(xe_gt_drv_err_to_str) !=
>>> + __XE_GT_DRV_ERR_MAX);
>>> +
>>> + xe_gt_assert(gt, err >= 0);
>>> + xe_gt_assert(gt, err < __XE_GT_DRV_ERR_MAX);
>>> + WRITE_ONCE(gt->drv_err_cnt[err],
>>> + READ_ONCE(gt->drv_err_cnt[err]) + 1);
>>> +
>>> + va_start(args, fmt);
>>> + vaf.fmt = fmt;
>>> + vaf.va = &args;
>>> +
>>> + xe_gt_err(gt, "[%s] %pV\n", xe_gt_drv_err_to_str[err], &vaf);
>>> + va_end(args);
>>> +}
>>> +
>>> struct xe_gt *xe_gt_alloc(struct xe_tile *tile) {
>>> struct xe_gt *gt;
>>> diff --git a/drivers/gpu/drm/xe/xe_gt.h b/drivers/gpu/drm/xe/xe_gt.h
>>> index caded203a8a0..efd83707b367 100644
>>> --- a/drivers/gpu/drm/xe/xe_gt.h
>>> +++ b/drivers/gpu/drm/xe/xe_gt.h
>>> @@ -67,4 +67,8 @@ static inline bool xe_gt_is_usm_hwe(struct xe_gt *gt,
>> struct xe_hw_engine *hwe)
>>> hwe->instance == gt->usm.reserved_bcs_instance; }
>>>
>>> +void xe_gt_report_driver_error(struct xe_gt *gt,
>>> + const enum xe_gt_drv_err_type err,
>>> + const char *fmt, ...);
>>> +
>>> #endif
>>> diff --git a/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c
>>> b/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c
>>> index bd6005b9d498..6af2f7b01f7b 100644
>>> --- a/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c
>>> +++ b/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c
>>> @@ -9,6 +9,7 @@
>>> #include "xe_gt.h"
>>> #include "xe_guc.h"
>>> #include "xe_guc_ct.h"
>>> +#include "xe_tile.h"
>>> #include "xe_trace.h"
>>>
>>> #define TLB_TIMEOUT (HZ / 4)
>>> @@ -35,8 +36,10 @@ static void xe_gt_tlb_fence_timeout(struct
>> work_struct *work)
>>> break;
>>>
>>> trace_xe_gt_tlb_invalidation_fence_timeout(fence);
>>> - drm_err(>_to_xe(gt)->drm, "gt%d: TLB invalidation fence
>> timeout, seqno=%d recv=%d",
>>> - gt->info.id, fence->seqno, gt-
>>> tlb_invalidation.seqno_recv);
>>> + xe_tile_report_driver_error(gt_to_tile(gt),
>> XE_TILE_DRV_ERR_GTT,
>>> + "GT%u: TLB invalidation fence
>> timeout, seqno=%d recv=%d",
>>
>> this message
>>
>>> + gt->info.id, fence->seqno,
>>> + gt->tlb_invalidation.seqno_recv);
>>>
>>> list_del(&fence->link);
>>> fence->base.error = -ETIME;
>>> @@ -317,7 +320,6 @@ int xe_gt_tlb_invalidation_vma(struct xe_gt *gt,
>>> */
>>> int xe_gt_tlb_invalidation_wait(struct xe_gt *gt, int seqno) {
>>> - struct xe_device *xe = gt_to_xe(gt);
>>> struct xe_guc *guc = >->uc.guc;
>>> int ret;
>>>
>>> @@ -329,8 +331,9 @@ int xe_gt_tlb_invalidation_wait(struct xe_gt *gt, int
>> seqno)
>>> tlb_invalidation_seqno_past(gt, seqno),
>>> TLB_TIMEOUT);
>>> if (!ret) {
>>> - drm_err(&xe->drm, "gt%d: TLB invalidation time'd out,
>> seqno=%d, recv=%d\n",
>>> - gt->info.id, seqno, gt->tlb_invalidation.seqno_recv);
>>> + xe_tile_report_driver_error(gt_to_tile(gt),
>> XE_TILE_DRV_ERR_GTT,
>>> + "GT%u: TLB invalidation time'd out,
>> seqno=%d, recv=%d",
>>
>> and this message are about the same thing but are little different
>>
>> can we print that from one place (using helper):
>>
>> static void tlb_seqno_error(gt, seq)
>> {
>> xe_tile_report_driver_error(
>> gt_to_tile(gt),
>> XE_TILE_DRV_ERR_GTT,
>> "GT%u: TLB invalidation failed for seqno=%d,
>> last received=%d",
>> seqno,
>> gt->tlb_invalidation.seqno_recv);
>> }
>>
>> or at least unify the message?
>
> TLB invalidation seqno and fence seqno are different, atleast we need those differentiation in error messages so it looks ok in my opinion.
hmm, are you sure these are different ?
see send_tlb_invalidation():
seqno = gt->tlb_invalidation.seqno;
...
fence->seqno = seqno;
>
> Thanks,
> Tejas
>>
>>> + gt->info.id, seqno, gt-
>>> tlb_invalidation.seqno_recv);
>>> return -ETIME;
>>> }
>>>
>>> diff --git a/drivers/gpu/drm/xe/xe_gt_types.h
>>> b/drivers/gpu/drm/xe/xe_gt_types.h
>>> index d3f2793684e2..6a6609162b3d 100644
>>> --- a/drivers/gpu/drm/xe/xe_gt_types.h
>>> +++ b/drivers/gpu/drm/xe/xe_gt_types.h
>>> @@ -24,6 +24,20 @@ enum xe_gt_type {
>>> XE_GT_TYPE_MEDIA,
>>> };
>>>
>>> +/** @enum xe_gt_drv_err_type
>>> + * enum for different types of gt level errors in driver
>>
>> ditto, not a proper kernel-doc
>>
>>> + */
>>> +enum xe_gt_drv_err_type {
>>> + /* Driver guc communication errors */
>>> + XE_GT_DRV_ERR_GUC_COMM,
>>> + /* Engine execution errors */
>>> + XE_GT_DRV_ERR_ENGINE,
>>> + /* Other errors like error during save/restore registers */
>>> + XE_GT_DRV_ERR_OTHERS,
>>> + /* Max defined error types, keep this last */
>>> + __XE_GT_DRV_ERR_MAX
>>> +};
>>> +
>>> #define XE_MAX_DSS_FUSE_REGS 3
>>> #define XE_MAX_EU_FUSE_REGS 1
>>>
>>> @@ -347,6 +361,9 @@ struct xe_gt {
>>> /** @oob: bitmap with active OOB workaroudns */
>>> unsigned long *oob;
>>> } wa_active;
>>> +
>>> + /** @drv_err_cnt: driver error counter for this GT */
>>> + u32 drv_err_cnt[__XE_GT_DRV_ERR_MAX];
>>> };
>>>
>>> #endif
>>> diff --git a/drivers/gpu/drm/xe/xe_guc.c b/drivers/gpu/drm/xe/xe_guc.c
>>> index 84f0b5488783..aeb9cf814440 100644
>>> --- a/drivers/gpu/drm/xe/xe_guc.c
>>> +++ b/drivers/gpu/drm/xe/xe_guc.c
>>> @@ -663,8 +663,7 @@ int xe_guc_mmio_send_recv(struct xe_guc *guc,
>> const u32 *request,
>>> 50000, &reply, false);
>>> if (ret) {
>>> timeout:
>>> - drm_err(&xe->drm, "mmio request %#x: no reply %#x\n",
>>> - request[0], reply);
>>> + xe_gt_err(gt, "mmio request %#x: no reply %#x\n", request[0],
>>> +reply);
>>
>> good but unrelated change for this series
>>
>>> return ret;
>>> }
>>>
>>> @@ -697,16 +696,14 @@ int xe_guc_mmio_send_recv(struct xe_guc *guc,
>> const u32 *request,
>>> u32 hint = FIELD_GET(GUC_HXG_FAILURE_MSG_0_HINT,
>> header);
>>> u32 error = FIELD_GET(GUC_HXG_FAILURE_MSG_0_ERROR,
>> header);
>>>
>>> - drm_err(&xe->drm, "mmio request %#x: failure %#x/%#x\n",
>>> - request[0], error, hint);
>>> + xe_gt_err(gt, "mmio request %#x: failure %#x/%#x\n",
>> request[0],
>>> +error, hint);
>>
>> ditto
>>
>>> return -ENXIO;
>>> }
>>>
>>> if (FIELD_GET(GUC_HXG_MSG_0_TYPE, header) !=
>>> GUC_HXG_TYPE_RESPONSE_SUCCESS) {
>>> proto:
>>> - drm_err(&xe->drm, "mmio request %#x: unexpected reply
>> %#x\n",
>>> - request[0], header);
>>> + xe_gt_err(gt, "mmio request %#x: unexpected reply %#x\n",
>>> +request[0], header);
>>
>> ditto
>>
>>> return -EPROTO;
>>> }
>>>
>>> diff --git a/drivers/gpu/drm/xe/xe_guc_ct.c
>>> b/drivers/gpu/drm/xe/xe_guc_ct.c index 8b686c8b3339..52dddf23a3f7
>>> 100644
>>> --- a/drivers/gpu/drm/xe/xe_guc_ct.c
>>> +++ b/drivers/gpu/drm/xe/xe_guc_ct.c
>>> @@ -732,8 +732,9 @@ static int guc_ct_send_recv(struct xe_guc_ct *ct,
>>> const u32 *action, u32 len,
>>>
>>> ret = wait_event_timeout(ct->g2h_fence_wq, g2h_fence.done, HZ);
>>> if (!ret) {
>>
>> this
>>
>>> - drm_err(&xe->drm, "Timed out wait for G2H, fence %u,
>> action %04x",
>>> - g2h_fence.seqno, action[0]);
>>> + xe_gt_report_driver_error(ct_to_gt(ct),
>> XE_GT_DRV_ERR_GUC_COMM,
>>> + "Timed out wait for G2H, fence %u,
>> action %04x",
>>> + g2h_fence.seqno, action[0]);
>>> xa_erase_irq(&ct->fence_lookup, g2h_fence.seqno);
>>> return -ETIME;
>>> }
>>> @@ -744,8 +745,9 @@ static int guc_ct_send_recv(struct xe_guc_ct *ct,
>> const u32 *action, u32 len,
>>> goto retry;
>>> }
>>> if (g2h_fence.fail) {
>>
>> and this are just two points where guc_ct_send_recv() prints the error
>> message on failure, but actually there are few more error returns that were
>> just not covered with any diagnostic message.
>>
>> so maybe instead of partially covering only selected failures, better option will
>> be to report GUC_COMM_ERROR once at single exit point or at the next level
>> caller function (still inside CTB component) ?
>>
>> ret = guc_ct_send_recv(...)
>> if (ret < 0)
>> xe_gt_report_driver_error(ct_to_gt(ct),
>> XE_GT_DRV_ERR_GUC_COMM,
>> "CTB communication failed on send (%pe)",
>> ERR_PTR(ret));
>> return ret;
>>
>>> - drm_err(&xe->drm, "Send failed, action 0x%04x, error %d,
>> hint %d",
>>> - action[0], g2h_fence.error, g2h_fence.hint);
>>> + xe_gt_report_driver_error(ct_to_gt(ct),
>> XE_GT_DRV_ERR_GUC_COMM,
>>> + "Send failed, action 0x%04x, error
>> %d, hint %d",
>>> + action[0], g2h_fence.error,
>> g2h_fence.hint);
>>> ret = -EIO;
>>> }
>>>
>>> @@ -829,7 +831,6 @@ static int parse_g2h_response(struct xe_guc_ct
>>> *ct, u32 *msg, u32 len)
>>>
>>> static int parse_g2h_msg(struct xe_guc_ct *ct, u32 *msg, u32 len) {
>>> - struct xe_device *xe = ct_to_xe(ct);
>>> u32 hxg, origin, type;
>>> int ret;
>>>
>>> @@ -839,9 +840,9 @@ static int parse_g2h_msg(struct xe_guc_ct *ct, u32
>>> *msg, u32 len)
>>>
>>> origin = FIELD_GET(GUC_HXG_MSG_0_ORIGIN, hxg);
>>> if (unlikely(origin != GUC_HXG_ORIGIN_GUC)) {
>>> - drm_err(&xe->drm,
>>> - "G2H channel broken on read, origin=%d, reset
>> required\n",
>>> - origin);
>>> + xe_gt_report_driver_error(ct_to_gt(ct),
>> XE_GT_DRV_ERR_GUC_COMM,
>>> + "G2H channel broken on read,
>> origin=%d, reset required",
>>> + origin);
>>
>> CTB channel can be also broken on send, but error reporting was missing
>> there, so maybe again, one place to report all errors on read:
>>
>> ret = dequeue_one_g2h(...)
>> if (ret < 0)
>> xe_gt_report_driver_error(ct_to_gt(ct),
>> XE_GT_DRV_ERR_GUC_COMM,
>> "CTB communication failed on read (%pe)",
>> ERR_PTR(ret));
>>
>>> ct->ctbs.g2h.info.broken = true;
>>>
>>> return -EPROTO;
>>> @@ -858,9 +859,9 @@ static int parse_g2h_msg(struct xe_guc_ct *ct, u32
>> *msg, u32 len)
>>> ret = parse_g2h_response(ct, msg, len);
>>> break;
>>> default:
>>> - drm_err(&xe->drm,
>>> - "G2H channel broken on read, type=%d, reset
>> required\n",
>>> - type);
>>> + xe_gt_report_driver_error(ct_to_gt(ct),
>> XE_GT_DRV_ERR_GUC_COMM,
>>> + "G2H channel broken on read,
>> type=%d, reset required",
>>
>> this was wrong message, it needs to be updated separately to
>>
>> "Unexpected G2H message type %#x\n",
>>
>> also, I'm not sure that this warrants a reset, but that's other story
>>
>>> + type);
>>> ct->ctbs.g2h.info.broken = true;
>>>
>>> ret = -EOPNOTSUPP;
>>> @@ -871,7 +872,6 @@ static int parse_g2h_msg(struct xe_guc_ct *ct, u32
>>> *msg, u32 len)
>>>
>>> static int process_g2h_msg(struct xe_guc_ct *ct, u32 *msg, u32 len)
>>> {
>>> - struct xe_device *xe = ct_to_xe(ct);
>>> struct xe_guc *guc = ct_to_guc(ct);
>>> u32 action = FIELD_GET(GUC_HXG_EVENT_MSG_0_ACTION, msg[1]);
>>> u32 *payload = msg + GUC_CTB_HXG_MSG_MIN_LEN; @@ -918,12
>> +918,15 @@
>>> static int process_g2h_msg(struct xe_guc_ct *ct, u32 *msg, u32 len)
>>> adj_len);
>>> break;
>>> default:
>>> - drm_err(&xe->drm, "unexpected action 0x%04x\n", action);
>>> + xe_gt_report_driver_error(ct_to_gt(ct),
>> XE_GT_DRV_ERR_GUC_COMM,
>>> + "unexpected action 0x%04x",
>>> + action);
>>
>> it seems that there is a bug here, we should have here -EOPNOTSUPP" so it
>> would fall into below error reporting, but as said above, maybe better to have
>> one place to report any error on read (we will avoid any duplicate reports and
>> also make sure to catch all issues)
>>
>>> }
>>>
>>> if (ret)
>>> - drm_err(&xe->drm, "action 0x%04x failed processing,
>> ret=%d\n",
>>> - action, ret);
>>> + xe_gt_report_driver_error(ct_to_gt(ct),
>> XE_GT_DRV_ERR_GUC_COMM,
>>> + "action 0x%04x failed processing,
>> ret=%d",
>>> + action, ret);
>>>
>>> return 0;
>>> }
>>> @@ -957,9 +960,9 @@ static int g2h_read(struct xe_guc_ct *ct, u32 *msg,
>> bool fast_path)
>>> sizeof(u32));
>>> len = FIELD_GET(GUC_CTB_MSG_0_NUM_DWORDS, msg[0]) +
>> GUC_CTB_MSG_MIN_LEN;
>>> if (len > avail) {
>>> - drm_err(&xe->drm,
>>> - "G2H channel broken on read, avail=%d, len=%d,
>> reset required\n",
>>> - avail, len);
>>> + xe_gt_report_driver_error(ct_to_gt(ct),
>> XE_GT_DRV_ERR_GUC_COMM,
>>> + "G2H channel broken on read,
>> avail=%d, len=%d, reset required",
>>> + avail, len);
>>> g2h->info.broken = true;
>>>
>>> return -EPROTO;
>>> @@ -1027,8 +1030,9 @@ static void g2h_fast_path(struct xe_guc_ct *ct, u32
>> *msg, u32 len)
>>> }
>>>
>>> if (ret)
>>> - drm_err(&xe->drm, "action 0x%04x failed processing,
>> ret=%d\n",
>>> - action, ret);
>>> + xe_gt_report_driver_error(ct_to_gt(ct),
>> XE_GT_DRV_ERR_GUC_COMM,
>>> + "action 0x%04x failed processing,
>> ret=%d",
>>> + action, ret);
>>> }
>>>
>>> /**
>>> diff --git a/drivers/gpu/drm/xe/xe_guc_pc.c
>>> b/drivers/gpu/drm/xe/xe_guc_pc.c index d9375d1d582f..137c7ce44f47
>>> 100644
>>> --- a/drivers/gpu/drm/xe/xe_guc_pc.c
>>> +++ b/drivers/gpu/drm/xe/xe_guc_pc.c
>>> @@ -218,9 +218,15 @@ static int pc_action_set_param(struct xe_guc_pc
>> *pc, u8 id, u32 value)
>>> return -EAGAIN;
>>>
>>> ret = xe_guc_ct_send(ct, action, ARRAY_SIZE(action), 0, 0);
>>> - if (ret)
>>> - drm_err(&pc_to_xe(pc)->drm, "GuC PC set param failed:
>> %pe",
>>> - ERR_PTR(ret));
>>> + if (ret) {
>>> + /* GT reset already triggered due to deadlock, lets not report
>> this error */
>>> + if (ret != -EDEADLK)
>>> + xe_gt_report_driver_error(pc_to_gt(pc),
>> XE_GT_DRV_ERR_GUC_COMM,
>>> + "GuC PC set param failed:
>> %pe",
>>> + ERR_PTR(ret));
>>> + else
>>> + xe_gt_err(pc_to_gt(pc), "GuC PC set param failed:
>> %pe\n",
>>> +ERR_PTR(ret));
>>
>> this selective reporting makes things unreliable and error prone (ouch!)
>>
>> note that you still didn't avoid duplication, as this error was reported as
>> xe_gt_report_driver_error("Send failed, action 0x%04x, ...")
>>
>> IMO better to report GuC comm send/recv errors in one place (in CTB layer)
>>
>> btw, since you care about "power/performance regressions" then maybe
>> errors from GUC_PC deserve it's own category for easier diagnostics ?
>>
>>> + }
>>>
>>> return ret;
>>> }
>>> diff --git a/drivers/gpu/drm/xe/xe_guc_submit.c
>>> b/drivers/gpu/drm/xe/xe_guc_submit.c
>>> index 870dc5c532fa..283832d518b1 100644
>>> --- a/drivers/gpu/drm/xe/xe_guc_submit.c
>>> +++ b/drivers/gpu/drm/xe/xe_guc_submit.c
>>> @@ -1681,8 +1681,9 @@ int
>> xe_guc_exec_queue_reset_failure_handler(struct xe_guc *guc, u32 *msg, u32
>> le
>>> reason = msg[2];
>>>
>>> /* Unexpected failure of a hardware feature, log an actual error */
>>> - drm_err(&xe->drm, "GuC engine reset request failed on %d:%d
>> because 0x%08X",
>>> - guc_class, instance, reason);
>>> + xe_gt_report_driver_error(guc_to_gt(guc), XE_GT_DRV_ERR_ENGINE,
>>> + "GuC engine reset request failed on %d:%d
>> because 0x%08X",
>>> + guc_class, instance, reason);
>>>
>>> xe_gt_reset_async(guc_to_gt(guc));
>>>
>>> diff --git a/drivers/gpu/drm/xe/xe_irq.c b/drivers/gpu/drm/xe/xe_irq.c
>>> index 61350ed32c61..2b20582b0425 100644
>>> --- a/drivers/gpu/drm/xe/xe_irq.c
>>> +++ b/drivers/gpu/drm/xe/xe_irq.c
>>> @@ -18,6 +18,7 @@
>>> #include "xe_guc.h"
>>> #include "xe_hw_engine.h"
>>> #include "xe_mmio.h"
>>> +#include "xe_tile.h"
>>>
>>> /*
>>> * Interrupt registers for a unit are always consecutive and ordered
>>> @@ -227,8 +228,9 @@ gt_engine_identity(struct xe_device *xe,
>>> !time_after32(local_clock() >> 10, timeout_ts));
>>>
>>> if (unlikely(!(ident & INTR_DATA_VALID))) {
>>> - drm_err(&xe->drm, "INTR_IDENTITY_REG%u:%u 0x%08x not
>> valid!\n",
>>> - bank, bit, ident);
>>> + xe_tile_report_driver_error(gt_to_tile(mmio),
>> XE_TILE_DRV_ERR_INTR,
>>> + "INTR_IDENTITY_REG%u:%u
>> 0x%08x not valid!",
>>> + bank, bit, ident);
>>> return 0;
>>> }
>>>
>>> diff --git a/drivers/gpu/drm/xe/xe_reg_sr.c
>>> b/drivers/gpu/drm/xe/xe_reg_sr.c index 87adefb56024..f2e20e10d927
>>> 100644
>>> --- a/drivers/gpu/drm/xe/xe_reg_sr.c
>>> +++ b/drivers/gpu/drm/xe/xe_reg_sr.c
>>> @@ -125,12 +125,12 @@ int xe_reg_sr_add(struct xe_reg_sr *sr,
>>> return 0;
>>>
>>> fail:
>>> - xe_gt_err(gt,
>>> - "discarding save-restore reg %04lx (clear: %08x, set: %08x,
>> masked: %s, mcr: %s): ret=%d\n",
>>> - idx, e->clr_bits, e->set_bits,
>>> - str_yes_no(e->reg.masked),
>>> - str_yes_no(e->reg.mcr),
>>> - ret);
>>> + xe_gt_report_driver_error(gt, XE_GT_DRV_ERR_OTHERS,
>>> + "discarding save-restore reg %04lx (clear:
>> %08x, set: %08x, masked: %s, mcr: %s): ret=%d",
>>> + idx, e->clr_bits, e->set_bits,
>>> + str_yes_no(e->reg.masked),
>>> + str_yes_no(e->reg.mcr),
>>> + ret);
>>> reg_sr_inc_error(sr);
>>>
>>> return ret;
>>> @@ -207,7 +207,7 @@ void xe_reg_sr_apply_mmio(struct xe_reg_sr *sr,
>> struct xe_gt *gt)
>>> return;
>>>
>>> err_force_wake:
>>> - xe_gt_err(gt, "Failed to apply, err=%d\n", err);
>>> + xe_gt_report_driver_error(gt, XE_GT_DRV_ERR_OTHERS, "Failed to
>>> +apply, err=%d", err);
>>> }
>>>
>>> void xe_reg_sr_apply_whitelist(struct xe_hw_engine *hwe) @@ -234,9
>>> +234,9 @@ void xe_reg_sr_apply_whitelist(struct xe_hw_engine *hwe)
>>> p = drm_debug_printer(KBUILD_MODNAME);
>>> xa_for_each(&sr->xa, reg, entry) {
>>> if (slot == RING_MAX_NONPRIV_SLOTS) {
>>> - xe_gt_err(gt,
>>> - "hwe %s: maximum register whitelist slots
>> (%d) reached, refusing to add more\n",
>>> - hwe->name, RING_MAX_NONPRIV_SLOTS);
>>> + xe_gt_report_driver_error(gt,
>> XE_GT_DRV_ERR_ENGINE,
>>> + "hwe %s: maximum register
>> whitelist slots (%d) reached, refusing to add more",
>>> + hwe->name,
>> RING_MAX_NONPRIV_SLOTS);
>>> break;
>>> }
>>>
>>> @@ -259,7 +259,7 @@ void xe_reg_sr_apply_whitelist(struct xe_hw_engine
>> *hwe)
>>> return;
>>>
>>> err_force_wake:
>>> - drm_err(&xe->drm, "Failed to apply, err=%d\n", err);
>>> + xe_gt_report_driver_error(gt, XE_GT_DRV_ERR_OTHERS, "Failed to
>>> +apply, err=%d", err);
>>> }
>>>
>>> /**
>>> diff --git a/drivers/gpu/drm/xe/xe_tile.c
>>> b/drivers/gpu/drm/xe/xe_tile.c index 131752a57f65..364ec9628a41 100644
>>> --- a/drivers/gpu/drm/xe/xe_tile.c
>>> +++ b/drivers/gpu/drm/xe/xe_tile.c
>>> @@ -71,6 +71,47 @@
>>> * - MOCS and PAT programming
>>> */
>>>
>>> +static const char *const xe_tile_drv_err_to_str[] = {
>>> + [XE_TILE_DRV_ERR_GTT] = "GTT",
>>> + [XE_TILE_DRV_ERR_INTR] = "INTERRUPT"
>>> +};
>>> +
>>> +/**
>>> + * xe_tile_report_driver_error - Count driver error for tile
>>> + * @tile: Tile to count error for
>>> + * @err: Enum error type
>>> + * @fmt: debug message format to print error
>>> + * @...: variable args to print error
>>> + *
>>> + * Increment the driver error counter in respective error
>>> + * category for this tile.
>>> + *
>>> + * Return: void.
>>> + */
>>> +void xe_tile_report_driver_error(struct xe_tile *tile,
>>> + const enum xe_tile_drv_err_type err,
>>> + const char *fmt, ...)
>>> +{
>>> + struct va_format vaf;
>>> + va_list args;
>>> +
>>> + BUILD_BUG_ON(ARRAY_SIZE(xe_tile_drv_err_to_str) !=
>>> + __XE_TILE_DRV_ERR_MAX);
>>> +
>>> + xe_assert(tile_to_xe(tile), err >= 0);
>>> + xe_assert(tile_to_xe(tile), err < __XE_TILE_DRV_ERR_MAX);
>>
>> you can use xe_tile_assert() here
>>
>>> + WRITE_ONCE(tile->drv_err_cnt[err],
>>> + READ_ONCE(tile->drv_err_cnt[err]) + 1);
>>> +
>>> + va_start(args, fmt);
>>> + vaf.fmt = fmt;
>>> + vaf.va = &args;
>>> +
>>> + drm_err(&tile->xe->drm, "TILE%u [%s] %pV\n",
>>> + tile->id, xe_tile_drv_err_to_str[err], &vaf);
>>> + va_end(args);
>>> +}
>>> +
>>> /**
>>> * xe_tile_alloc - Perform per-tile memory allocation
>>> * @tile: Tile to perform allocations for diff --git
>>> a/drivers/gpu/drm/xe/xe_tile.h b/drivers/gpu/drm/xe/xe_tile.h index
>>> 782c47f8bd45..c79108fb9579 100644
>>> --- a/drivers/gpu/drm/xe/xe_tile.h
>>> +++ b/drivers/gpu/drm/xe/xe_tile.h
>>> @@ -14,5 +14,8 @@ int xe_tile_alloc(struct xe_tile *tile); int
>>> xe_tile_init_noalloc(struct xe_tile *tile);
>>>
>>> void xe_tile_migrate_wait(struct xe_tile *tile);
>>> +void xe_tile_report_driver_error(struct xe_tile *tile,
>>> + const enum xe_tile_drv_err_type err,
>>> + const char *fmt, ...);
>>>
>>> #endif
More information about the Intel-xe
mailing list