[Intel-xe] [PATCH v3 06/11] drm/xe: Notify userspace about GSC HW errors.

Fri Oct 20 03:38:34 UTC 2023

On 19-10-2023 22:34, Welty, Brian wrote:
>
>
> On 10/19/2023 6:25 AM, Himal Prasad Ghimiray wrote:
>> Send uevent incase of nonfatal errors reported by gsc.
>>
>> TODO: Since XE_RESET_FAILED_UEVENT and XE_GSC_HW_HEALTH_UEVENT
>> both pass DEVICE_STATUS=NEEDS_RESET. Use same Uevent to determine
>> the RESET_REQUIRED and pass reasons for reset required as additional
>> info.
>>
>> v2
>> - No need to provide tile info in uevent because error is
>> accepted only in tile0. (Aravind)
>>
>> v3
>> - Use pci node to send uevent.
>> - Pass RESET_REASON for uevent.
>
> Can we go a step farther as Aravind was saying and just have one 
> single uevent?  Or maybe you have one uevent now, but can still 
> consolidate a bit more.
> So replace XE_GSC_HW_HEALTH_UEVENT and XE_RESET_FAILED_UEVENT with one 
> uevent, defined in uapi header with:
>
> #define XE_RESET_REQUIRED_UEVENT        "RESET_REQUIRED"
> #define XE_RESET_REQUIRED_UEVENT_REASON_CSC "REASON=CSC_HW_ERROR"
> #define XE_RESET_REQUIRED_UEVENT_REASON_GT "REASON=GT_RESET_FAILED"

Hi Brian,

While implementing XE_RESET_FAILED_UEVENT, the recommendation was to use

"DEVICE_STATUS=NEEDS_RESET" instead of "RESET_REQUIRED.

Will be proceeding with:

#define XE_RESET_REQUIRED_UEVENT        "DEVICE_STATUS=NEEDS_RESET"
#define XE_RESET_REQUIRED_UEVENT_REASON_CSC "REASON=CSC_HW_ERROR"
#define XE_RESET_REQUIRED_UEVENT_REASON_GT "REASON=GT_RESET_FAILED"

Thanks for the review comments, will be updating it in next patch.

BR
Himal

>
> Would above work?
> -Brian
>
>
>>
>> Cc: Aravind Iddamsetty <aravind.iddamsetty at linux.intel.com>
>> Cc: Brian Welty <brian.welty at intel.com>
>> Signed-off-by: Himal Prasad Ghimiray <himal.prasad.ghimiray at intel.com>
>> ---
>>   drivers/gpu/drm/xe/xe_device_types.h |  3 +++
>>   drivers/gpu/drm/xe/xe_hw_error.c     | 17 +++++++++++++++++
>>   drivers/gpu/drm/xe/xe_hw_error.h     |  3 ++-
>>   drivers/gpu/drm/xe/xe_irq.c          |  4 ++++
>>   include/uapi/drm/xe_drm.h            |  9 +++++++++
>>   5 files changed, 35 insertions(+), 1 deletion(-)
>>
>> diff --git a/drivers/gpu/drm/xe/xe_device_types.h 
>> b/drivers/gpu/drm/xe/xe_device_types.h
>> index 3b371e0f6168..bc7b545ce37b 100644
>> --- a/drivers/gpu/drm/xe/xe_device_types.h
>> +++ b/drivers/gpu/drm/xe/xe_device_types.h
>> @@ -196,6 +196,9 @@ struct xe_tile {
>>       struct tile_hw_errors {
>>           struct xarray hw_error;
>>       } errors;
>> +
>> +    /** @gsc_hw_err_work: worker for uevent to report GSC HW errors */
>> +    struct work_struct gsc_hw_err_work;
>>   };
>>     /**
>> diff --git a/drivers/gpu/drm/xe/xe_hw_error.c 
>> b/drivers/gpu/drm/xe/xe_hw_error.c
>> index 84533ef6a51e..644e63d4d717 100644
>> --- a/drivers/gpu/drm/xe/xe_hw_error.c
>> +++ b/drivers/gpu/drm/xe/xe_hw_error.c
>> @@ -3,6 +3,8 @@
>>    * Copyright © 2023 Intel Corporation
>>    */
>>   +#include <drm/xe_drm.h>
>> +
>>   #include "xe_hw_error.h"
>>     #include "regs/xe_regs.h"
>> @@ -378,6 +380,20 @@ xe_gt_hw_error_handler(struct xe_gt *gt, const 
>> enum hardware_error hw_err)
>>           xe_gt_hw_error_log_vector_reg(gt, hw_err);
>>   }
>>   +void xe_gsc_hw_error_work(struct work_struct *work)
>> +{
>> +    struct xe_tile *tile = container_of(work, typeof(*tile), 
>> gsc_hw_err_work);
>> +    struct pci_dev *pdev = to_pci_dev(tile_to_xe(tile)->drm.dev);
>> +    char *csc_hw_error_event[3];
>> +
>> +    csc_hw_error_event[0] = XE_GSC_HW_HEALTH_UEVENT "=1";
>> +    csc_hw_error_event[1] = "RESET_REASON=CSC_HW_ERROR";
>> +    csc_hw_error_event[2] = NULL;
>> +
>> +    kobject_uevent_env(&pdev->dev.kobj, KOBJ_CHANGE,
>> +               csc_hw_error_event);
>> +}
>> +
>>   static void
>>   xe_gsc_hw_error_handler(struct xe_tile *tile, const enum 
>> hardware_error hw_err)
>>   {
>> @@ -435,6 +451,7 @@ xe_gsc_hw_error_handler(struct xe_tile *tile, 
>> const enum hardware_error hw_err)
>>               drm_err_ratelimited(&tile_to_xe(tile)->drm, HW_ERR
>>                           "Tile0 reported GSC %s %s error, bit[%d] is 
>> set\n",
>>                           name, hw_err_str, errbit);
>> +            schedule_work(&tile->gsc_hw_err_work);
>>           }
>>           if (indx != XE_HW_ERR_TILE_UNSPEC)
>> xe_update_hw_error_cnt(&tile_to_xe(tile)->drm,
>> diff --git a/drivers/gpu/drm/xe/xe_hw_error.h 
>> b/drivers/gpu/drm/xe/xe_hw_error.h
>> index 997747b6b603..3c02fbd3b256 100644
>> --- a/drivers/gpu/drm/xe/xe_hw_error.h
>> +++ b/drivers/gpu/drm/xe/xe_hw_error.h
>> @@ -7,6 +7,7 @@
>>     #include <linux/stddef.h>
>>   #include <linux/types.h>
>> +#include <linux/workqueue.h>
>>     #define XE_RAS_REG_SIZE 32
>>   @@ -106,5 +107,5 @@ struct xe_tile;
>>   void xe_hw_error_irq_handler(struct xe_tile *tile, const u32 
>> master_ctl);
>>   void xe_assign_hw_err_regs(struct xe_device *xe);
>>   void xe_process_hw_errors(struct xe_device *xe);
>> -
>> +void xe_gsc_hw_error_work(struct work_struct *work);
>>   #endif
>> diff --git a/drivers/gpu/drm/xe/xe_irq.c b/drivers/gpu/drm/xe/xe_irq.c
>> index 8365a4cb0c45..bc0f01a2abc1 100644
>> --- a/drivers/gpu/drm/xe/xe_irq.c
>> +++ b/drivers/gpu/drm/xe/xe_irq.c
>> @@ -614,6 +614,10 @@ int xe_irq_install(struct xe_device *xe)
>>       irq_handler_t irq_handler;
>>       int err, irq;
>>   +    struct xe_tile *tile = xe_device_get_root_tile(xe);
>> +
>> +    INIT_WORK(&tile->gsc_hw_err_work, xe_gsc_hw_error_work);
>> +
>>       irq_handler = xe_irq_handler(xe);
>>       if (!irq_handler) {
>>           drm_err(&xe->drm, "No supported interrupt handler");
>> diff --git a/include/uapi/drm/xe_drm.h b/include/uapi/drm/xe_drm.h
>> index 24bf8f0f52e8..4e5e57bc9f69 100644
>> --- a/include/uapi/drm/xe_drm.h
>> +++ b/include/uapi/drm/xe_drm.h
>> @@ -16,6 +16,15 @@ extern "C" {
>>    * subject to backwards-compatibility constraints.
>>    */
>>   +/**
>> + * DOC: uevent generated by xe on it's pci node.
>> + *
>> + * XE_GSC_HW_HEALTH_UEVENT - Event is generated when GSC reports HW
>> + * errors. The value supplied with the event is always "NEEDS_RESET"
>> + * Additional information supplied is "RESET_REASON=CSC_HW_ERROR".
>> + */
>> +#define XE_GSC_HW_HEALTH_UEVENT "DEVICE_STATUS"
>> +
>>   /**
>>    * DOC: uevent generated by xe on it's pci node.
>>    *
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <https://lists.freedesktop.org/archives/intel-xe/attachments/20231020/1145e4b0/attachment-0001.htm>