[Intel-xe] [PATCH 11/11] drm/xe: Clear all SoC errors post warm reset.

Thu Oct 12 03:05:18 UTC 2023

On 11/10/23 12:29, Ghimiray, Himal Prasad wrote:
>
> On 11-10-2023 12:26, Aravind Iddamsetty wrote:
>> On 27/09/23 17:16, Himal Prasad Ghimiray wrote:
>>> There are scenarios where there are no fatal errors reported
>>> but Non-fatal/correctable errors being reported from the SoC
>>> uncore to IEH and not propagated to SG unit. Clear all previous
>>> SoC errors post warm reset.
>> The commit message is not very clear: how is fatal error reporting related to the other errors?
> Will rephrase it as
>
> There are scenarios where errors are reported from the SoC
> uncore to IEH but not propagated to the SG unit. Since these errors are not propagated to the SG unit,
> the driver won't be able to clear them as part of xe_process_hw_errors. Hence clear all SoC registers after
> xe_process_hw_errors.
>
The scenario this patch addresses occurs only during boot-up, so let's mention that.

Thanks,

Aravind.

> Is it OK?
>
>>
>> Thanks,
>> Aravind.
>>> Signed-off-by: Himal Prasad Ghimiray <himal.prasad.ghimiray at intel.com>
>>> ---
>>>   drivers/gpu/drm/xe/xe_hw_error.c | 37 ++++++++++++++++++++++++++++++++
>>>   drivers/gpu/drm/xe/xe_hw_error.h |  1 +
>>>   drivers/gpu/drm/xe/xe_irq.c      |  1 +
>>>   3 files changed, 39 insertions(+)
>>>
>>> diff --git a/drivers/gpu/drm/xe/xe_hw_error.c b/drivers/gpu/drm/xe/xe_hw_error.c
>>> index 0bcb1bea7ffb..a777c887a7be 100644
>>> --- a/drivers/gpu/drm/xe/xe_hw_error.c
>>> +++ b/drivers/gpu/drm/xe/xe_hw_error.c
>>> @@ -366,6 +366,43 @@ static void xe_assign_hw_err_regs(struct xe_device *xe)
>>>       }
>>>   }
>>>   +void xe_clear_all_soc_errors(struct xe_device *xe)
>>> +{
>>> +    enum hardware_error hw_err;
>>> +    u32 base, slave_base;
>>> +    struct xe_tile *tile;
>>> +    struct xe_gt *gt;
>>> +    unsigned int i;
>>> +
>>> +    base = SOC_PVC_BASE;
>>> +    slave_base = SOC_PVC_SLAVE_BASE;
>>> +
>>> +    hw_err = HARDWARE_ERROR_CORRECTABLE;
>>> +
>>> +    for_each_tile(tile, xe, i) {
>>> +        gt = tile->primary_gt;
>>> +
>>> +        while (hw_err < HARDWARE_ERROR_MAX) {
>>> +            for (i = 0; i < PVC_NUM_IEH; i++)
>>> +                xe_mmio_write32(gt, SOC_GSYSEVTCTL_REG(base, slave_base, i),
>>> +                        ~REG_BIT(hw_err));
>>> +
>>> +            xe_mmio_write32(gt, SOC_GLOBAL_ERR_STAT_MASTER_REG(base, hw_err),
>>> +                    REG_GENMASK(31, 0));
>>> +            xe_mmio_write32(gt, SOC_LOCAL_ERR_STAT_MASTER_REG(base, hw_err),
>>> +                    REG_GENMASK(31, 0));
>>> +            xe_mmio_write32(gt, SOC_GLOBAL_ERR_STAT_SLAVE_REG(slave_base, hw_err),
>>> +                    REG_GENMASK(31, 0));
>>> +            xe_mmio_write32(gt, SOC_LOCAL_ERR_STAT_SLAVE_REG(slave_base, hw_err),
>>> +                    REG_GENMASK(31, 0));
>>> +            hw_err++;
>>> +        }
>>> +        for (i = 0; i < PVC_NUM_IEH; i++)
>>> +            xe_mmio_write32(gt, SOC_GSYSEVTCTL_REG(base, slave_base, i),
>>> +                    (HARDWARE_ERROR_MAX << 1) + 1);
>>> +    }
>>> +}
>>> +
>>>   static void
>>>   xe_gt_hw_error_status_reg_handler(struct xe_gt *gt, const enum hardware_error hw_err)
>>>   {
>>> diff --git a/drivers/gpu/drm/xe/xe_hw_error.h b/drivers/gpu/drm/xe/xe_hw_error.h
>>> index a458a90b34a2..7ada7c97c939 100644
>>> --- a/drivers/gpu/drm/xe/xe_hw_error.h
>>> +++ b/drivers/gpu/drm/xe/xe_hw_error.h
>>> @@ -219,4 +219,5 @@ struct xe_tile;
>>>   void xe_hw_error_irq_handler(struct xe_tile *tile, const u32 master_ctl);
>>>   void xe_process_hw_errors(struct xe_device *xe);
>>>   void xe_gsc_hw_error_work(struct work_struct *work);
>>> +void xe_clear_all_soc_errors(struct xe_device *xe);
>>>   #endif
>>> diff --git a/drivers/gpu/drm/xe/xe_irq.c b/drivers/gpu/drm/xe/xe_irq.c
>>> index 285c657cc789..42a6bb45acba 100644
>>> --- a/drivers/gpu/drm/xe/xe_irq.c
>>> +++ b/drivers/gpu/drm/xe/xe_irq.c
>>> @@ -597,6 +597,7 @@ int xe_irq_install(struct xe_device *xe)
>>>       }
>>>         xe_process_hw_errors(xe);
>>> +    xe_clear_all_soc_errors(xe);
>>>         xe->irq.enabled = true;
>>>