[Intel-xe] [PATCH 11/11] drm/xe: Clear all SoC errors post warm reset.
Aravind Iddamsetty
aravind.iddamsetty at linux.intel.com
Wed Oct 11 06:56:13 UTC 2023
On 27/09/23 17:16, Himal Prasad Ghimiray wrote:
> There are scenarios where no fatal errors are reported, but
> non-fatal/correctable errors are reported from the SoC uncore
> to the IEH and not propagated to the SG unit. Clear all previous
> SoC errors post warm reset.
The commit message is not very clear: how is fatal error reporting related to the other errors?
Thanks,
Aravind.
>
> Signed-off-by: Himal Prasad Ghimiray <himal.prasad.ghimiray at intel.com>
> ---
> drivers/gpu/drm/xe/xe_hw_error.c | 37 ++++++++++++++++++++++++++++++++
> drivers/gpu/drm/xe/xe_hw_error.h | 1 +
> drivers/gpu/drm/xe/xe_irq.c | 1 +
> 3 files changed, 39 insertions(+)
>
> diff --git a/drivers/gpu/drm/xe/xe_hw_error.c b/drivers/gpu/drm/xe/xe_hw_error.c
> index 0bcb1bea7ffb..a777c887a7be 100644
> --- a/drivers/gpu/drm/xe/xe_hw_error.c
> +++ b/drivers/gpu/drm/xe/xe_hw_error.c
> @@ -366,6 +366,43 @@ static void xe_assign_hw_err_regs(struct xe_device *xe)
> }
> }
>
> +void xe_clear_all_soc_errors(struct xe_device *xe)
> +{
> + enum hardware_error hw_err;
> + u32 base, slave_base;
> + struct xe_tile *tile;
> + struct xe_gt *gt;
> + unsigned int i;
> +
> + base = SOC_PVC_BASE;
> + slave_base = SOC_PVC_SLAVE_BASE;
> +
> + hw_err = HARDWARE_ERROR_CORRECTABLE;
> +
> + for_each_tile(tile, xe, i) {
> + gt = tile->primary_gt;
> +
> + while (hw_err < HARDWARE_ERROR_MAX) {
> + for (i = 0; i < PVC_NUM_IEH; i++)
> + xe_mmio_write32(gt, SOC_GSYSEVTCTL_REG(base, slave_base, i),
> + ~REG_BIT(hw_err));
> +
> + xe_mmio_write32(gt, SOC_GLOBAL_ERR_STAT_MASTER_REG(base, hw_err),
> + REG_GENMASK(31, 0));
> + xe_mmio_write32(gt, SOC_LOCAL_ERR_STAT_MASTER_REG(base, hw_err),
> + REG_GENMASK(31, 0));
> + xe_mmio_write32(gt, SOC_GLOBAL_ERR_STAT_SLAVE_REG(slave_base, hw_err),
> + REG_GENMASK(31, 0));
> + xe_mmio_write32(gt, SOC_LOCAL_ERR_STAT_SLAVE_REG(slave_base, hw_err),
> + REG_GENMASK(31, 0));
> + hw_err++;
> + }
> + for (i = 0; i < PVC_NUM_IEH; i++)
> + xe_mmio_write32(gt, SOC_GSYSEVTCTL_REG(base, slave_base, i),
> + (HARDWARE_ERROR_MAX << 1) + 1);
> + }
> +}
> +
> static void
> xe_gt_hw_error_status_reg_handler(struct xe_gt *gt, const enum hardware_error hw_err)
> {
> diff --git a/drivers/gpu/drm/xe/xe_hw_error.h b/drivers/gpu/drm/xe/xe_hw_error.h
> index a458a90b34a2..7ada7c97c939 100644
> --- a/drivers/gpu/drm/xe/xe_hw_error.h
> +++ b/drivers/gpu/drm/xe/xe_hw_error.h
> @@ -219,4 +219,5 @@ struct xe_tile;
> void xe_hw_error_irq_handler(struct xe_tile *tile, const u32 master_ctl);
> void xe_process_hw_errors(struct xe_device *xe);
> void xe_gsc_hw_error_work(struct work_struct *work);
> +void xe_clear_all_soc_errors(struct xe_device *xe);
> #endif
> diff --git a/drivers/gpu/drm/xe/xe_irq.c b/drivers/gpu/drm/xe/xe_irq.c
> index 285c657cc789..42a6bb45acba 100644
> --- a/drivers/gpu/drm/xe/xe_irq.c
> +++ b/drivers/gpu/drm/xe/xe_irq.c
> @@ -597,6 +597,7 @@ int xe_irq_install(struct xe_device *xe)
> }
>
> xe_process_hw_errors(xe);
> + xe_clear_all_soc_errors(xe);
>
> xe->irq.enabled = true;
>
More information about the Intel-xe
mailing list