[Intel-xe] [PATCH v4 10/10] drm/xe: Clear all SoC errors post warm reset.

Thu Oct 19 08:26:57 UTC 2023

On 18/10/23 09:30, Himal Prasad Ghimiray wrote:
> There are scenarios where errors are reported from the SoC uncore to
> the IEH but are not propagated to the SG unit. Since these errors are not
> propagated to the SG unit, the driver won't be able to clear them as part
> of xe_process_hw_errors. Hence clear all SoC registers after
> xe_process_hw_errors during driver load.
>
> v2
> - Fix commit message.
>
> v3
> - Limit check to PVC.
>
> v4
> - Fix check
>
> Cc: Aravind Iddamsetty <aravind.iddamsetty at intel.com>
> Signed-off-by: Himal Prasad Ghimiray <himal.prasad.ghimiray at intel.com>
> ---
>  drivers/gpu/drm/xe/xe_hw_error.c | 40 ++++++++++++++++++++++++++++++++
>  drivers/gpu/drm/xe/xe_hw_error.h |  1 +
>  drivers/gpu/drm/xe/xe_irq.c      |  1 +
>  3 files changed, 42 insertions(+)
>
> diff --git a/drivers/gpu/drm/xe/xe_hw_error.c b/drivers/gpu/drm/xe/xe_hw_error.c
> index 5700f708425f..381bd7180b80 100644
> --- a/drivers/gpu/drm/xe/xe_hw_error.c
> +++ b/drivers/gpu/drm/xe/xe_hw_error.c
> @@ -519,6 +519,46 @@ xe_gt_hw_error_log_vector_reg(struct xe_gt *gt, const enum hardware_error hw_err
>  	}
>  }
>  
> +void xe_clear_all_soc_errors(struct xe_device *xe)
> +{
> +	enum hardware_error hw_err;
> +	u32 base, slave_base;
> +	struct xe_tile *tile;
> +	struct xe_gt *gt;
> +	unsigned int i;
> +
> +	if (xe->info.platform != XE_PVC)
> +		return;
> +
> +	base = SOC_PVC_BASE;
> +	slave_base = SOC_PVC_SLAVE_BASE;
> +
> +	hw_err = HARDWARE_ERROR_CORRECTABLE;
nit: hw_err can be initialized where it is declared.
> +
> +	for_each_tile(tile, xe, i) {
> +		gt = tile->primary_gt;
> +
> +		while (hw_err < HARDWARE_ERROR_MAX) {
> +			for (i = 0; i < XE_SOC_NUM_IEH; i++)
> +				xe_mmio_write32(gt, SOC_GSYSEVTCTL_REG(base, slave_base, i),
> +						~REG_BIT(hw_err));
> +
> +			xe_mmio_write32(gt, SOC_GLOBAL_ERR_STAT_MASTER_REG(base, hw_err),
> +					REG_GENMASK(31, 0));
> +			xe_mmio_write32(gt, SOC_LOCAL_ERR_STAT_MASTER_REG(base, hw_err),
> +					REG_GENMASK(31, 0));
> +			xe_mmio_write32(gt, SOC_GLOBAL_ERR_STAT_SLAVE_REG(slave_base, hw_err),
> +					REG_GENMASK(31, 0));
> +			xe_mmio_write32(gt, SOC_LOCAL_ERR_STAT_SLAVE_REG(slave_base, hw_err),
> +					REG_GENMASK(31, 0));
> +			hw_err++;
> +		}
> +		for (i = 0; i < XE_SOC_NUM_IEH; i++)
> +			xe_mmio_write32(gt, SOC_GSYSEVTCTL_REG(base, slave_base, i),
> +					(HARDWARE_ERROR_MAX << 1) + 1);
> +	}
> +}
> +
>  static void
>  xe_gt_hw_error_handler(struct xe_gt *gt, const enum hardware_error hw_err)
>  {
> diff --git a/drivers/gpu/drm/xe/xe_hw_error.h b/drivers/gpu/drm/xe/xe_hw_error.h
> index 59b331f52783..f27c166455da 100644
> --- a/drivers/gpu/drm/xe/xe_hw_error.h
> +++ b/drivers/gpu/drm/xe/xe_hw_error.h
> @@ -206,4 +206,5 @@ void xe_hw_error_irq_handler(struct xe_tile *tile, const u32 master_ctl);
>  void xe_assign_hw_err_regs(struct xe_device *xe);
>  void xe_process_hw_errors(struct xe_device *xe);
>  void xe_gsc_hw_error_work(struct work_struct *work);
> +void xe_clear_all_soc_errors(struct xe_device *xe);
>  #endif
> diff --git a/drivers/gpu/drm/xe/xe_irq.c b/drivers/gpu/drm/xe/xe_irq.c
> index bc0f01a2abc1..93cb948de516 100644
> --- a/drivers/gpu/drm/xe/xe_irq.c
> +++ b/drivers/gpu/drm/xe/xe_irq.c
> @@ -626,6 +626,7 @@ int xe_irq_install(struct xe_device *xe)
>  
>  	xe_assign_hw_err_regs(xe);
>  	xe_process_hw_errors(xe);
> +	xe_clear_all_soc_errors(xe);
>  
>  	xe_irq_reset(xe);

Reviewed-by: Aravind Iddamsetty <aravind.iddamsetty at linux.intel.com>

Thanks,
Aravind.
>