[Intel-xe] [PATCH v5 4/4] drm/xe: Process fatal hardware errors.

Tue Sep 26 10:24:46 UTC 2023

On 26-09-2023 09:51, Aravind Iddamsetty wrote:
> On 23/08/23 14:28, Himal Prasad Ghimiray wrote:
>> Fatal errors are reported as PCIe errors. When a PCIe error is asserted,
>> the OS will perform a device warm reset which causes the driver to reload.
>> The error registers are sticky and the values are maintained through a
>> warm reset. We read these registers during the boot flow of the driver and
>> increment the respective error counters.
>>
>> Bspec: 53076
> please mention 50875 as well.
sure.
>> Cc: Rodrigo Vivi <rodrigo.vivi at intel.com>
>> Cc: Aravind Iddamsetty <aravind.iddamsetty at intel.com>
>> Cc: Matthew Brost <matthew.brost at intel.com>
>> Cc: Matt Roper <matthew.d.roper at intel.com>
>> Cc: Joonas Lahtinen <joonas.lahtinen at linux.intel.com>
>> Signed-off-by: Himal Prasad Ghimiray <himal.prasad.ghimiray at intel.com>
>> ---
>>   drivers/gpu/drm/xe/regs/xe_regs.h |  3 +++
>>   drivers/gpu/drm/xe/xe_hw_error.c  | 37 ++++++++++++++++++++++++++++++-
>>   drivers/gpu/drm/xe/xe_hw_error.h  |  3 ++-
>>   drivers/gpu/drm/xe/xe_irq.c       |  2 +-
>>   4 files changed, 42 insertions(+), 3 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/xe/regs/xe_regs.h b/drivers/gpu/drm/xe/regs/xe_regs.h
>> index e223975a5acf..b8f2b1762d3f 100644
>> --- a/drivers/gpu/drm/xe/regs/xe_regs.h
>> +++ b/drivers/gpu/drm/xe/regs/xe_regs.h
>> @@ -90,5 +90,8 @@
>>   #define   GT_DW_IRQ(x)				REG_BIT(x)
>>   #define   XE_ERROR_IRQ(x)			REG_BIT(26 + (x))
>>   
>> +#define DEV_PCIEERR_STATUS			XE_REG(0x100180)
>> +#define   DEV_PCIEERR_IS_FATAL(x)		REG_BIT(x * 4 + 2)
> please place this definition in order of its register address.
>> +
>>   #define PVC_RP_STATE_CAP			XE_REG(0x281014)
>>   #endif
>> diff --git a/drivers/gpu/drm/xe/xe_hw_error.c b/drivers/gpu/drm/xe/xe_hw_error.c
>> index deb020a509d2..9595e3369656 100644
>> --- a/drivers/gpu/drm/xe/xe_hw_error.c
>> +++ b/drivers/gpu/drm/xe/xe_hw_error.c
>> @@ -183,7 +183,7 @@ static const struct err_msg_cntr_pair err_stat_gt_correctable_vectr_reg[] = {
>>   	[2 ... 3]         = {"L3BANK",		XE_GT_HW_ERR_L3BANK_CORR},
>>   };
>>   
>> -void xe_assign_hw_err_regs(struct xe_device *xe)
>> +static void xe_assign_hw_err_regs(struct xe_device *xe)
> change it in the patch that introduced it.
>>   {
>>   	const struct err_msg_cntr_pair **dev_err_stat = xe->hw_err_regs.dev_err_stat;
>>   	const struct err_msg_cntr_pair **err_stat_gt = xe->hw_err_regs.err_stat_gt;
>> @@ -417,3 +417,38 @@ xe_hw_error_irq_handler(struct xe_tile *tile, const u32 master_ctl)
>>   			xe_hw_error_source_handler(tile, hw_err);
>>   	}
>>   }
>> +
>> +/**
>> + * process_hw_errors - checks for the occurrence of HW errors
>> + *
>> + * This checks for the HW Errors including FATAL errors that might
>> + * have occurred in the previous boot of the driver which will
>> + * initiate PCIe FLR reset of the device and cause the
> this is not right; in fact, it would be better to rephrase it as:
>
> "fatal will result in a card warm reset and driver will be reloaded."
ok
>
>> + * driver to reload.
>> + */
>> +void xe_process_hw_errors(struct xe_device *xe)
>> +{
>> +	struct xe_tile *root_tile = xe_device_get_root_tile(xe);
>> +	struct xe_gt *root_mmio = root_tile->primary_gt;
>> +
>> +	u32 dev_pcieerr_status, master_ctl;
>> +	struct xe_tile *tile;
>> +	int i;
>> +
>> +	xe_assign_hw_err_regs(xe);
> let's still keep this in xe_irq_install, as it serves a different purpose.
ok
>> +
>> +	dev_pcieerr_status = xe_mmio_read32(root_mmio, DEV_PCIEERR_STATUS);
>> +
>> +	for_each_tile(tile, xe, i) {
>> +		struct xe_gt *mmio = tile->primary_gt;
>> +
>> +		if (dev_pcieerr_status & DEV_PCIEERR_IS_FATAL(i))
>> +			xe_hw_error_source_handler(tile, HARDWARE_ERROR_FATAL);
>> +
>> +		master_ctl = xe_mmio_read32(mmio, GFX_MSTR_IRQ);
>> +		xe_hw_error_irq_handler(tile, master_ctl);
>> +		xe_mmio_write32(mmio, GFX_MSTR_IRQ, master_ctl);
>> +	}
>> +	if (dev_pcieerr_status)
>> +		xe_mmio_write32(root_mmio, DEV_PCIEERR_STATUS, dev_pcieerr_status);
>> +}
>> diff --git a/drivers/gpu/drm/xe/xe_hw_error.h b/drivers/gpu/drm/xe/xe_hw_error.h
>> index 3fcbbcc338fe..2812407dd4bf 100644
>> --- a/drivers/gpu/drm/xe/xe_hw_error.h
>> +++ b/drivers/gpu/drm/xe/xe_hw_error.h
>> @@ -104,5 +104,6 @@ struct xe_device;
>>   struct xe_tile;
>>   
>>   void xe_hw_error_irq_handler(struct xe_tile *tile, const u32 master_ctl);
>> -void xe_assign_hw_err_regs(struct xe_device *xe);
>> +void xe_process_hw_errors(struct xe_device *xe);
>> +
>>   #endif
>> diff --git a/drivers/gpu/drm/xe/xe_irq.c b/drivers/gpu/drm/xe/xe_irq.c
>> index 48b933234342..be152ebd6ce9 100644
>> --- a/drivers/gpu/drm/xe/xe_irq.c
>> +++ b/drivers/gpu/drm/xe/xe_irq.c
>> @@ -573,7 +573,7 @@ int xe_irq_install(struct xe_device *xe)
>>   		return -EINVAL;
>>   	}
>>   
>> -	xe_assign_hw_err_regs(xe);
>> +	xe_process_hw_errors(xe);
> this should be called before xe_irq_reset.
>>   
>>   	xe->irq.enabled = true;
>>   
> Thanks,
> Aravind.