[Intel-xe] [PATCH 11/11] drm/xe: Clear all SoC errors post warm reset.

Wed Oct 11 06:59:39 UTC 2023

On 11-10-2023 12:26, Aravind Iddamsetty wrote:
> On 27/09/23 17:16, Himal Prasad Ghimiray wrote:
>> There are scenarios where there are no fatal errors reported
>> but Non-fatal/correctable errors being reported from the SoC
>> uncore to IEH and not propagated to SG unit. Clear all previous
>> SoC errors post warm reset.
> The commit message is not very clear: how is fatal error reporting related to the other errors?
Will rephrase it as:

There are scenarios where errors are reported from the SoC uncore to the
IEH but are not propagated to the SG unit. Since these errors are not
propagated to the SG unit, the driver won't be able to clear them as part
of xe_process_hw_errors(). Hence, clear all SoC error registers after
xe_process_hw_errors().

Is it ok ?

>
> Thanks,
> Aravind.
>> Signed-off-by: Himal Prasad Ghimiray <himal.prasad.ghimiray at intel.com>
>> ---
>>   drivers/gpu/drm/xe/xe_hw_error.c | 37 ++++++++++++++++++++++++++++++++
>>   drivers/gpu/drm/xe/xe_hw_error.h |  1 +
>>   drivers/gpu/drm/xe/xe_irq.c      |  1 +
>>   3 files changed, 39 insertions(+)
>>
>> diff --git a/drivers/gpu/drm/xe/xe_hw_error.c b/drivers/gpu/drm/xe/xe_hw_error.c
>> index 0bcb1bea7ffb..a777c887a7be 100644
>> --- a/drivers/gpu/drm/xe/xe_hw_error.c
>> +++ b/drivers/gpu/drm/xe/xe_hw_error.c
>> @@ -366,6 +366,43 @@ static void xe_assign_hw_err_regs(struct xe_device *xe)
>>   	}
>>   }
>>   
>> +/*
>> + * Clear stale SoC error status (global/local, master/slave) on every tile.
>> + */
>> +void xe_clear_all_soc_errors(struct xe_device *xe)
>> +{
>> +	enum hardware_error hw_err;
>> +	u32 base, slave_base;
>> +	struct xe_tile *tile;
>> +	struct xe_gt *gt;
>> +	unsigned int i, ieh;
>> +
>> +	base = SOC_PVC_BASE;
>> +	slave_base = SOC_PVC_SLAVE_BASE;
>> +
>> +	for_each_tile(tile, xe, i) {
>> +		gt = tile->primary_gt;
>> +		/* Restart at the first severity per tile; "ieh" leaves tile index "i" alone */
>> +		for (hw_err = HARDWARE_ERROR_CORRECTABLE; hw_err < HARDWARE_ERROR_MAX; hw_err++) {
>> +			for (ieh = 0; ieh < PVC_NUM_IEH; ieh++)
>> +				xe_mmio_write32(gt, SOC_GSYSEVTCTL_REG(base, slave_base, ieh),
>> +						~REG_BIT(hw_err));
>> +
>> +			xe_mmio_write32(gt, SOC_GLOBAL_ERR_STAT_MASTER_REG(base, hw_err),
>> +					REG_GENMASK(31, 0));
>> +			xe_mmio_write32(gt, SOC_LOCAL_ERR_STAT_MASTER_REG(base, hw_err),
>> +					REG_GENMASK(31, 0));
>> +			xe_mmio_write32(gt, SOC_GLOBAL_ERR_STAT_SLAVE_REG(slave_base, hw_err),
>> +					REG_GENMASK(31, 0));
>> +			xe_mmio_write32(gt, SOC_LOCAL_ERR_STAT_SLAVE_REG(slave_base, hw_err),
>> +					REG_GENMASK(31, 0));
>> +		}
>> +		for (ieh = 0; ieh < PVC_NUM_IEH; ieh++)
>> +			xe_mmio_write32(gt, SOC_GSYSEVTCTL_REG(base, slave_base, ieh),
>> +					(HARDWARE_ERROR_MAX << 1) + 1);
>> +	}
>> +}
>> +
>>   static void
>>   xe_gt_hw_error_status_reg_handler(struct xe_gt *gt, const enum hardware_error hw_err)
>>   {
>> diff --git a/drivers/gpu/drm/xe/xe_hw_error.h b/drivers/gpu/drm/xe/xe_hw_error.h
>> index a458a90b34a2..7ada7c97c939 100644
>> --- a/drivers/gpu/drm/xe/xe_hw_error.h
>> +++ b/drivers/gpu/drm/xe/xe_hw_error.h
>> @@ -219,4 +219,5 @@ struct xe_tile;
>>   void xe_hw_error_irq_handler(struct xe_tile *tile, const u32 master_ctl);
>>   void xe_process_hw_errors(struct xe_device *xe);
>>   void xe_gsc_hw_error_work(struct work_struct *work);
>> +void xe_clear_all_soc_errors(struct xe_device *xe);
>>   #endif
>> diff --git a/drivers/gpu/drm/xe/xe_irq.c b/drivers/gpu/drm/xe/xe_irq.c
>> index 285c657cc789..42a6bb45acba 100644
>> --- a/drivers/gpu/drm/xe/xe_irq.c
>> +++ b/drivers/gpu/drm/xe/xe_irq.c
>> @@ -597,6 +597,7 @@ int xe_irq_install(struct xe_device *xe)
>>   	}
>>   
>>   	xe_process_hw_errors(xe);
>> +	xe_clear_all_soc_errors(xe);
>>   
>>   	xe->irq.enabled = true;
>>