[Intel-xe] [PATCH v5 3/4] drm/xe: Support GT hardware error reporting for PVC.
Ghimiray, Himal Prasad
himal.prasad.ghimiray at intel.com
Tue Sep 26 05:11:55 UTC 2023
On 26-09-2023 09:51, Aravind Iddamsetty wrote:
> On 23/08/23 14:28, Himal Prasad Ghimiray wrote:
>> PVC supports GT error reporting via vector registers alongwith
>> error status register. Add support to report these errors and
>> update respective counters.
>> Incase of Subslice error reported by vector register, process the
>> error status register for applicable bits.
>>
>> Bspec: 54179, 54177, 53088, 53089
>>
>> Cc: Rodrigo Vivi <rodrigo.vivi at intel.com>
>> Cc: Aravind Iddamsetty <aravind.iddamsetty at intel.com>
>> Cc: Matthew Brost <matthew.brost at intel.com>
>> Cc: Matt Roper <matthew.d.roper at intel.com>
>> Cc: Joonas Lahtinen <joonas.lahtinen at linux.intel.com>
>> Signed-off-by: Himal Prasad Ghimiray <himal.prasad.ghimiray at intel.com>
>> ---
>> drivers/gpu/drm/xe/regs/xe_gt_error_regs.h | 16 +++
>> drivers/gpu/drm/xe/xe_hw_error.c | 122 ++++++++++++++++++++-
>> drivers/gpu/drm/xe/xe_hw_error.h | 20 ++++
>> 3 files changed, 154 insertions(+), 4 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/xe/regs/xe_gt_error_regs.h b/drivers/gpu/drm/xe/regs/xe_gt_error_regs.h
>> index 6180704a6149..39ea87914465 100644
>> --- a/drivers/gpu/drm/xe/regs/xe_gt_error_regs.h
>> +++ b/drivers/gpu/drm/xe/regs/xe_gt_error_regs.h
>> @@ -10,4 +10,20 @@
>> #define ERR_STAT_GT_REG(x) XE_REG(_PICK_EVEN((x), \
>> _ERR_STAT_GT_COR, \
>> _ERR_STAT_GT_NONFATAL))
>> +
>> +#define _ERR_STAT_GT_COR_VCTR_0 0x1002a0
>> +#define _ERR_STAT_GT_COR_VCTR_1 0x1002a4
>> +#define ERR_STAT_GT_COR_VCTR_REG(x) XE_REG(_PICK_EVEN((x), \
>> + _ERR_STAT_GT_COR_VCTR_0, \
>> + _ERR_STAT_GT_COR_VCTR_1))
>> +
>> +#define _ERR_STAT_GT_FATAL_VCTR_0 0x100260
>> +#define _ERR_STAT_GT_FATAL_VCTR_1 0x100264
> the registers shall be defined in the ascending order of their addresses.
sure.
>> +#define ERR_STAT_GT_FATAL_VCTR_REG(x) XE_REG(_PICK_EVEN((x), \
>> + _ERR_STAT_GT_FATAL_VCTR_0, \
>> + _ERR_STAT_GT_FATAL_VCTR_1))
>> +
>> +#define ERR_STAT_GT_VCTR_REG(hw_err, x) (hw_err == HARDWARE_ERROR_CORRECTABLE ? \
>> + ERR_STAT_GT_COR_VCTR_REG(x) : \
>> + ERR_STAT_GT_FATAL_VCTR_REG(x))
>> #endif
>> diff --git a/drivers/gpu/drm/xe/xe_hw_error.c b/drivers/gpu/drm/xe/xe_hw_error.c
>> index 10aad0c396fb..deb020a509d2 100644
>> --- a/drivers/gpu/drm/xe/xe_hw_error.c
>> +++ b/drivers/gpu/drm/xe/xe_hw_error.c
>> @@ -148,6 +148,41 @@ static const struct err_msg_cntr_pair err_stat_gt_correctable_reg[] = {
>> [16 ... 31] = {"Undefined", XE_GT_HW_ERR_UNKNOWN_CORR},
>> };
>>
>> +static const struct err_msg_cntr_pair pvc_err_stat_gt_fatal_reg[] = {
>> + [0 ... 2] = {"Undefined", XE_GT_HW_ERR_UNKNOWN_FATAL},
>> + [3] = {"FPU", XE_GT_HW_ERR_FPU_FATAL},
>> + [4 ... 5] = {"Undefined", XE_GT_HW_ERR_UNKNOWN_FATAL},
>> + [6] = {"GUC SRAM", XE_GT_HW_ERR_GUC_FATAL},
>> + [7 ... 12] = {"Undefined", XE_GT_HW_ERR_UNKNOWN_FATAL},
>> + [13] = {"SLM", XE_GT_HW_ERR_SLM_FATAL},
>> + [14] = {"Undefined", XE_GT_HW_ERR_UNKNOWN_FATAL},
>> + [15] = {"EU GRF", XE_GT_HW_ERR_EU_GRF_FATAL},
>> + [16 ... 31] = {"Undefined", XE_GT_HW_ERR_UNKNOWN_FATAL},
>> +};
> let all fatal definitions be moved into the patch in which fatal is processed
Squashing patch 4 with pacth 1.
>> +
>> +static const struct err_msg_cntr_pair pvc_err_stat_gt_correctable_reg[] = {
>> + [0] = {"Undefined", XE_GT_HW_ERR_UNKNOWN_CORR},
>> + [1] = {"SINGLE BIT GUC SRAM", XE_GT_HW_ERR_GUC_CORR},
>> + [2 ... 12] = {"Undefined", XE_GT_HW_ERR_UNKNOWN_CORR},
>> + [13] = {"SINGLE BIT SLM", XE_GT_HW_ERR_SLM_CORR},
>> + [14] = {"SINGLE BIT EU IC", XE_GT_HW_ERR_EU_IC_CORR},
>> + [15] = {"SINGLE BIT EU GRF", XE_GT_HW_ERR_EU_GRF_CORR},
>> + [16 ... 31] = {"Undefined", XE_GT_HW_ERR_UNKNOWN_CORR},
>> +};
>> +
>> +static const struct err_msg_cntr_pair err_stat_gt_fatal_vectr_reg[] = {
>> + [0 ... 1] = {"SUBSLICE", XE_GT_HW_ERR_SUBSLICE_FATAL},
>> + [2 ... 3] = {"L3BANK", XE_GT_HW_ERR_L3BANK_FATAL},
>> + [4 ... 5] = {"Undefined", XE_GT_HW_ERR_UNKNOWN_FATAL},
>> + [6] = {"TLB", XE_GT_HW_ERR_TLB_FATAL},
>> + [7] = {"L3 FABRIC", XE_GT_HW_ERR_L3_FABRIC_FATAL},
>> +};
>> +
>> +static const struct err_msg_cntr_pair err_stat_gt_correctable_vectr_reg[] = {
>> + [0 ... 1] = {"SUBSLICE", XE_GT_HW_ERR_SUBSLICE_CORR},
>> + [2 ... 3] = {"L3BANK", XE_GT_HW_ERR_L3BANK_CORR},
>> +};
>> +
>> void xe_assign_hw_err_regs(struct xe_device *xe)
>> {
>> const struct err_msg_cntr_pair **dev_err_stat = xe->hw_err_regs.dev_err_stat;
>> @@ -164,6 +199,8 @@ void xe_assign_hw_err_regs(struct xe_device *xe)
>> dev_err_stat[HARDWARE_ERROR_CORRECTABLE] = pvc_err_stat_correctable_reg;
>> dev_err_stat[HARDWARE_ERROR_NONFATAL] = pvc_err_stat_nonfatal_reg;
>> dev_err_stat[HARDWARE_ERROR_FATAL] = pvc_err_stat_fatal_reg;
>> + err_stat_gt[HARDWARE_ERROR_CORRECTABLE] = pvc_err_stat_gt_correctable_reg;
>> + err_stat_gt[HARDWARE_ERROR_FATAL] = pvc_err_stat_gt_fatal_reg;
>> } else {
>> /* For other platforms report only GT errors */
>> dev_err_stat[HARDWARE_ERROR_CORRECTABLE] = dev_err_stat_correctable_reg;
>> @@ -176,7 +213,7 @@ void xe_assign_hw_err_regs(struct xe_device *xe)
>> }
>>
>> static void
>> -xe_gt_hw_error_handler(struct xe_gt *gt, const enum hardware_error hw_err)
>> +xe_gt_hw_error_status_reg_handler(struct xe_gt *gt, const enum hardware_error hw_err)
> is xe_gt_hw_error_log_status_reg sounding better ? and have this in the earlier patch
Ok
>> {
>> const char *hw_err_str = hardware_error_type_to_str(hw_err);
>> const struct err_msg_cntr_pair *errstat;
>> @@ -186,9 +223,6 @@ xe_gt_hw_error_handler(struct xe_gt *gt, const enum hardware_error hw_err)
>> u32 indx;
>> u32 errbit;
>>
>> - if (gt_to_xe(gt)->info.platform == XE_PVC)
>> - return;
>> -
>> lockdep_assert_held(>_to_xe(gt)->irq.lock);
>> err_regs = >_to_xe(gt)->hw_err_regs;
>> errsrc = xe_mmio_read32(gt, ERR_STAT_GT_REG(hw_err));
>> @@ -230,6 +264,86 @@ xe_gt_hw_error_handler(struct xe_gt *gt, const enum hardware_error hw_err)
>> clear_reg: xe_mmio_write32(gt, ERR_STAT_GT_REG(hw_err), errsrc);
>> }
>>
>> +static void
>> +xe_gt_hw_error_vectr_reg_handler(struct xe_gt *gt, const enum hardware_error hw_err)
> similarly, xe_gt_hw_error_log_vector_reg
ok
>> +{
>> + const char *hw_err_str = hardware_error_type_to_str(hw_err);
>> + const struct err_msg_cntr_pair *errvctr;
>> + const char *errmsg;
>> + bool errstat_read;
>> + u32 num_vctr_reg;
>> + u32 indx;
>> + u32 vctr;
>> + u32 i;
>> +
>> + switch (hw_err) {
>> + case HARDWARE_ERROR_FATAL:
>> + num_vctr_reg = ERR_STAT_GT_FATAL_VCTR_LEN;
>> + errvctr = err_stat_gt_fatal_vectr_reg;
> why don't we define registers once like it's done in xe_assign_hw_err_regs.
>> + break;
>> + case HARDWARE_ERROR_NONFATAL:
>> + /* The GT Non Fatal Error Status Register has only reserved bits
>> + * Nothing to service.
>> + */
>> + drm_err_ratelimited(>_to_xe(gt)->drm, HW_ERR "GT%d detected %s error\n",
>> + gt->info.id, hw_err_str);
>> + return;
>> + case HARDWARE_ERROR_CORRECTABLE:
>> + num_vctr_reg = ERR_STAT_GT_COR_VCTR_LEN;
>> + errvctr = err_stat_gt_correctable_vectr_reg;
>> + break;
>> + default:
>> + return;
>> + }
>> +
>> + errstat_read = false;
>> +
>> + for (i = 0 ; i < num_vctr_reg; i++) {
>> + vctr = xe_mmio_read32(gt, ERR_STAT_GT_VCTR_REG(hw_err, i));
>> + if (!vctr)
>> + continue;
>> +
>> + errmsg = errvctr[i].errmsg;
>> + indx = errvctr[i].cntr_indx;
>> +
>> + if (hw_err == HARDWARE_ERROR_FATAL)
>> + drm_err_ratelimited(>_to_xe(gt)->drm, HW_ERR
>> + "GT%d detected %s %s error. ERR_VECT_GT_%s[%d]:0x%08x\n",
>> + gt->info.id, errmsg, hw_err_str, hw_err_str, i, vctr);
>> + else
>> + drm_warn(>_to_xe(gt)->drm, HW_ERR
>> + "GT%d detected %s %s error. ERR_VECT_GT_%s[%d]:0x%08x\n",
>> + gt->info.id, errmsg, hw_err_str, hw_err_str, i, vctr);
>> +
>> + if (i < ERR_STAT_GT_VCTR4)
>> + gt->errors.count[indx] += hweight32(vctr);
>> +
>> + if (i == ERR_STAT_GT_VCTR6)
>> + gt->errors.count[indx] += hweight16(vctr);
>> +
>> + if (i == ERR_STAT_GT_VCTR7)
>> + gt->errors.count[indx] += hweight8(vctr);
>> +
>> + if (i < ERR_STAT_GT_VCTR2 && !errstat_read) {
>> + xe_gt_hw_error_status_reg_handler(gt, hw_err);
>> + errstat_read = true;
> what is the need of errstat_read, isn't i < ERR_STAT_GT_VCTR2 sufficient
In case we see errors from both vect0 and vect1. We shouldn't be
servicing error status register twice.
>
> and also instead of if check for every i we can have switch case
>
> switch (i) {
>
> case ERR_STAT_GT_VCTR0:
> case ERR_STAT_GT_VCTR1:
> case ERR_STAT_GT_VCTR2:
> case ERR_STAT_GT_VCTR3:
>
> gt->errors.count[indx] += hweight32(vctr);
>
> if ( i < ERR_STAT_GT_VCTR2)
>
> xe_gt_hw_error_status_reg_handler(gt, hw_err);
>
> break;
>
> case ERR_STAT_GT_VCTR6:
> case ERR_STAT_GT_VCTR7:
>
> gt->errors.count[indx] += (i == ERR_STAT_GT_VCTR6) ? hweight16(vctr) : hweight8(vctr);
> break;
>
> default:
Looks clean. Will use this.
>
> }
>
>> + }
>> +
>> + xe_mmio_write32(gt, ERR_STAT_GT_VCTR_REG(hw_err, i), vctr);
>> + }
>> +}
>> +
>> +static void
>> +xe_gt_hw_error_handler(struct xe_gt *gt, const enum hardware_error hw_err)
>> +{
>> + lockdep_assert_held(>_to_xe(gt)->irq.lock);
>> +
>> + if (gt_to_xe(gt)->info.platform == XE_PVC)
>> + xe_gt_hw_error_vectr_reg_handler(gt, hw_err);
>> + else
>> + xe_gt_hw_error_status_reg_handler(gt, hw_err);
>> +}
>> +
>> static void
>> xe_hw_error_source_handler(struct xe_tile *tile, const enum hardware_error hw_err)
>> {
>> diff --git a/drivers/gpu/drm/xe/xe_hw_error.h b/drivers/gpu/drm/xe/xe_hw_error.h
>> index 82c947247c27..3fcbbcc338fe 100644
>> --- a/drivers/gpu/drm/xe/xe_hw_error.h
>> +++ b/drivers/gpu/drm/xe/xe_hw_error.h
>> @@ -51,8 +51,21 @@ enum xe_tile_hw_errors {
>> XE_TILE_HW_ERROR_MAX,
>> };
>>
>> +enum gt_vctr_registers {
>> + ERR_STAT_GT_VCTR0 = 0,
>> + ERR_STAT_GT_VCTR1,
>> + ERR_STAT_GT_VCTR2,
>> + ERR_STAT_GT_VCTR3,
>> + ERR_STAT_GT_VCTR4,
>> + ERR_STAT_GT_VCTR5,
>> + ERR_STAT_GT_VCTR6,
>> + ERR_STAT_GT_VCTR7,
>> +};
>> +
>> /* Count of GT Correctable and FATAL HW ERRORS */
>> enum xe_gt_hw_errors {
>> + XE_GT_HW_ERR_SUBSLICE_CORR,
>> + XE_GT_HW_ERR_L3BANK_CORR,
>> XE_GT_HW_ERR_L3_SNG_CORR,
>> XE_GT_HW_ERR_GUC_CORR,
>> XE_GT_HW_ERR_SAMPLER_CORR,
>> @@ -60,6 +73,8 @@ enum xe_gt_hw_errors {
>> XE_GT_HW_ERR_EU_IC_CORR,
>> XE_GT_HW_ERR_EU_GRF_CORR,
>> XE_GT_HW_ERR_UNKNOWN_CORR,
>> + XE_GT_HW_ERR_SUBSLICE_FATAL,
>> + XE_GT_HW_ERR_L3BANK_FATAL,
>> XE_GT_HW_ERR_ARR_BIST_FATAL,
>> XE_GT_HW_ERR_FPU_FATAL,
>> XE_GT_HW_ERR_L3_DOUB_FATAL,
>> @@ -71,10 +86,15 @@ enum xe_gt_hw_errors {
>> XE_GT_HW_ERR_SLM_FATAL,
>> XE_GT_HW_ERR_EU_IC_FATAL,
>> XE_GT_HW_ERR_EU_GRF_FATAL,
>> + XE_GT_HW_ERR_TLB_FATAL,
>> + XE_GT_HW_ERR_L3_FABRIC_FATAL,
>> XE_GT_HW_ERR_UNKNOWN_FATAL,
>> XE_GT_HW_ERROR_MAX,
>> };
>>
>> +#define ERR_STAT_GT_COR_VCTR_LEN (4)
>> +#define ERR_STAT_GT_FATAL_VCTR_LEN (8)
>> +
>> struct err_msg_cntr_pair {
>> const char *errmsg;
>> const u32 cntr_indx;
> Thanks,
> Aravind.
More information about the Intel-xe
mailing list