[Intel-xe] [PATCH v7 04/12] drm/xe: Support GT hardware error reporting for PVC.
Himal Prasad Ghimiray
himal.prasad.ghimiray at intel.com
Fri Oct 20 04:55:25 UTC 2023
PVC supports GT error reporting via vector registers along with the
error status register. Add support to report these errors and
update the respective counters.
In case of a subslice error reported by a vector register, process the
error status register for the applicable bits.
Bspec: 54179, 54177, 53088, 53089
v6
- Define registers in ascending order of their addresses.
- use xe_gt_hw_error_log_vector_reg instead of
xe_gt_hw_error_vectr_reg_handler.
- use xe_assign_hw_err_regs for reg initialization.
- use switch-case instead of if-else.
v7
- Use all vctr for correctable too.
- Use helper functions to log gt hardware errors. (Aravind)
Cc: Rodrigo Vivi <rodrigo.vivi at intel.com>
Cc: Aravind Iddamsetty <aravind.iddamsetty at linux.intel.com>
Cc: Matthew Brost <matthew.brost at intel.com>
Cc: Matt Roper <matthew.d.roper at intel.com>
Cc: Joonas Lahtinen <joonas.lahtinen at linux.intel.com>
Reviewed-by: Aravind Iddamsetty <aravind.iddamsetty at linux.intel.com>
Signed-off-by: Himal Prasad Ghimiray <himal.prasad.ghimiray at intel.com>
---
drivers/gpu/drm/xe/regs/xe_gt_error_regs.h | 16 +++
drivers/gpu/drm/xe/xe_device_types.h | 1 +
drivers/gpu/drm/xe/xe_hw_error.c | 124 ++++++++++++++++++++-
drivers/gpu/drm/xe/xe_hw_error.h | 19 ++++
4 files changed, 158 insertions(+), 2 deletions(-)
diff --git a/drivers/gpu/drm/xe/regs/xe_gt_error_regs.h b/drivers/gpu/drm/xe/regs/xe_gt_error_regs.h
index 6180704a6149..59631c2e8e12 100644
--- a/drivers/gpu/drm/xe/regs/xe_gt_error_regs.h
+++ b/drivers/gpu/drm/xe/regs/xe_gt_error_regs.h
@@ -10,4 +10,20 @@
#define ERR_STAT_GT_REG(x) XE_REG(_PICK_EVEN((x), \
_ERR_STAT_GT_COR, \
_ERR_STAT_GT_NONFATAL))
+
+#define _ERR_STAT_GT_FATAL_VCTR_0 0x100260
+#define _ERR_STAT_GT_FATAL_VCTR_1 0x100264
+#define ERR_STAT_GT_FATAL_VCTR_REG(x) XE_REG(_PICK_EVEN((x), \
+ _ERR_STAT_GT_FATAL_VCTR_0, \
+ _ERR_STAT_GT_FATAL_VCTR_1))
+
+#define _ERR_STAT_GT_COR_VCTR_0 0x1002a0
+#define _ERR_STAT_GT_COR_VCTR_1 0x1002a4
+#define ERR_STAT_GT_COR_VCTR_REG(x) XE_REG(_PICK_EVEN((x), \
+ _ERR_STAT_GT_COR_VCTR_0, \
+ _ERR_STAT_GT_COR_VCTR_1))
+
+#define ERR_STAT_GT_VCTR_REG(hw_err, x) (hw_err == HARDWARE_ERROR_CORRECTABLE ? \
+ ERR_STAT_GT_COR_VCTR_REG(x) : \
+ ERR_STAT_GT_FATAL_VCTR_REG(x))
#endif
diff --git a/drivers/gpu/drm/xe/xe_device_types.h b/drivers/gpu/drm/xe/xe_device_types.h
index 675cf0c00be2..4624f2f53b26 100644
--- a/drivers/gpu/drm/xe/xe_device_types.h
+++ b/drivers/gpu/drm/xe/xe_device_types.h
@@ -415,6 +415,7 @@ struct xe_device {
struct hardware_errors_regs {
const struct err_name_index_pair *dev_err_stat[HARDWARE_ERROR_MAX];
const struct err_name_index_pair *err_stat_gt[HARDWARE_ERROR_MAX];
+ const struct err_name_index_pair *err_vctr_gt[HARDWARE_ERROR_MAX];
} hw_err_regs;
/* private: */
diff --git a/drivers/gpu/drm/xe/xe_hw_error.c b/drivers/gpu/drm/xe/xe_hw_error.c
index c4bc24a35231..c94825b9c1fe 100644
--- a/drivers/gpu/drm/xe/xe_hw_error.c
+++ b/drivers/gpu/drm/xe/xe_hw_error.c
@@ -131,10 +131,47 @@ static const struct err_name_index_pair dg2_stat_gt_correctable_reg[] = {
[16 ... 31] = {"Undefined", XE_HW_ERR_GT_CORR_UNKNOWN},
};
+static const struct err_name_index_pair pvc_err_stat_gt_fatal_reg[] = {
+ [0 ... 2] = {"Undefined", XE_HW_ERR_GT_FATAL_UNKNOWN},
+ [3] = {"FPU", XE_HW_ERR_GT_FATAL_FPU},
+ [4 ... 5] = {"Undefined", XE_HW_ERR_GT_FATAL_UNKNOWN},
+ [6] = {"GUC SRAM", XE_HW_ERR_GT_FATAL_GUC},
+ [7 ... 12] = {"Undefined", XE_HW_ERR_GT_FATAL_UNKNOWN},
+ [13] = {"SLM", XE_HW_ERR_GT_FATAL_SLM},
+ [14] = {"Undefined", XE_HW_ERR_GT_FATAL_UNKNOWN},
+ [15] = {"EU GRF", XE_HW_ERR_GT_FATAL_EU_GRF},
+ [16 ... 31] = {"Undefined", XE_HW_ERR_GT_FATAL_UNKNOWN},
+};
+
+static const struct err_name_index_pair pvc_err_stat_gt_correctable_reg[] = {
+ [0] = {"Undefined", XE_HW_ERR_GT_CORR_UNKNOWN},
+ [1] = {"SINGLE BIT GUC SRAM", XE_HW_ERR_GT_CORR_GUC},
+ [2 ... 12] = {"Undefined", XE_HW_ERR_GT_CORR_UNKNOWN},
+ [13] = {"SINGLE BIT SLM", XE_HW_ERR_GT_CORR_SLM},
+ [14] = {"SINGLE BIT EU IC", XE_HW_ERR_GT_CORR_EU_IC},
+ [15] = {"SINGLE BIT EU GRF", XE_HW_ERR_GT_CORR_EU_GRF},
+ [16 ... 31] = {"Undefined", XE_HW_ERR_GT_CORR_UNKNOWN},
+};
+
+static const struct err_name_index_pair pvc_err_vectr_gt_fatal_reg[] = {
+ [0 ... 1] = {"SUBSLICE", XE_HW_ERR_GT_FATAL_SUBSLICE},
+ [2 ... 3] = {"L3BANK", XE_HW_ERR_GT_FATAL_L3BANK},
+ [4 ... 5] = {"Undefined", XE_HW_ERR_GT_FATAL_UNKNOWN},
+ [6] = {"TLB", XE_HW_ERR_GT_FATAL_TLB},
+ [7] = {"L3 FABRIC", XE_HW_ERR_GT_FATAL_L3_FABRIC},
+};
+
+static const struct err_name_index_pair pvc_err_vectr_gt_correctable_reg[] = {
+ [0 ... 1] = {"SUBSLICE", XE_HW_ERR_GT_CORR_SUBSLICE},
+ [2 ... 3] = {"L3BANK", XE_HW_ERR_GT_CORR_L3BANK},
+ [4 ... 7] = {"Undefined", XE_HW_ERR_GT_CORR_UNKNOWN},
+};
+
void xe_assign_hw_err_regs(struct xe_device *xe)
{
const struct err_name_index_pair **dev_err_stat = xe->hw_err_regs.dev_err_stat;
const struct err_name_index_pair **err_stat_gt = xe->hw_err_regs.err_stat_gt;
+ const struct err_name_index_pair **err_vctr_gt = xe->hw_err_regs.err_vctr_gt;
/* Error reporting is supported only for DG2 and PVC currently. */
if (xe->info.platform == XE_DG2) {
@@ -149,6 +186,10 @@ void xe_assign_hw_err_regs(struct xe_device *xe)
dev_err_stat[HARDWARE_ERROR_CORRECTABLE] = pvc_err_stat_correctable_reg;
dev_err_stat[HARDWARE_ERROR_NONFATAL] = pvc_err_stat_nonfatal_reg;
dev_err_stat[HARDWARE_ERROR_FATAL] = pvc_err_stat_fatal_reg;
+ err_stat_gt[HARDWARE_ERROR_CORRECTABLE] = pvc_err_stat_gt_correctable_reg;
+ err_stat_gt[HARDWARE_ERROR_FATAL] = pvc_err_stat_gt_fatal_reg;
+ err_vctr_gt[HARDWARE_ERROR_CORRECTABLE] = pvc_err_vectr_gt_correctable_reg;
+ err_vctr_gt[HARDWARE_ERROR_FATAL] = pvc_err_vectr_gt_fatal_reg;
}
}
@@ -162,13 +203,14 @@ static bool xe_platform_has_ras(struct xe_device *xe)
}
static void
-xe_update_hw_error_cnt(struct drm_device *drm, struct xarray *hw_error, unsigned long index)
+xe_update_hw_error_cnt_with_value(struct drm_device *drm, struct xarray *hw_error,
+ unsigned long index, unsigned long val)
{
unsigned long flags;
void *entry;
entry = xa_load(hw_error, index);
- entry = xa_mk_value(xa_to_value(entry) + 1);
+ entry = xa_mk_value(xa_to_value(entry) + val);
xa_lock_irqsave(hw_error, flags);
if (xa_is_err(__xa_store(hw_error, index, entry, GFP_ATOMIC)))
@@ -177,6 +219,12 @@ xe_update_hw_error_cnt(struct drm_device *drm, struct xarray *hw_error, unsigned
xa_unlock_irqrestore(hw_error, flags);
}
+static void
+xe_update_hw_error_cnt(struct drm_device *drm, struct xarray *hw_error, unsigned long index)
+{
+ xe_update_hw_error_cnt_with_value(drm, hw_error, index, 1);
+}
+
static void
xe_gt_hw_error_log_status_reg(struct xe_gt *gt, const enum hardware_error hw_err)
{
@@ -188,6 +236,7 @@ xe_gt_hw_error_log_status_reg(struct xe_gt *gt, const enum hardware_error hw_err
u32 indx;
u32 errbit;
+ lockdep_assert_held(&gt_to_xe(gt)->irq.lock);
err_regs = &gt_to_xe(gt)->hw_err_regs;
errsrc = xe_mmio_read32(gt, ERR_STAT_GT_REG(hw_err));
if (!errsrc) {
@@ -224,6 +273,74 @@ xe_gt_hw_error_log_status_reg(struct xe_gt *gt, const enum hardware_error hw_err
xe_mmio_write32(gt, ERR_STAT_GT_REG(hw_err), errsrc);
}
+static void
+xe_gt_hw_error_log_vector_reg(struct xe_gt *gt, const enum hardware_error hw_err)
+{
+ const char *hw_err_str = hardware_error_type_to_str(hw_err);
+ const struct err_name_index_pair *errvctr;
+ struct hardware_errors_regs *err_regs;
+ const char *name;
+ bool errstat_read;
+ unsigned long val;
+ u32 num_vctr_reg;
+ u32 indx;
+ u32 vctr;
+ u32 i;
+
+ if (hw_err == HARDWARE_ERROR_NONFATAL) {
+ /* The GT Non Fatal Error Status Register has only reserved bits
+ * Nothing to service.
+ */
+ xe_gt_log_hw_err(gt, "%s error\n", hw_err_str);
+ return;
+ }
+
+ errstat_read = false;
+ num_vctr_reg = ERR_STAT_GT_VCTR_LEN;
+ err_regs = &gt_to_xe(gt)->hw_err_regs;
+ errvctr = err_regs->err_vctr_gt[hw_err];
+ for (i = 0 ; i < num_vctr_reg; i++) {
+ vctr = xe_mmio_read32(gt, ERR_STAT_GT_VCTR_REG(hw_err, i));
+ if (!vctr)
+ continue;
+
+ name = errvctr[i].name;
+ indx = errvctr[i].index;
+
+ if (hw_err == HARDWARE_ERROR_FATAL)
+ xe_gt_log_hw_err(gt, "%s %s error. ERR_VECT_GT_%s[%d]:0x%08x\n",
+ name, hw_err_str, hw_err_str, i, vctr);
+ else
+ xe_gt_log_hw_warn(gt, "%s %s error. ERR_VECT_GT_%s[%d]:0x%08x\n",
+ name, hw_err_str, hw_err_str, i, vctr);
+
+ switch (i) {
+ case ERR_STAT_GT_VCTR0:
+ case ERR_STAT_GT_VCTR1:
+ case ERR_STAT_GT_VCTR2:
+ case ERR_STAT_GT_VCTR3:
+ val = hweight32(vctr);
+ if (i < ERR_STAT_GT_VCTR2 && !errstat_read) {
+ xe_gt_hw_error_log_status_reg(gt, hw_err);
+ errstat_read = true;
+ }
+ xe_update_hw_error_cnt_with_value(&gt_to_xe(gt)->drm,
+ &gt->errors.hw_error, indx, val);
+ break;
+ case ERR_STAT_GT_VCTR6:
+ case ERR_STAT_GT_VCTR7:
+ val = (i == ERR_STAT_GT_VCTR6) ? hweight16(vctr) : hweight8(vctr);
+ xe_update_hw_error_cnt_with_value(&gt_to_xe(gt)->drm,
+ &gt->errors.hw_error, indx, val);
+ break;
+ default:
+ break;
+ }
+
+ xe_mmio_write32(gt, ERR_STAT_GT_VCTR_REG(hw_err, i), vctr);
+ }
+}
+
static void
xe_gt_hw_error_handler(struct xe_gt *gt, const enum hardware_error hw_err)
{
@@ -231,6 +348,9 @@ xe_gt_hw_error_handler(struct xe_gt *gt, const enum hardware_error hw_err)
if (gt_to_xe(gt)->info.platform == XE_DG2)
xe_gt_hw_error_log_status_reg(gt, hw_err);
+
+ if (gt_to_xe(gt)->info.platform == XE_PVC)
+ xe_gt_hw_error_log_vector_reg(gt, hw_err);
}
static void
diff --git a/drivers/gpu/drm/xe/xe_hw_error.h b/drivers/gpu/drm/xe/xe_hw_error.h
index e3f5379c341a..bea2bfaf90ad 100644
--- a/drivers/gpu/drm/xe/xe_hw_error.h
+++ b/drivers/gpu/drm/xe/xe_hw_error.h
@@ -10,6 +10,8 @@
#define XE_RAS_REG_SIZE 32
+#define ERR_STAT_GT_VCTR_LEN (8)
+
/* Error categories reported by hardware */
enum hardware_error {
HARDWARE_ERROR_CORRECTABLE = 0,
@@ -37,8 +39,21 @@ enum xe_tile_hw_errors {
XE_HW_ERR_TILE_CORR_UNKNOWN,
};
+enum gt_vctr_registers {
+ ERR_STAT_GT_VCTR0 = 0,
+ ERR_STAT_GT_VCTR1,
+ ERR_STAT_GT_VCTR2,
+ ERR_STAT_GT_VCTR3,
+ ERR_STAT_GT_VCTR4,
+ ERR_STAT_GT_VCTR5,
+ ERR_STAT_GT_VCTR6,
+ ERR_STAT_GT_VCTR7,
+};
+
/* Count of GT Correctable and FATAL HW ERRORS */
enum xe_gt_hw_errors {
+ XE_HW_ERR_GT_CORR_SUBSLICE,
+ XE_HW_ERR_GT_CORR_L3BANK,
XE_HW_ERR_GT_CORR_L3_SNG,
XE_HW_ERR_GT_CORR_GUC,
XE_HW_ERR_GT_CORR_SAMPLER,
@@ -46,6 +61,10 @@ enum xe_gt_hw_errors {
XE_HW_ERR_GT_CORR_EU_IC,
XE_HW_ERR_GT_CORR_EU_GRF,
XE_HW_ERR_GT_CORR_UNKNOWN,
+ XE_HW_ERR_GT_FATAL_SUBSLICE,
+ XE_HW_ERR_GT_FATAL_L3BANK,
+ XE_HW_ERR_GT_FATAL_TLB,
+ XE_HW_ERR_GT_FATAL_L3_FABRIC,
XE_HW_ERR_GT_FATAL_ARR_BIST,
XE_HW_ERR_GT_FATAL_FPU,
XE_HW_ERR_GT_FATAL_L3_DOUB,
--
2.25.1
More information about the Intel-xe
mailing list