[Intel-xe] [PATCH 1/1] drm/xe/RAS: Count SOC and SGUNIT errors
Himal Prasad Ghimiray
himal.prasad.ghimiray at intel.com
Thu Mar 16 04:05:43 UTC 2023
Count the SOC and SGUNIT hardware errors signaled by the relevant
interrupts and reported in the respective error status registers.
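The per-GT SOC counts are kept in an xarray keyed by SOC_ERR_INDEX(),
which packs the IEH instance, register group (local vs. global),
error severity and the raw error bit into a single index; SGUNIT
errors use a plain per-severity counter array. As an illustration
only, here is a standalone sketch that mirrors the macros this patch
adds to xe_gt_types.h (the program itself is not part of the patch,
and the severity value is an assumption noted in the comment):

#include <stdio.h>

#define SOC_HW_ERR_MAX_BITS 32
#define SOC_HW_ERR_SHIFT 5 /* ilog2(SOC_HW_ERR_MAX_BITS) */
#define HW_ERR_TYPE_BITS 2
#define REG_GROUP_BITS 1
#define REG_GROUP_SHIFT (HW_ERR_TYPE_BITS + SOC_HW_ERR_SHIFT)
#define IEH_SHIFT (REG_GROUP_SHIFT + REG_GROUP_BITS)
#define SOC_ERR_BIT (1UL << (IEH_SHIFT + 1))

#define SOC_ERR_INDEX(ieh, reg_group, hw_err, errbit) \
	(SOC_ERR_BIT | \
	 (unsigned long)(ieh) << IEH_SHIFT | \
	 (unsigned long)(reg_group) << REG_GROUP_SHIFT | \
	 (unsigned long)(hw_err) << SOC_HW_ERR_SHIFT | \
	 (errbit))

int main(void)
{
	/*
	 * IEH0, global register group, fatal severity (assumed to be
	 * 2 in the driver's enum hardware_error), error bit 18 (PUNIT).
	 */
	unsigned long index = SOC_ERR_INDEX(0, 1, 2, 18);

	printf("index = 0x%lx\n", index); /* prints index = 0x2d2 */
	return 0;
}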
Signed-off-by: Himal Prasad Ghimiray <himal.prasad.ghimiray at intel.com>
Cc: Aravind Iddamsetty <aravind.iddamsetty at intel.com>
---
drivers/gpu/drm/xe/regs/xe_regs.h | 192 +++++++++++++++++++-----
drivers/gpu/drm/xe/xe_device.c | 5 +
drivers/gpu/drm/xe/xe_gt.c | 1 +
drivers/gpu/drm/xe/xe_gt_types.h | 28 ++++
drivers/gpu/drm/xe/xe_irq.c | 241 ++++++++++++++++++++++++++++++
5 files changed, 426 insertions(+), 41 deletions(-)
diff --git a/drivers/gpu/drm/xe/regs/xe_regs.h b/drivers/gpu/drm/xe/regs/xe_regs.h
index fb772c8b12f5..65db44f2171e 100644
--- a/drivers/gpu/drm/xe/regs/xe_regs.h
+++ b/drivers/gpu/drm/xe/regs/xe_regs.h
@@ -92,10 +92,10 @@
#define GEN11_GU_MISC_IRQ (1 << 29)
#define GEN11_DISPLAY_IRQ (1 << 16)
#define GEN11_GT_DW_IRQ(x) (1 << (x))
-#define GEN12_FATAL_ERROR_IRQ REG_BIT(28)
-#define GEN12_NON_FATAL_ERROR_IRQ REG_BIT(27)
-#define GEN12_CORRECTABLE_ERROR_IRQ REG_BIT(26)
-#define GEN12_ERROR_IRQ(x) REG_BIT(26 + (x))
+#define GEN12_FATAL_ERROR_IRQ REG_BIT(28)
+#define GEN12_NON_FATAL_ERROR_IRQ REG_BIT(27)
+#define GEN12_CORRECTABLE_ERROR_IRQ REG_BIT(26)
+#define GEN12_ERROR_IRQ(x) REG_BIT(26 + (x))
#define DG1_MSTR_TILE_INTR _MMIO(0x190008)
#define DG1_MSTR_IRQ REG_BIT(31)
@@ -122,20 +122,20 @@ enum hardware_error {
HARDWARE_ERROR_MAX,
};
-#define DEV_PCIEERR_STATUS _MMIO(0x100180)
+#define DEV_PCIEERR_STATUS _MMIO(0x100180)
#define DEV_PCIEERR_TILE_STATUS_MASK REG_GENMASK(2, 0)
#define DEV_PCIEERR_TILE_STATUS(x) (DEV_PCIEERR_TILE_STATUS_MASK << (x * 4))
-#define DEV_PCIEERR_IS_FATAL(x) (REG_BIT(2) << (x * 4))
-#define _DEV_ERR_STAT_FATAL 0x100174
-#define _DEV_ERR_STAT_NONFATAL 0x100178
+#define DEV_PCIEERR_IS_FATAL(x) (REG_BIT(2) << (x * 4))
+#define _DEV_ERR_STAT_FATAL 0x100174
+#define _DEV_ERR_STAT_NONFATAL 0x100178
#define _DEV_ERR_STAT_CORRECTABLE 0x10017c
-#define DEV_ERR_STAT_REG(x) _MMIO(_PICK_EVEN((x), \
+#define DEV_ERR_STAT_REG(x) _MMIO(_PICK_EVEN((x), \
_DEV_ERR_STAT_CORRECTABLE, \
_DEV_ERR_STAT_NONFATAL))
-#define DEV_ERR_STAT_SOC_ERROR REG_BIT(16)
+#define DEV_ERR_STAT_SOC_ERROR REG_BIT(16)
#define DEV_ERR_STAT_SGUNIT_ERROR REG_BIT(12)
-#define DEV_ERR_STAT_GSC_ERROR REG_BIT(8)
-#define DEV_ERR_STAT_GT_ERROR REG_BIT(0)
+#define DEV_ERR_STAT_GSC_ERROR REG_BIT(8)
+#define DEV_ERR_STAT_GT_ERROR REG_BIT(0)
enum gt_vctr_registers {
ERR_STAT_GT_VCTR0 = 0,
@@ -148,11 +148,11 @@ enum gt_vctr_registers {
ERR_STAT_GT_VCTR7,
};
-#define ERR_STAT_GT_COR_VCTR_LEN (4)
-#define _ERR_STAT_GT_COR_VCTR_0 0x1002a0
-#define _ERR_STAT_GT_COR_VCTR_1 0x1002a4
-#define _ERR_STAT_GT_COR_VCTR_2 0x1002a8
-#define _ERR_STAT_GT_COR_VCTR_3 0x1002ac
+#define ERR_STAT_GT_COR_VCTR_LEN (4)
+#define _ERR_STAT_GT_COR_VCTR_0 0x1002a0
+#define _ERR_STAT_GT_COR_VCTR_1 0x1002a4
+#define _ERR_STAT_GT_COR_VCTR_2 0x1002a8
+#define _ERR_STAT_GT_COR_VCTR_3 0x1002ac
#define ERR_STAT_GT_COR_VCTR_REG(x) _MMIO(_PICK_EVEN((x), \
_ERR_STAT_GT_COR_VCTR_0, \
_ERR_STAT_GT_COR_VCTR_1))
@@ -170,19 +170,19 @@ enum gt_vctr_registers {
_ERR_STAT_GT_FATAL_VCTR_0, \
_ERR_STAT_GT_FATAL_VCTR_1))
-#define _ERR_STAT_GT_COR 0x100160
-#define _ERR_STAT_GT_NONFATAL 0x100164
-#define _ERR_STAT_GT_FATAL 0x100168
-#define ERR_STAT_GT_REG(x) _MMIO(_PICK_EVEN((x), \
+#define _ERR_STAT_GT_COR 0x100160
+#define _ERR_STAT_GT_NONFATAL 0x100164
+#define _ERR_STAT_GT_FATAL 0x100168
+#define ERR_STAT_GT_REG(x) _MMIO(_PICK_EVEN((x), \
_ERR_STAT_GT_COR, \
_ERR_STAT_GT_NONFATAL))
-#define EU_GRF_COR_ERR (15)
-#define EU_IC_COR_ERR (14)
-#define SLM_COR_ERR (13)
-#define SAMPLER_COR_ERR (12)
-#define GUC_COR_ERR (1)
-#define L3_SNG_COR_ERR (0)
+#define EU_GRF_COR_ERR (15)
+#define EU_IC_COR_ERR (14)
+#define SLM_COR_ERR (13)
+#define SAMPLER_COR_ERR (12)
+#define GUC_COR_ERR (1)
+#define L3_SNG_COR_ERR (0)
#define PVC_COR_ERR_MASK \
(REG_BIT(GUC_COR_ERR) | \
@@ -190,17 +190,17 @@ enum gt_vctr_registers {
REG_BIT(EU_IC_COR_ERR) | \
REG_BIT(EU_GRF_COR_ERR))
-#define EU_GRF_FAT_ERR (15)
-#define EU_IC_FAT_ERR (14)
-#define SLM_FAT_ERR (13)
-#define SAMPLER_FAT_ERR (12)
-#define SQIDI_FAT_ERR (9)
-#define IDI_PAR_FAT_ERR (8)
-#define GUC_FAT_ERR (6)
-#define L3_ECC_CHK_FAT_ERR (5)
-#define L3_DOUBLE_FAT_ERR (4)
-#define FPU_UNCORR_FAT_ERR (3)
-#define ARRAY_BIST_FAT_ERR (1)
+#define EU_GRF_FAT_ERR (15)
+#define EU_IC_FAT_ERR (14)
+#define SLM_FAT_ERR (13)
+#define SAMPLER_FAT_ERR (12)
+#define SQIDI_FAT_ERR (9)
+#define IDI_PAR_FAT_ERR (8)
+#define GUC_FAT_ERR (6)
+#define L3_ECC_CHK_FAT_ERR (5)
+#define L3_DOUBLE_FAT_ERR (4)
+#define FPU_UNCORR_FAT_ERR (3)
+#define ARRAY_BIST_FAT_ERR (1)
#define PVC_FAT_ERR_MASK \
(REG_BIT(FPU_UNCORR_FAT_ERR) | \
@@ -208,12 +208,122 @@ enum gt_vctr_registers {
REG_BIT(SLM_FAT_ERR) | \
REG_BIT(EU_GRF_FAT_ERR))
-#define GT_HW_ERROR_MAX_ERR_BITS 16
+#define GT_HW_ERROR_MAX_ERR_BITS 16
-#define _SLM_ECC_ERROR_CNT 0xe7f4
+#define _SLM_ECC_ERROR_CNT 0xe7f4
#define _SLM_UNCORR_ECC_ERROR_CNT 0xe7c0
-#define SLM_ECC_ERROR_CNTR(x) _MMIO((x) == HARDWARE_ERROR_CORRECTABLE ? \
+#define SLM_ECC_ERROR_CNTR(x) _MMIO((x) == HARDWARE_ERROR_CORRECTABLE ? \
_SLM_ECC_ERROR_CNT : \
_SLM_UNCORR_ECC_ERROR_CNT)
+#define SOC_PVC_BASE 0x00282000
+#define SOC_PVC_SLAVE_BASE 0x00283000
+
+#define _SOC_LERRCORSTS 0x000294
+#define _SOC_LERRUNCSTS 0x000280
+#define SOC_LOCAL_ERR_STAT_SLAVE_REG(base, x) _MMIO((x) > HARDWARE_ERROR_CORRECTABLE ? \
+ base + _SOC_LERRUNCSTS : \
+ base + _SOC_LERRCORSTS)
+#define SOC_FABRIC_SS1_3 (7)
+#define SOC_FABRIC_SS1_2 (6)
+#define SOC_FABRIC_SS1_1 (5)
+#define SOC_FABRIC_SS1_0 (4)
+
+#define SOC_LOCAL_ERR_STAT_MASTER_REG(base, x) _MMIO((x) > HARDWARE_ERROR_CORRECTABLE ? \
+ base + _SOC_LERRUNCSTS : \
+ base + _SOC_LERRCORSTS)
+#define PVC_SOC_PSF_2 (13)
+#define PVC_SOC_PSF_1 (12)
+#define PVC_SOC_PSF_0 (11)
+#define SOC_PSF_CSC_2 (10)
+#define SOC_PSF_CSC_1 (9)
+#define SOC_PSF_CSC_0 (8)
+#define SOC_FABRIC_SS0_3 (7)
+#define SOC_FABRIC_SS0_2 (6)
+#define SOC_FABRIC_SS0_1 (5)
+#define SOC_FABRIC_SS0_0 (4)
+
+#define _SOC_GSYSEVTCTL 0x000264
+#define SOC_GSYSEVTCTL_REG(base, slave_base, x) _MMIO(_PICK_EVEN((x), \
+ base + _SOC_GSYSEVTCTL, \
+ slave_base + _SOC_GSYSEVTCTL))
+#define _SOC_GCOERRSTS 0x000200
+#define _SOC_GNFERRSTS 0x000210
+#define _SOC_GFAERRSTS 0x000220
+#define SOC_GLOBAL_ERR_STAT_SLAVE_REG(base, x) _MMIO(_PICK_EVEN((x), \
+ base + _SOC_GCOERRSTS, \
+ base + _SOC_GNFERRSTS))
+#define PVC_SOC_HBM_SS3_7 (16)
+#define PVC_SOC_HBM_SS3_6 (15)
+#define PVC_SOC_HBM_SS3_5 (14)
+#define PVC_SOC_HBM_SS3_4 (13)
+#define PVC_SOC_HBM_SS3_3 (12)
+#define PVC_SOC_HBM_SS3_2 (11)
+#define PVC_SOC_HBM_SS3_1 (10)
+#define PVC_SOC_HBM_SS3_0 (9)
+#define PVC_SOC_HBM_SS2_7 (8)
+#define PVC_SOC_HBM_SS2_6 (7)
+#define PVC_SOC_HBM_SS2_5 (6)
+#define PVC_SOC_HBM_SS2_4 (5)
+#define PVC_SOC_HBM_SS2_3 (4)
+#define PVC_SOC_HBM_SS2_2 (3)
+#define PVC_SOC_HBM_SS2_1 (2)
+#define PVC_SOC_HBM_SS2_0 (1)
+#define SOC_HBM_SS1_15 (17)
+#define SOC_HBM_SS1_14 (16)
+#define SOC_HBM_SS1_13 (15)
+#define SOC_HBM_SS1_12 (14)
+#define SOC_HBM_SS1_11 (13)
+#define SOC_HBM_SS1_10 (12)
+#define SOC_HBM_SS1_9 (11)
+#define SOC_HBM_SS1_8 (10)
+#define SOC_HBM_SS1_7 (9)
+#define SOC_HBM_SS1_6 (8)
+#define SOC_HBM_SS1_5 (7)
+#define SOC_HBM_SS1_4 (6)
+#define SOC_HBM_SS1_3 (5)
+#define SOC_HBM_SS1_2 (4)
+#define SOC_HBM_SS1_1 (3)
+#define SOC_HBM_SS1_0 (2)
+#define SOC_FABRIC_SS1_4 (1)
+#define SOC_IEH1_LOCAL_ERR_STATUS (0)
+
+#define SOC_GLOBAL_ERR_STAT_MASTER_REG(base, x) _MMIO(_PICK_EVEN((x), \
+ base + _SOC_GCOERRSTS, \
+ base + _SOC_GNFERRSTS))
+#define PVC_SOC_MDFI_SOUTH (6)
+#define PVC_SOC_MDFI_EAST (4)
+#define PVC_SOC_CD0_MDFI (18)
+#define PVC_SOC_CD0 (17)
+#define PVC_SOC_HBM_SS1_7 (17)
+#define PVC_SOC_HBM_SS1_6 (16)
+#define PVC_SOC_HBM_SS1_5 (15)
+#define PVC_SOC_HBM_SS1_4 (14)
+#define PVC_SOC_HBM_SS1_3 (13)
+#define PVC_SOC_HBM_SS1_2 (12)
+#define PVC_SOC_HBM_SS1_1 (11)
+#define PVC_SOC_HBM_SS1_0 (10)
+#define SOC_MDFI_SOUTH (21)
+#define SOC_MDFI_WEST (20)
+#define SOC_MDFI_EAST (19)
+#define SOC_PUNIT (18)
+#define SOC_HBM_SS0_15 (17)
+#define SOC_HBM_SS0_14 (16)
+#define SOC_HBM_SS0_13 (15)
+#define SOC_HBM_SS0_12 (14)
+#define SOC_HBM_SS0_11 (13)
+#define SOC_HBM_SS0_10 (12)
+#define SOC_HBM_SS0_9 (11)
+#define SOC_HBM_SS0_8 (10)
+#define SOC_HBM_SS0_7 (9)
+#define SOC_HBM_SS0_6 (8)
+#define SOC_HBM_SS0_5 (7)
+#define SOC_HBM_SS0_4 (6)
+#define SOC_HBM_SS0_3 (5)
+#define SOC_HBM_SS0_2 (4)
+#define SOC_HBM_SS0_1 (3)
+#define SOC_HBM_SS0_0 (2)
+#define SOC_SLAVE_IEH (1)
+#define SOC_IEH0_LOCAL_ERR_STATUS (0)
+#define SOC_HW_ERR_MAX_BITS (32)
#endif
diff --git a/drivers/gpu/drm/xe/xe_device.c b/drivers/gpu/drm/xe/xe_device.c
index 1553949d12b6..d4a4cea398d3 100644
--- a/drivers/gpu/drm/xe/xe_device.c
+++ b/drivers/gpu/drm/xe/xe_device.c
@@ -336,7 +336,12 @@ int xe_device_probe(struct xe_device *xe)
static void xe_device_remove_display(struct xe_device *xe)
{
+ struct xe_gt *gt;
+ u32 id;
+
xe_display_unregister(xe);
+ for_each_gt(gt, xe, id)
+ xa_destroy(&gt->errors.soc);
drm_dev_unplug(&xe->drm);
xe_display_modset_driver_remove(xe);
diff --git a/drivers/gpu/drm/xe/xe_gt.c b/drivers/gpu/drm/xe/xe_gt.c
index e943810eeb53..7d81e0b19389 100644
--- a/drivers/gpu/drm/xe/xe_gt.c
+++ b/drivers/gpu/drm/xe/xe_gt.c
@@ -606,6 +606,7 @@ int xe_gt_init(struct xe_gt *gt)
int i;
spin_lock_init(&gt->irq_lock);
+ xa_init(&gt->errors.soc);
INIT_WORK(&gt->reset.worker, gt_reset_worker);
for (i = 0; i < XE_ENGINE_CLASS_MAX; ++i) {
diff --git a/drivers/gpu/drm/xe/xe_gt_types.h b/drivers/gpu/drm/xe/xe_gt_types.h
index d15724792082..425d8129843c 100644
--- a/drivers/gpu/drm/xe/xe_gt_types.h
+++ b/drivers/gpu/drm/xe/xe_gt_types.h
@@ -6,6 +6,7 @@
#ifndef _XE_GT_TYPES_H_
#define _XE_GT_TYPES_H_
+#include "regs/xe_regs.h"
#include "xe_force_wake_types.h"
#include "xe_hw_engine_types.h"
#include "xe_hw_fence_types.h"
@@ -66,6 +67,17 @@ enum xe_gt_driver_errors {
INTEL_GT_DRIVER_ERROR_COUNT
};
+enum intel_soc_num_ieh {
+ INTEL_GT_SOC_IEH0 = 0,
+ INTEL_GT_SOC_IEH1,
+ INTEL_GT_SOC_NUM_IEH
+};
+
+enum intel_soc_ieh_reg_type {
+ INTEL_SOC_REG_LOCAL = 0,
+ INTEL_SOC_REG_GLOBAL
+};
+
void xe_gt_log_driver_error(struct xe_gt *gt,
const enum xe_gt_driver_errors error,
const char *fmt, ...);
@@ -387,9 +399,25 @@ struct xe_gt {
struct intel_hw_errors {
unsigned long hw[INTEL_GT_HW_ERROR_COUNT];
+ struct xarray soc;
+ unsigned long sgunit[HARDWARE_ERROR_MAX];
unsigned long driver[INTEL_GT_DRIVER_ERROR_COUNT];
} errors;
};
+#define SOC_HW_ERR_SHIFT ilog2(SOC_HW_ERR_MAX_BITS)
+#define SOC_ERR_BIT BIT(IEH_SHIFT + 1)
+#define IEH_SHIFT (REG_GROUP_SHIFT + REG_GROUP_BITS)
+#define IEH_MASK (0x1)
+#define REG_GROUP_SHIFT (HW_ERR_TYPE_BITS + SOC_HW_ERR_SHIFT)
+#define REG_GROUP_BITS (1)
+#define HW_ERR_TYPE_BITS (2)
+#define SOC_ERR_INDEX(IEH, REG_GROUP, HW_ERR, ERRBIT) \
+ (SOC_ERR_BIT | \
+ (IEH) << IEH_SHIFT | \
+ (REG_GROUP) << REG_GROUP_SHIFT | \
+ (HW_ERR) << SOC_HW_ERR_SHIFT | \
+ (ERRBIT))
+
#endif
diff --git a/drivers/gpu/drm/xe/xe_irq.c b/drivers/gpu/drm/xe/xe_irq.c
index 4a3b3ff99db1..819d39bdfecd 100644
--- a/drivers/gpu/drm/xe/xe_irq.c
+++ b/drivers/gpu/drm/xe/xe_irq.c
@@ -639,6 +639,241 @@ gen12_gt_hw_error_handler(struct xe_gt *gt,
xe_mmio_write32(gt, ERR_STAT_GT_REG(hw_err).reg, errstat);
}
+static const char *
+soc_err_index_to_str(unsigned long index)
+{
+ switch (index) {
+ case SOC_ERR_INDEX(INTEL_GT_SOC_IEH0, INTEL_SOC_REG_LOCAL, HARDWARE_ERROR_FATAL, SOC_PSF_CSC_0):
+ return "PSF CSC0";
+ case SOC_ERR_INDEX(INTEL_GT_SOC_IEH0, INTEL_SOC_REG_LOCAL, HARDWARE_ERROR_FATAL, SOC_PSF_CSC_1):
+ return "PSF CSC1";
+ case SOC_ERR_INDEX(INTEL_GT_SOC_IEH0, INTEL_SOC_REG_LOCAL, HARDWARE_ERROR_FATAL, SOC_PSF_CSC_2):
+ return "PSF CSC2";
+ case SOC_ERR_INDEX(INTEL_GT_SOC_IEH0, INTEL_SOC_REG_LOCAL, HARDWARE_ERROR_FATAL, PVC_SOC_PSF_0):
+ return "PSF0";
+ case SOC_ERR_INDEX(INTEL_GT_SOC_IEH0, INTEL_SOC_REG_LOCAL, HARDWARE_ERROR_FATAL, PVC_SOC_PSF_1):
+ return "PSF1";
+ case SOC_ERR_INDEX(INTEL_GT_SOC_IEH0, INTEL_SOC_REG_LOCAL, HARDWARE_ERROR_FATAL, PVC_SOC_PSF_2):
+ return "PSF2";
+ case SOC_ERR_INDEX(INTEL_GT_SOC_IEH1, INTEL_SOC_REG_GLOBAL, HARDWARE_ERROR_FATAL, PVC_SOC_CD0):
+ return "CD0";
+ case SOC_ERR_INDEX(INTEL_GT_SOC_IEH1, INTEL_SOC_REG_GLOBAL, HARDWARE_ERROR_FATAL, PVC_SOC_CD0_MDFI):
+ return "CD0 MDFI";
+ case SOC_ERR_INDEX(INTEL_GT_SOC_IEH0, INTEL_SOC_REG_LOCAL, HARDWARE_ERROR_FATAL, PVC_SOC_MDFI_EAST):
+ return "MDFI EAST";
+ case SOC_ERR_INDEX(INTEL_GT_SOC_IEH0, INTEL_SOC_REG_LOCAL, HARDWARE_ERROR_FATAL, PVC_SOC_MDFI_SOUTH):
+ return "MDFI SOUTH";
+ case SOC_ERR_INDEX(INTEL_GT_SOC_IEH0, INTEL_SOC_REG_GLOBAL, HARDWARE_ERROR_FATAL, SOC_PUNIT):
+ return "PUNIT";
+ case SOC_ERR_INDEX(INTEL_GT_SOC_IEH0, INTEL_SOC_REG_GLOBAL, HARDWARE_ERROR_FATAL, SOC_HBM_SS0_0):
+ return "HBM SS0: Sbbridge0";
+ case SOC_ERR_INDEX(INTEL_GT_SOC_IEH0, INTEL_SOC_REG_GLOBAL, HARDWARE_ERROR_FATAL, SOC_HBM_SS0_1):
+ return "HBM SS0: Sbbridge1";
+ case SOC_ERR_INDEX(INTEL_GT_SOC_IEH0, INTEL_SOC_REG_GLOBAL, HARDWARE_ERROR_FATAL, SOC_HBM_SS0_2):
+ return "HBM SS0: Sbbridge2";
+ case SOC_ERR_INDEX(INTEL_GT_SOC_IEH0, INTEL_SOC_REG_GLOBAL, HARDWARE_ERROR_FATAL, SOC_HBM_SS0_3):
+ return "HBM SS0: Sbbridge3";
+ case SOC_ERR_INDEX(INTEL_GT_SOC_IEH0, INTEL_SOC_REG_GLOBAL, HARDWARE_ERROR_FATAL, SOC_HBM_SS0_4):
+ return "HBM SS0: Sbbridge4";
+ case SOC_ERR_INDEX(INTEL_GT_SOC_IEH0, INTEL_SOC_REG_GLOBAL, HARDWARE_ERROR_FATAL, SOC_HBM_SS0_5):
+ return "HBM SS0: Sbbridge5";
+ case SOC_ERR_INDEX(INTEL_GT_SOC_IEH0, INTEL_SOC_REG_GLOBAL, HARDWARE_ERROR_FATAL, SOC_HBM_SS0_6):
+ return "HBM SS0: Sbbridge6";
+ case SOC_ERR_INDEX(INTEL_GT_SOC_IEH0, INTEL_SOC_REG_GLOBAL, HARDWARE_ERROR_FATAL, SOC_HBM_SS0_7):
+ return "HBM SS0: Sbbridge7";
+ case SOC_ERR_INDEX(INTEL_GT_SOC_IEH0, INTEL_SOC_REG_GLOBAL, HARDWARE_ERROR_FATAL, PVC_SOC_HBM_SS1_0):
+ return "HBM SS1: Sbbridge0";
+ case SOC_ERR_INDEX(INTEL_GT_SOC_IEH0, INTEL_SOC_REG_GLOBAL, HARDWARE_ERROR_FATAL, PVC_SOC_HBM_SS1_1):
+ return "HBM SS1: Sbbridge1";
+ case SOC_ERR_INDEX(INTEL_GT_SOC_IEH0, INTEL_SOC_REG_GLOBAL, HARDWARE_ERROR_FATAL, PVC_SOC_HBM_SS1_2):
+ return "HBM SS1: Sbbridge2";
+ case SOC_ERR_INDEX(INTEL_GT_SOC_IEH0, INTEL_SOC_REG_GLOBAL, HARDWARE_ERROR_FATAL, PVC_SOC_HBM_SS1_3):
+ return "HBM SS1: Sbbridge3";
+ case SOC_ERR_INDEX(INTEL_GT_SOC_IEH0, INTEL_SOC_REG_GLOBAL, HARDWARE_ERROR_FATAL, PVC_SOC_HBM_SS1_4):
+ return "HBM SS1: Sbbridge4";
+ case SOC_ERR_INDEX(INTEL_GT_SOC_IEH0, INTEL_SOC_REG_GLOBAL, HARDWARE_ERROR_FATAL, PVC_SOC_HBM_SS1_5):
+ return "HBM SS1: Sbbridge5";
+ case SOC_ERR_INDEX(INTEL_GT_SOC_IEH0, INTEL_SOC_REG_GLOBAL, HARDWARE_ERROR_FATAL, PVC_SOC_HBM_SS1_6):
+ return "HBM SS1: Sbbridge6";
+ case SOC_ERR_INDEX(INTEL_GT_SOC_IEH0, INTEL_SOC_REG_GLOBAL, HARDWARE_ERROR_FATAL, PVC_SOC_HBM_SS1_7):
+ return "HBM SS1: Sbbridge7";
+ case SOC_ERR_INDEX(INTEL_GT_SOC_IEH1, INTEL_SOC_REG_GLOBAL, HARDWARE_ERROR_FATAL, PVC_SOC_HBM_SS2_0):
+ return "HBM SS2: Sbbridge0";
+ case SOC_ERR_INDEX(INTEL_GT_SOC_IEH1, INTEL_SOC_REG_GLOBAL, HARDWARE_ERROR_FATAL, PVC_SOC_HBM_SS2_1):
+ return "HBM SS2: Sbbridge1";
+ case SOC_ERR_INDEX(INTEL_GT_SOC_IEH1, INTEL_SOC_REG_GLOBAL, HARDWARE_ERROR_FATAL, PVC_SOC_HBM_SS2_2):
+ return "HBM SS2: Sbbridge2";
+ case SOC_ERR_INDEX(INTEL_GT_SOC_IEH1, INTEL_SOC_REG_GLOBAL, HARDWARE_ERROR_FATAL, PVC_SOC_HBM_SS2_3):
+ return "HBM SS2: Sbbridge3";
+ case SOC_ERR_INDEX(INTEL_GT_SOC_IEH1, INTEL_SOC_REG_GLOBAL, HARDWARE_ERROR_FATAL, PVC_SOC_HBM_SS2_4):
+ return "HBM SS2: Sbbridge4";
+ case SOC_ERR_INDEX(INTEL_GT_SOC_IEH1, INTEL_SOC_REG_GLOBAL, HARDWARE_ERROR_FATAL, PVC_SOC_HBM_SS2_5):
+ return "HBM SS2: Sbbridge5";
+ case SOC_ERR_INDEX(INTEL_GT_SOC_IEH1, INTEL_SOC_REG_GLOBAL, HARDWARE_ERROR_FATAL, PVC_SOC_HBM_SS2_6):
+ return "HBM SS2: Sbbridge6";
+ case SOC_ERR_INDEX(INTEL_GT_SOC_IEH1, INTEL_SOC_REG_GLOBAL, HARDWARE_ERROR_FATAL, PVC_SOC_HBM_SS2_7):
+ return "HBM SS2: Sbbridge7";
+ case SOC_ERR_INDEX(INTEL_GT_SOC_IEH1, INTEL_SOC_REG_GLOBAL, HARDWARE_ERROR_FATAL, PVC_SOC_HBM_SS3_0):
+ return "HBM SS3: Sbbridge0";
+ case SOC_ERR_INDEX(INTEL_GT_SOC_IEH1, INTEL_SOC_REG_GLOBAL, HARDWARE_ERROR_FATAL, PVC_SOC_HBM_SS3_1):
+ return "HBM SS3: Sbbridge1";
+ case SOC_ERR_INDEX(INTEL_GT_SOC_IEH1, INTEL_SOC_REG_GLOBAL, HARDWARE_ERROR_FATAL, PVC_SOC_HBM_SS3_2):
+ return "HBM SS3: Sbbridge2";
+ case SOC_ERR_INDEX(INTEL_GT_SOC_IEH1, INTEL_SOC_REG_GLOBAL, HARDWARE_ERROR_FATAL, PVC_SOC_HBM_SS3_3):
+ return "HBM SS3: Sbbridge3";
+ case SOC_ERR_INDEX(INTEL_GT_SOC_IEH1, INTEL_SOC_REG_GLOBAL, HARDWARE_ERROR_FATAL, PVC_SOC_HBM_SS3_4):
+ return "HBM SS3: Sbbridge4";
+ case SOC_ERR_INDEX(INTEL_GT_SOC_IEH1, INTEL_SOC_REG_GLOBAL, HARDWARE_ERROR_FATAL, PVC_SOC_HBM_SS3_5):
+ return "HBM SS3: Sbbridge5";
+ case SOC_ERR_INDEX(INTEL_GT_SOC_IEH1, INTEL_SOC_REG_GLOBAL, HARDWARE_ERROR_FATAL, PVC_SOC_HBM_SS3_6):
+ return "HBM SS3: Sbbridge6";
+ case SOC_ERR_INDEX(INTEL_GT_SOC_IEH1, INTEL_SOC_REG_GLOBAL, HARDWARE_ERROR_FATAL, PVC_SOC_HBM_SS3_7):
+ return "HBM SS3: Sbbridge7";
+ default:
+ return "UNKNOWN";
+ }
+}
+
+static void update_soc_hw_error_cnt(struct xe_gt *gt, unsigned long index)
+{
+ unsigned long flags;
+ void *entry;
+
+ entry = xa_load(&gt->errors.soc, index);
+ entry = xa_mk_value(xa_to_value(entry) + 1);
+
+ xa_lock_irqsave(&gt->errors.soc, flags);
+ if (xa_is_err(__xa_store(&gt->errors.soc, index, entry, GFP_ATOMIC)))
+ drm_err_ratelimited(&gt->xe->drm,
+ HW_ERR "SOC error reported by IEH%lu on GT %d lost\n",
+ (index >> IEH_SHIFT) & IEH_MASK,
+ gt->info.id);
+ xa_unlock_irqrestore(&gt->errors.soc, flags);
+}
+
+static void
+gen12_soc_hw_error_handler(struct xe_gt *gt,
+ const enum hardware_error hw_err)
+{
+ unsigned long mst_glb_errstat, slv_glb_errstat, lcl_errstat, index;
+ u32 errbit, base, slave_base;
+ int i;
+
+ lockdep_assert_held(&gt->irq_lock);
+ if (gt->xe->info.platform != XE_PVC)
+ return;
+
+ base = SOC_PVC_BASE;
+ slave_base = SOC_PVC_SLAVE_BASE;
+
+ xe_gt_hw_err(gt, "SOC %s error\n", hardware_error_type_to_str(hw_err));
+
+ if (hw_err == HARDWARE_ERROR_CORRECTABLE ||
+     hw_err == HARDWARE_ERROR_NONFATAL) {
+ for (i = 0; i < INTEL_GT_SOC_NUM_IEH; i++)
+ xe_mmio_write32(gt, SOC_GSYSEVTCTL_REG(base, slave_base, i).reg,
+ ~REG_BIT(hw_err));
+
+ xe_mmio_write32(gt, SOC_GLOBAL_ERR_STAT_MASTER_REG(base, hw_err).reg,
+ REG_GENMASK(31, 0));
+ xe_mmio_write32(gt, SOC_LOCAL_ERR_STAT_MASTER_REG(base, hw_err).reg,
+ REG_GENMASK(31, 0));
+ xe_mmio_write32(gt, SOC_GLOBAL_ERR_STAT_SLAVE_REG(slave_base, hw_err).reg,
+ REG_GENMASK(31, 0));
+ xe_mmio_write32(gt, SOC_LOCAL_ERR_STAT_SLAVE_REG(slave_base, hw_err).reg,
+ REG_GENMASK(31, 0));
+
+ xe_gt_log_driver_error(gt, INTEL_GT_DRIVER_ERROR_INTERRUPT,
+ "UNKNOWN SOC %s error\n",
+ hardware_error_type_to_str(hw_err));
+ }
+
+ /*
+ * Mask the error type in GSYSEVTCTL so that no new errors of this
+ * type are reported. Read the master global IEH error register;
+ * if bit 1 is set, process the slave IEH first. If bit 0 of a
+ * global error register is set, process the corresponding local
+ * error registers.
+ */
+ for (i = 0; i < INTEL_GT_SOC_NUM_IEH; i++)
+ xe_mmio_write32(gt, SOC_GSYSEVTCTL_REG(base, slave_base, i).reg, ~REG_BIT(hw_err));
+
+ mst_glb_errstat = xe_mmio_read32(gt,
+ SOC_GLOBAL_ERR_STAT_MASTER_REG(base, hw_err).reg);
+ xe_gt_hw_err(gt, "SOC_GLOBAL_ERR_STAT_MASTER_REG_FATAL:0x%08lx\n", mst_glb_errstat);
+ if (mst_glb_errstat & REG_BIT(SOC_SLAVE_IEH)) {
+ slv_glb_errstat = xe_mmio_read32(gt,
+ SOC_GLOBAL_ERR_STAT_SLAVE_REG(slave_base,
+ hw_err).reg);
+ xe_gt_hw_err(gt, "SOC_GLOBAL_ERR_STAT_SLAVE_REG_FATAL:0x%08lx\n",
+ slv_glb_errstat);
+
+ if (slv_glb_errstat & REG_BIT(SOC_IEH1_LOCAL_ERR_STATUS)) {
+ lcl_errstat = xe_mmio_read32(gt,
+ SOC_LOCAL_ERR_STAT_SLAVE_REG(slave_base,
+ hw_err).reg);
+ xe_gt_hw_err(gt, "SOC_LOCAL_ERR_STAT_SLAVE_REG_FATAL:0x%08lx\n",
+ lcl_errstat);
+
+ for_each_set_bit(errbit, &lcl_errstat,
+ SOC_HW_ERR_MAX_BITS) {
+ /*
+ * SOC errors have global and local error
+ * registers for each of the correctable,
+ * non-fatal and fatal categories, per IEH
+ * on the platform. XEHPSDV and PVC have
+ * two IEHs.
+ */
+ index = SOC_ERR_INDEX(INTEL_GT_SOC_IEH1, INTEL_SOC_REG_LOCAL, hw_err, errbit);
+ update_soc_hw_error_cnt(gt, index);
+ if (gt->xe->info.platform == XE_PVC)
+ xe_gt_hw_err(gt, "%s SOC FATAL error\n",
+ soc_err_index_to_str(index));
+ }
+ xe_mmio_write32(gt, SOC_LOCAL_ERR_STAT_SLAVE_REG(slave_base, hw_err).reg,
+ lcl_errstat);
+ }
+
+ for_each_set_bit(errbit, &slv_glb_errstat, SOC_HW_ERR_MAX_BITS) {
+ index = SOC_ERR_INDEX(INTEL_GT_SOC_IEH1, INTEL_SOC_REG_GLOBAL, hw_err, errbit);
+ update_soc_hw_error_cnt(gt, index);
+ if (gt->xe->info.platform == XE_PVC)
+ xe_gt_hw_err(gt, "%s SOC FATAL error\n",
+ soc_err_index_to_str(index));
+ }
+ xe_mmio_write32(gt, SOC_GLOBAL_ERR_STAT_SLAVE_REG(slave_base, hw_err).reg,
+ slv_glb_errstat);
+ }
+
+ if (mst_glb_errstat & REG_BIT(SOC_IEH0_LOCAL_ERR_STATUS)) {
+ lcl_errstat = xe_mmio_read32(gt,
+ SOC_LOCAL_ERR_STAT_MASTER_REG(base, hw_err).reg);
+ xe_gt_hw_err(gt, "SOC_LOCAL_ERR_STAT_MASTER_REG_FATAL:0x%08lx\n", lcl_errstat);
+ for_each_set_bit(errbit, &lcl_errstat, SOC_HW_ERR_MAX_BITS) {
+ index = SOC_ERR_INDEX(INTEL_GT_SOC_IEH0, INTEL_SOC_REG_LOCAL, hw_err, errbit);
+ update_soc_hw_error_cnt(gt, index);
+ if (gt->xe->info.platform == XE_PVC)
+ xe_gt_hw_err(gt, "%s SOC FATAL error\n",
+ soc_err_index_to_str(index));
+ }
+ xe_mmio_write32(gt, SOC_LOCAL_ERR_STAT_MASTER_REG(base, hw_err).reg,
+ lcl_errstat);
+ }
+
+ for_each_set_bit(errbit, &mst_glb_errstat, SOC_HW_ERR_MAX_BITS) {
+ index = SOC_ERR_INDEX(INTEL_GT_SOC_IEH0, INTEL_SOC_REG_GLOBAL, hw_err, errbit);
+ update_soc_hw_error_cnt(gt, index);
+ if (gt->xe->info.platform == XE_PVC)
+ xe_gt_hw_err(gt, "%s SOC FATAL error\n",
+ soc_err_index_to_str(index));
+ }
+ xe_mmio_write32(gt, SOC_GLOBAL_ERR_STAT_MASTER_REG(base, hw_err).reg,
+ mst_glb_errstat);
+
+ for (i = 0; i < INTEL_GT_SOC_NUM_IEH; i++)
+ xe_mmio_write32(gt, SOC_GSYSEVTCTL_REG(base, slave_base, i).reg,
+ (HARDWARE_ERROR_MAX << 1) + 1);
+}
+
static void
gen12_hw_error_source_handler(struct xe_gt *gt,
const enum hardware_error hw_err)
@@ -657,6 +892,12 @@ gen12_hw_error_source_handler(struct xe_gt *gt,
if (errsrc & DEV_ERR_STAT_GT_ERROR)
gen12_gt_hw_error_handler(gt, hw_err);
+
+ if (errsrc & DEV_ERR_STAT_SGUNIT_ERROR)
+ gt->errors.sgunit[hw_err]++;
+
+ if (errsrc & DEV_ERR_STAT_SOC_ERROR)
+ gen12_soc_hw_error_handler(gt, hw_err);
xe_mmio_write32(gt, DEV_ERR_STAT_REG(hw_err).reg, errsrc);
--
2.25.1