[Intel-xe] [RFC 3/5] drm/xe/RAS: Expose the error counters
Aravind Iddamsetty
aravind.iddamsetty at intel.com
Fri May 26 16:20:14 UTC 2023
We expose the various error counters supported on the hardware via the genl
subsystem through the registered commands to userspace.
The DRM_CMD_QUERY command lists the error names with their config ids,
DRM_CMD_READ_ONE returns the counter value for the requested config id, and
DRM_CMD_READ_ALL lists the counters for all errors along with their names
and config ids.
Signed-off-by: Aravind Iddamsetty <aravind.iddamsetty at intel.com>
---
drivers/gpu/drm/xe/xe_netlink.c | 439 +++++++++++++++++++++++++++++++-
include/uapi/drm/xe_drm.h | 64 +++++
2 files changed, 501 insertions(+), 2 deletions(-)
diff --git a/drivers/gpu/drm/xe/xe_netlink.c b/drivers/gpu/drm/xe/xe_netlink.c
index 63ef238ebc27..2a6965f5cde9 100644
--- a/drivers/gpu/drm/xe/xe_netlink.c
+++ b/drivers/gpu/drm/xe/xe_netlink.c
@@ -4,19 +4,451 @@
*/
#include <drm/drm_managed.h>
+#include <drm/xe_drm.h>
#include "xe_device.h"
+#define MAX_ERROR_NAME 50
+
+#define HAS_GT_ERROR_VECTORS(xe) ((xe)->info.has_gt_error_vectors)
+#define HAS_MEM_SPARING_SUPPORT(xe) ((xe)->info.has_mem_sparing)
+
DEFINE_XARRAY(xe_xarray);
-static int xe_genl_list_errors(struct sk_buff *msg, struct genl_info *info)
+static const char * const xe_hw_error_events[] = {
+ [XE_GT_ERROR_CORRECTABLE_L3_SNG] = "correctable-l3-sng",
+ [XE_GT_ERROR_CORRECTABLE_GUC] = "correctable-guc",
+ [XE_GT_ERROR_CORRECTABLE_SAMPLER] = "correctable-sampler",
+ [XE_GT_ERROR_CORRECTABLE_SLM] = "correctable-slm",
+ [XE_GT_ERROR_CORRECTABLE_EU_IC] = "correctable-eu-ic",
+ [XE_GT_ERROR_CORRECTABLE_EU_GRF] = "correctable-eu-grf",
+ [XE_GT_ERROR_FATAL_ARR_BIST] = "fatal-array-bist",
+ [XE_GT_ERROR_FATAL_L3_DOUB] = "fatal-l3-double",
+ [XE_GT_ERROR_FATAL_L3_ECC_CHK] = "fatal-l3-ecc-checker",
+ [XE_GT_ERROR_FATAL_GUC] = "fatal-guc",
+ [XE_GT_ERROR_FATAL_IDI_PAR] = "fatal-idi-parity",
+ [XE_GT_ERROR_FATAL_SQIDI] = "fatal-sqidi",
+ [XE_GT_ERROR_FATAL_SAMPLER] = "fatal-sampler",
+ [XE_GT_ERROR_FATAL_SLM] = "fatal-slm",
+ [XE_GT_ERROR_FATAL_EU_IC] = "fatal-eu-ic",
+ [XE_GT_ERROR_FATAL_EU_GRF] = "fatal-eu-grf",
+ [XE_GT_ERROR_FATAL_FPU] = "fatal-fpu",
+ [XE_GT_ERROR_FATAL_TLB] = "fatal-tlb",
+ [XE_GT_ERROR_FATAL_L3_FABRIC] = "fatal-l3-fabric",
+ [XE_GT_ERROR_CORRECTABLE_SUBSLICE] = "correctable-subslice",
+ [XE_GT_ERROR_CORRECTABLE_L3BANK] = "correctable-l3bank",
+ [XE_GT_ERROR_FATAL_SUBSLICE] = "fatal-subslice",
+ [XE_GT_ERROR_FATAL_L3BANK] = "fatal-l3bank",
+ [XE_SGUNIT_ERROR_CORRECTABLE] = "sgunit-correctable",
+ [XE_SGUNIT_ERROR_NONFATAL] = "sgunit-nonfatal",
+ [XE_SGUNIT_ERROR_FATAL] = "sgunit-fatal",
+ [XE_SOC_ERROR_FATAL_PSF_CSC_0] = "soc-fatal-psf-csc-0",
+ [XE_SOC_ERROR_FATAL_PSF_CSC_1] = "soc-fatal-psf-csc-1",
+ [XE_SOC_ERROR_FATAL_PSF_CSC_2] = "soc-fatal-psf-csc-2",
+ [XE_SOC_ERROR_FATAL_PUNIT] = "soc-fatal-punit",
+ [XE_PVC_SOC_ERROR_FATAL_PSF_0] = "soc-fatal-psf-0",
+ [XE_PVC_SOC_ERROR_FATAL_PSF_1] = "soc-fatal-psf-1",
+ [XE_PVC_SOC_ERROR_FATAL_PSF_2] = "soc-fatal-psf-2",
+ [XE_PVC_SOC_ERROR_FATAL_CD0] = "soc-fatal-cd0",
+ [XE_PVC_SOC_ERROR_FATAL_CD0_MDFI] = "soc-fatal-cd0-mdfi",
+ [XE_PVC_SOC_ERROR_FATAL_MDFI_EAST] = "soc-fatal-mdfi-east",
+ [XE_PVC_SOC_ERROR_FATAL_MDFI_SOUTH] = "soc-fatal-mdfi-south",
+ [XE_PVC_SOC_ERROR_FATAL_HBM(0, 0)] = "soc-fatal-hbm-ss0-0",
+ [XE_PVC_SOC_ERROR_FATAL_HBM(0, 1)] = "soc-fatal-hbm-ss0-1",
+ [XE_PVC_SOC_ERROR_FATAL_HBM(0, 2)] = "soc-fatal-hbm-ss0-2",
+ [XE_PVC_SOC_ERROR_FATAL_HBM(0, 3)] = "soc-fatal-hbm-ss0-3",
+ [XE_PVC_SOC_ERROR_FATAL_HBM(0, 4)] = "soc-fatal-hbm-ss0-4",
+ [XE_PVC_SOC_ERROR_FATAL_HBM(0, 5)] = "soc-fatal-hbm-ss0-5",
+ [XE_PVC_SOC_ERROR_FATAL_HBM(0, 6)] = "soc-fatal-hbm-ss0-6",
+ [XE_PVC_SOC_ERROR_FATAL_HBM(0, 7)] = "soc-fatal-hbm-ss0-7",
+ [XE_PVC_SOC_ERROR_FATAL_HBM(0, 8)] = "soc-fatal-hbm-ss1-0",
+ [XE_PVC_SOC_ERROR_FATAL_HBM(0, 9)] = "soc-fatal-hbm-ss1-1",
+ [XE_PVC_SOC_ERROR_FATAL_HBM(0, 10)] = "soc-fatal-hbm-ss1-2",
+ [XE_PVC_SOC_ERROR_FATAL_HBM(0, 11)] = "soc-fatal-hbm-ss1-3",
+ [XE_PVC_SOC_ERROR_FATAL_HBM(0, 12)] = "soc-fatal-hbm-ss1-4",
+ [XE_PVC_SOC_ERROR_FATAL_HBM(0, 13)] = "soc-fatal-hbm-ss1-5",
+ [XE_PVC_SOC_ERROR_FATAL_HBM(0, 14)] = "soc-fatal-hbm-ss1-6",
+ [XE_PVC_SOC_ERROR_FATAL_HBM(0, 15)] = "soc-fatal-hbm-ss1-7",
+ [XE_PVC_SOC_ERROR_FATAL_HBM(1, 0)] = "soc-fatal-hbm-ss2-0",
+ [XE_PVC_SOC_ERROR_FATAL_HBM(1, 1)] = "soc-fatal-hbm-ss2-1",
+ [XE_PVC_SOC_ERROR_FATAL_HBM(1, 2)] = "soc-fatal-hbm-ss2-2",
+ [XE_PVC_SOC_ERROR_FATAL_HBM(1, 3)] = "soc-fatal-hbm-ss2-3",
+ [XE_PVC_SOC_ERROR_FATAL_HBM(1, 4)] = "soc-fatal-hbm-ss2-4",
+ [XE_PVC_SOC_ERROR_FATAL_HBM(1, 5)] = "soc-fatal-hbm-ss2-5",
+ [XE_PVC_SOC_ERROR_FATAL_HBM(1, 6)] = "soc-fatal-hbm-ss2-6",
+ [XE_PVC_SOC_ERROR_FATAL_HBM(1, 7)] = "soc-fatal-hbm-ss2-7",
+ [XE_PVC_SOC_ERROR_FATAL_HBM(1, 8)] = "soc-fatal-hbm-ss3-0",
+ [XE_PVC_SOC_ERROR_FATAL_HBM(1, 9)] = "soc-fatal-hbm-ss3-1",
+ [XE_PVC_SOC_ERROR_FATAL_HBM(1, 10)] = "soc-fatal-hbm-ss3-2",
+ [XE_PVC_SOC_ERROR_FATAL_HBM(1, 11)] = "soc-fatal-hbm-ss3-3",
+ [XE_PVC_SOC_ERROR_FATAL_HBM(1, 12)] = "soc-fatal-hbm-ss3-4",
+ [XE_PVC_SOC_ERROR_FATAL_HBM(1, 13)] = "soc-fatal-hbm-ss3-5",
+ [XE_PVC_SOC_ERROR_FATAL_HBM(1, 14)] = "soc-fatal-hbm-ss3-6",
+ [XE_PVC_SOC_ERROR_FATAL_HBM(1, 15)] = "soc-fatal-hbm-ss3-7",
+ [XE_GSC_ERROR_CORRECTABLE_SRAM_ECC] = "gsc-correctable-sram-ecc",
+ [XE_GSC_ERROR_NONFATAL_MIA_SHUTDOWN] = "gsc-nonfatal-mia-shutdown",
+ [XE_GSC_ERROR_NONFATAL_MIA_INT] = "gsc-nonfatal-mia-int",
+ [XE_GSC_ERROR_NONFATAL_SRAM_ECC] = "gsc-nonfatal-sram-ecc",
+ [XE_GSC_ERROR_NONFATAL_WDG_TIMEOUT] = "gsc-nonfatal-wdg-timeout",
+ [XE_GSC_ERROR_NONFATAL_ROM_PARITY] = "gsc-nonfatal-rom-parity",
+ [XE_GSC_ERROR_NONFATAL_UCODE_PARITY] = "gsc-nonfatal-ucode-parity",
+ [XE_GSC_ERROR_NONFATAL_GLITCH_DET] = "gsc-nonfatal-glitch-det",
+ [XE_GSC_ERROR_NONFATAL_FUSE_PULL] = "gsc-nonfatal-fuse-pull",
+ [XE_GSC_ERROR_NONFATAL_FUSE_CRC_CHECK] = "gsc-nonfatal-fuse-crc-check",
+ [XE_GSC_ERROR_NONFATAL_FUSE_SELFMBIST] = "gsc-nonfatal-selfmbist",
+ [XE_GSC_ERROR_NONFATAL_AON_PARITY] = "gsc-nonfatal-aon-parity",
+};
+
+static const unsigned long xe_hw_error_map[] = {
+ [XE_GT_ERROR_CORRECTABLE_L3_SNG] = INTEL_GT_HW_ERROR_COR_L3_SNG,
+ [XE_GT_ERROR_CORRECTABLE_GUC] = INTEL_GT_HW_ERROR_COR_GUC,
+ [XE_GT_ERROR_CORRECTABLE_SAMPLER] = INTEL_GT_HW_ERROR_COR_SAMPLER,
+ [XE_GT_ERROR_CORRECTABLE_SLM] = INTEL_GT_HW_ERROR_COR_SLM,
+ [XE_GT_ERROR_CORRECTABLE_EU_IC] = INTEL_GT_HW_ERROR_COR_EU_IC,
+ [XE_GT_ERROR_CORRECTABLE_EU_GRF] = INTEL_GT_HW_ERROR_COR_EU_GRF,
+ [XE_GT_ERROR_FATAL_ARR_BIST] = INTEL_GT_HW_ERROR_FAT_ARR_BIST,
+ [XE_GT_ERROR_FATAL_L3_DOUB] = INTEL_GT_HW_ERROR_FAT_L3_DOUB,
+ [XE_GT_ERROR_FATAL_L3_ECC_CHK] = INTEL_GT_HW_ERROR_FAT_L3_ECC_CHK,
+ [XE_GT_ERROR_FATAL_GUC] = INTEL_GT_HW_ERROR_FAT_GUC,
+ [XE_GT_ERROR_FATAL_IDI_PAR] = INTEL_GT_HW_ERROR_FAT_IDI_PAR,
+ [XE_GT_ERROR_FATAL_SQIDI] = INTEL_GT_HW_ERROR_FAT_SQIDI,
+ [XE_GT_ERROR_FATAL_SAMPLER] = INTEL_GT_HW_ERROR_FAT_SAMPLER,
+ [XE_GT_ERROR_FATAL_SLM] = INTEL_GT_HW_ERROR_FAT_SLM,
+ [XE_GT_ERROR_FATAL_EU_IC] = INTEL_GT_HW_ERROR_FAT_EU_IC,
+ [XE_GT_ERROR_FATAL_EU_GRF] = INTEL_GT_HW_ERROR_FAT_EU_GRF,
+ [XE_GT_ERROR_FATAL_FPU] = INTEL_GT_HW_ERROR_FAT_FPU,
+ [XE_GT_ERROR_FATAL_TLB] = INTEL_GT_HW_ERROR_FAT_TLB,
+ [XE_GT_ERROR_FATAL_L3_FABRIC] = INTEL_GT_HW_ERROR_FAT_L3_FABRIC,
+ [XE_GT_ERROR_CORRECTABLE_SUBSLICE] = INTEL_GT_HW_ERROR_COR_SUBSLICE,
+ [XE_GT_ERROR_CORRECTABLE_L3BANK] = INTEL_GT_HW_ERROR_COR_L3BANK,
+ [XE_GT_ERROR_FATAL_SUBSLICE] = INTEL_GT_HW_ERROR_FAT_SUBSLICE,
+ [XE_GT_ERROR_FATAL_L3BANK] = INTEL_GT_HW_ERROR_FAT_L3BANK,
+ [XE_SGUNIT_ERROR_CORRECTABLE] = HARDWARE_ERROR_CORRECTABLE,
+ [XE_SGUNIT_ERROR_NONFATAL] = HARDWARE_ERROR_NONFATAL,
+ [XE_SGUNIT_ERROR_FATAL] = HARDWARE_ERROR_FATAL,
+ [XE_SOC_ERROR_FATAL_PSF_CSC_0] = SOC_ERR_INDEX(INTEL_GT_SOC_IEH0, INTEL_SOC_REG_LOCAL, HARDWARE_ERROR_FATAL, SOC_PSF_CSC_0),
+ [XE_SOC_ERROR_FATAL_PSF_CSC_1] = SOC_ERR_INDEX(INTEL_GT_SOC_IEH0, INTEL_SOC_REG_LOCAL, HARDWARE_ERROR_FATAL, SOC_PSF_CSC_1),
+ [XE_SOC_ERROR_FATAL_PSF_CSC_2] = SOC_ERR_INDEX(INTEL_GT_SOC_IEH0, INTEL_SOC_REG_LOCAL, HARDWARE_ERROR_FATAL, SOC_PSF_CSC_2),
+ [XE_SOC_ERROR_FATAL_PUNIT] = SOC_ERR_INDEX(INTEL_GT_SOC_IEH0, INTEL_SOC_REG_GLOBAL, HARDWARE_ERROR_FATAL, SOC_PUNIT),
+ [XE_PVC_SOC_ERROR_FATAL_PSF_0] = SOC_ERR_INDEX(INTEL_GT_SOC_IEH0, INTEL_SOC_REG_LOCAL, HARDWARE_ERROR_FATAL, PVC_SOC_PSF_0),
+ [XE_PVC_SOC_ERROR_FATAL_PSF_1] = SOC_ERR_INDEX(INTEL_GT_SOC_IEH0, INTEL_SOC_REG_LOCAL, HARDWARE_ERROR_FATAL, PVC_SOC_PSF_1),
+ [XE_PVC_SOC_ERROR_FATAL_PSF_2] = SOC_ERR_INDEX(INTEL_GT_SOC_IEH0, INTEL_SOC_REG_LOCAL, HARDWARE_ERROR_FATAL, PVC_SOC_PSF_2),
+ [XE_PVC_SOC_ERROR_FATAL_CD0] = SOC_ERR_INDEX(INTEL_GT_SOC_IEH0, INTEL_SOC_REG_GLOBAL, HARDWARE_ERROR_FATAL, PVC_SOC_CD0),
+ [XE_PVC_SOC_ERROR_FATAL_CD0_MDFI] = SOC_ERR_INDEX(INTEL_GT_SOC_IEH0, INTEL_SOC_REG_GLOBAL, HARDWARE_ERROR_FATAL, PVC_SOC_CD0_MDFI),
+ [XE_PVC_SOC_ERROR_FATAL_MDFI_EAST] = SOC_ERR_INDEX(INTEL_GT_SOC_IEH0, INTEL_SOC_REG_LOCAL, HARDWARE_ERROR_FATAL, PVC_SOC_MDFI_EAST),
+ [XE_PVC_SOC_ERROR_FATAL_MDFI_SOUTH] = SOC_ERR_INDEX(INTEL_GT_SOC_IEH0, INTEL_SOC_REG_LOCAL, HARDWARE_ERROR_FATAL, PVC_SOC_MDFI_SOUTH),
+ [XE_PVC_SOC_ERROR_FATAL_HBM(0, 0)] = SOC_ERR_INDEX(INTEL_GT_SOC_IEH0, INTEL_SOC_REG_GLOBAL, HARDWARE_ERROR_FATAL, SOC_HBM_SS0_0),
+ [XE_PVC_SOC_ERROR_FATAL_HBM(0, 1)] = SOC_ERR_INDEX(INTEL_GT_SOC_IEH0, INTEL_SOC_REG_GLOBAL, HARDWARE_ERROR_FATAL, SOC_HBM_SS0_1),
+ [XE_PVC_SOC_ERROR_FATAL_HBM(0, 2)] = SOC_ERR_INDEX(INTEL_GT_SOC_IEH0, INTEL_SOC_REG_GLOBAL, HARDWARE_ERROR_FATAL, SOC_HBM_SS0_2),
+ [XE_PVC_SOC_ERROR_FATAL_HBM(0, 3)] = SOC_ERR_INDEX(INTEL_GT_SOC_IEH0, INTEL_SOC_REG_GLOBAL, HARDWARE_ERROR_FATAL, SOC_HBM_SS0_3),
+ [XE_PVC_SOC_ERROR_FATAL_HBM(0, 4)] = SOC_ERR_INDEX(INTEL_GT_SOC_IEH0, INTEL_SOC_REG_GLOBAL, HARDWARE_ERROR_FATAL, SOC_HBM_SS0_4),
+ [XE_PVC_SOC_ERROR_FATAL_HBM(0, 5)] = SOC_ERR_INDEX(INTEL_GT_SOC_IEH0, INTEL_SOC_REG_GLOBAL, HARDWARE_ERROR_FATAL, SOC_HBM_SS0_5),
+ [XE_PVC_SOC_ERROR_FATAL_HBM(0, 6)] = SOC_ERR_INDEX(INTEL_GT_SOC_IEH0, INTEL_SOC_REG_GLOBAL, HARDWARE_ERROR_FATAL, SOC_HBM_SS0_6),
+ [XE_PVC_SOC_ERROR_FATAL_HBM(0, 7)] = SOC_ERR_INDEX(INTEL_GT_SOC_IEH0, INTEL_SOC_REG_GLOBAL, HARDWARE_ERROR_FATAL, SOC_HBM_SS0_7),
+ [XE_PVC_SOC_ERROR_FATAL_HBM(0, 8)] = SOC_ERR_INDEX(INTEL_GT_SOC_IEH0, INTEL_SOC_REG_GLOBAL, HARDWARE_ERROR_FATAL, PVC_SOC_HBM_SS1_0),
+ [XE_PVC_SOC_ERROR_FATAL_HBM(0, 9)] = SOC_ERR_INDEX(INTEL_GT_SOC_IEH0, INTEL_SOC_REG_GLOBAL, HARDWARE_ERROR_FATAL, PVC_SOC_HBM_SS1_1),
+ [XE_PVC_SOC_ERROR_FATAL_HBM(0, 10)] = SOC_ERR_INDEX(INTEL_GT_SOC_IEH0, INTEL_SOC_REG_GLOBAL, HARDWARE_ERROR_FATAL, PVC_SOC_HBM_SS1_2),
+ [XE_PVC_SOC_ERROR_FATAL_HBM(0, 11)] = SOC_ERR_INDEX(INTEL_GT_SOC_IEH0, INTEL_SOC_REG_GLOBAL, HARDWARE_ERROR_FATAL, PVC_SOC_HBM_SS1_3),
+ [XE_PVC_SOC_ERROR_FATAL_HBM(0, 12)] = SOC_ERR_INDEX(INTEL_GT_SOC_IEH0, INTEL_SOC_REG_GLOBAL, HARDWARE_ERROR_FATAL, PVC_SOC_HBM_SS1_4),
+ [XE_PVC_SOC_ERROR_FATAL_HBM(0, 13)] = SOC_ERR_INDEX(INTEL_GT_SOC_IEH0, INTEL_SOC_REG_GLOBAL, HARDWARE_ERROR_FATAL, PVC_SOC_HBM_SS1_5),
+ [XE_PVC_SOC_ERROR_FATAL_HBM(0, 14)] = SOC_ERR_INDEX(INTEL_GT_SOC_IEH0, INTEL_SOC_REG_GLOBAL, HARDWARE_ERROR_FATAL, PVC_SOC_HBM_SS1_6),
+ [XE_PVC_SOC_ERROR_FATAL_HBM(0, 15)] = SOC_ERR_INDEX(INTEL_GT_SOC_IEH0, INTEL_SOC_REG_GLOBAL, HARDWARE_ERROR_FATAL, PVC_SOC_HBM_SS1_7),
+ [XE_PVC_SOC_ERROR_FATAL_HBM(1, 0)] = SOC_ERR_INDEX(INTEL_GT_SOC_IEH1, INTEL_SOC_REG_GLOBAL, HARDWARE_ERROR_FATAL, PVC_SOC_HBM_SS2_0),
+ [XE_PVC_SOC_ERROR_FATAL_HBM(1, 1)] = SOC_ERR_INDEX(INTEL_GT_SOC_IEH1, INTEL_SOC_REG_GLOBAL, HARDWARE_ERROR_FATAL, PVC_SOC_HBM_SS2_1),
+ [XE_PVC_SOC_ERROR_FATAL_HBM(1, 2)] = SOC_ERR_INDEX(INTEL_GT_SOC_IEH1, INTEL_SOC_REG_GLOBAL, HARDWARE_ERROR_FATAL, PVC_SOC_HBM_SS2_2),
+ [XE_PVC_SOC_ERROR_FATAL_HBM(1, 3)] = SOC_ERR_INDEX(INTEL_GT_SOC_IEH1, INTEL_SOC_REG_GLOBAL, HARDWARE_ERROR_FATAL, PVC_SOC_HBM_SS2_3),
+ [XE_PVC_SOC_ERROR_FATAL_HBM(1, 4)] = SOC_ERR_INDEX(INTEL_GT_SOC_IEH1, INTEL_SOC_REG_GLOBAL, HARDWARE_ERROR_FATAL, PVC_SOC_HBM_SS2_4),
+ [XE_PVC_SOC_ERROR_FATAL_HBM(1, 5)] = SOC_ERR_INDEX(INTEL_GT_SOC_IEH1, INTEL_SOC_REG_GLOBAL, HARDWARE_ERROR_FATAL, PVC_SOC_HBM_SS2_5),
+ [XE_PVC_SOC_ERROR_FATAL_HBM(1, 6)] = SOC_ERR_INDEX(INTEL_GT_SOC_IEH1, INTEL_SOC_REG_GLOBAL, HARDWARE_ERROR_FATAL, PVC_SOC_HBM_SS2_6),
+ [XE_PVC_SOC_ERROR_FATAL_HBM(1, 7)] = SOC_ERR_INDEX(INTEL_GT_SOC_IEH1, INTEL_SOC_REG_GLOBAL, HARDWARE_ERROR_FATAL, PVC_SOC_HBM_SS2_7),
+ [XE_PVC_SOC_ERROR_FATAL_HBM(1, 8)] = SOC_ERR_INDEX(INTEL_GT_SOC_IEH1, INTEL_SOC_REG_GLOBAL, HARDWARE_ERROR_FATAL, PVC_SOC_HBM_SS3_0),
+ [XE_PVC_SOC_ERROR_FATAL_HBM(1, 9)] = SOC_ERR_INDEX(INTEL_GT_SOC_IEH1, INTEL_SOC_REG_GLOBAL, HARDWARE_ERROR_FATAL, PVC_SOC_HBM_SS3_1),
+ [XE_PVC_SOC_ERROR_FATAL_HBM(1, 10)] = SOC_ERR_INDEX(INTEL_GT_SOC_IEH1, INTEL_SOC_REG_GLOBAL, HARDWARE_ERROR_FATAL, PVC_SOC_HBM_SS3_2),
+ [XE_PVC_SOC_ERROR_FATAL_HBM(1, 11)] = SOC_ERR_INDEX(INTEL_GT_SOC_IEH1, INTEL_SOC_REG_GLOBAL, HARDWARE_ERROR_FATAL, PVC_SOC_HBM_SS3_3),
+ [XE_PVC_SOC_ERROR_FATAL_HBM(1, 12)] = SOC_ERR_INDEX(INTEL_GT_SOC_IEH1, INTEL_SOC_REG_GLOBAL, HARDWARE_ERROR_FATAL, PVC_SOC_HBM_SS3_4),
+ [XE_PVC_SOC_ERROR_FATAL_HBM(1, 13)] = SOC_ERR_INDEX(INTEL_GT_SOC_IEH1, INTEL_SOC_REG_GLOBAL, HARDWARE_ERROR_FATAL, PVC_SOC_HBM_SS3_5),
+ [XE_PVC_SOC_ERROR_FATAL_HBM(1, 14)] = SOC_ERR_INDEX(INTEL_GT_SOC_IEH1, INTEL_SOC_REG_GLOBAL, HARDWARE_ERROR_FATAL, PVC_SOC_HBM_SS3_6),
+ [XE_PVC_SOC_ERROR_FATAL_HBM(1, 15)] = SOC_ERR_INDEX(INTEL_GT_SOC_IEH1, INTEL_SOC_REG_GLOBAL, HARDWARE_ERROR_FATAL, PVC_SOC_HBM_SS3_7),
+ [XE_GSC_ERROR_CORRECTABLE_SRAM_ECC] = INTEL_GSC_HW_ERROR_COR_SRAM_ECC,
+ [XE_GSC_ERROR_NONFATAL_MIA_SHUTDOWN] = INTEL_GSC_HW_ERROR_UNCOR_MIA_SHUTDOWN,
+ [XE_GSC_ERROR_NONFATAL_MIA_INT] = INTEL_GSC_HW_ERROR_UNCOR_MIA_INT,
+ [XE_GSC_ERROR_NONFATAL_SRAM_ECC] = INTEL_GSC_HW_ERROR_UNCOR_SRAM_ECC,
+ [XE_GSC_ERROR_NONFATAL_WDG_TIMEOUT] = INTEL_GSC_HW_ERROR_UNCOR_WDG_TIMEOUT,
+ [XE_GSC_ERROR_NONFATAL_ROM_PARITY] = INTEL_GSC_HW_ERROR_UNCOR_ROM_PARITY,
+ [XE_GSC_ERROR_NONFATAL_UCODE_PARITY] = INTEL_GSC_HW_ERROR_UNCOR_UCODE_PARITY,
+ [XE_GSC_ERROR_NONFATAL_GLITCH_DET] = INTEL_GSC_HW_ERROR_UNCOR_GLITCH_DET,
+ [XE_GSC_ERROR_NONFATAL_FUSE_PULL] = INTEL_GSC_HW_ERROR_UNCOR_FUSE_PULL,
+ [XE_GSC_ERROR_NONFATAL_FUSE_CRC_CHECK] = INTEL_GSC_HW_ERROR_UNCOR_FUSE_CRC_CHECK,
+ [XE_GSC_ERROR_NONFATAL_FUSE_SELFMBIST] = INTEL_GSC_HW_ERROR_UNCOR_SELFMBIST,
+ [XE_GSC_ERROR_NONFATAL_AON_PARITY] = INTEL_GSC_HW_ERROR_UNCOR_AON_PARITY,
+};
+
+static unsigned int config_gt_id(const u64 config)
+{
+ return config >> __XE_GT_SHIFT;
+}
+
+static u64 config_counter(const u64 config)
+{
+ return config & ~(~0ULL << __XE_GT_SHIFT);
+}
+
+static bool is_gt_vector_error(const u64 config)
{
+ unsigned int error;
+
+ error = config_counter(config);
+ if (error >= XE_GT_ERROR_FATAL_TLB &&
+ error <= XE_GT_ERROR_FATAL_L3BANK)
+ return true;
+
+ return false;
+}
+
+static bool is_pvc_invalid_gt_errors(const u64 config)
+{
+ switch (config_counter(config)) {
+ case XE_GT_ERROR_CORRECTABLE_L3_SNG:
+ case XE_GT_ERROR_CORRECTABLE_SAMPLER:
+ case XE_GT_ERROR_FATAL_ARR_BIST:
+ case XE_GT_ERROR_FATAL_L3_DOUB:
+ case XE_GT_ERROR_FATAL_L3_ECC_CHK:
+ case XE_GT_ERROR_FATAL_IDI_PAR:
+ case XE_GT_ERROR_FATAL_SQIDI:
+ case XE_GT_ERROR_FATAL_SAMPLER:
+ case XE_GT_ERROR_FATAL_EU_IC:
+ return true;
+ default:
+ return false;
+ }
+}
+
+static bool is_gsc_hw_error(const u64 config)
+{
+ if (config_counter(config) >= XE_GSC_ERROR_CORRECTABLE_SRAM_ECC &&
+ config_counter(config) <= XE_GSC_ERROR_NONFATAL_AON_PARITY)
+ return true;
+
+ return false;
+}
+
+static bool is_soc_error(const u64 config)
+{
+ if (config_counter(config) >= XE_SOC_ERROR_FATAL_PSF_CSC_0 &&
+ config_counter(config) <= XE_PVC_SOC_ERROR_FATAL_HBM(1, 15))
+ return true;
+
+ return false;
+}
+
+static int
+config_status(struct xe_device *xe, u64 config)
+{
+ unsigned int gt_id = config_gt_id(config);
+
+ if (!IS_DGFX(xe))
+ return -ENODEV;
+
+ if (xe->gt[gt_id].info.type == XE_GT_TYPE_UNINITIALIZED)
+ return -ENOENT;
+
+ /* GSC HW ERRORS are present on root tile of
+ * platform supporting MEMORY SPARING only
+ */
+ if (is_gsc_hw_error(config) && !(HAS_MEM_SPARING_SUPPORT(xe) && gt_id == 0))
+ return -ENODEV;
+
+ /* GT vectors error are valid on Platforms supporting error vectors only */
+ if (is_gt_vector_error(config) && !HAS_GT_ERROR_VECTORS(xe))
+ return -ENODEV;
+
+ /* Skip gt errors not supported on pvc */
+ if (is_pvc_invalid_gt_errors(config) && (xe->info.platform == XE_PVC))
+ return -ENODEV;
+
+ /* FATAL FPU error is valid on PVC only */
+ if (config_counter(config) == XE_GT_ERROR_FATAL_FPU &&
+ !(xe->info.platform == XE_PVC))
+ return -ENODEV;
+
+ if (is_soc_error(config) && !(xe->info.platform == XE_PVC))
+ return -ENODEV;
+
+ return (config_counter(config) >=
+ ARRAY_SIZE(xe_hw_error_map)) ? -ENOENT : 0;
+}
+
+static u64 get_counter_value(struct xe_device *xe, u64 config)
+{
+ const unsigned int gt_id = config_gt_id(config);
+ unsigned int id = config_counter(config);
+
+ if (is_soc_error(config))
+ return xa_to_value(xa_load(&xe->gt[gt_id].errors.soc, xe_hw_error_map[id]));
+ else if (is_gsc_hw_error(config))
+ return xe->gt[gt_id].errors.gsc_hw[xe_hw_error_map[id]];
+ else if (id >= XE_SGUNIT_ERROR_CORRECTABLE &&
+ id <= XE_SGUNIT_ERROR_FATAL)
+ return xe->gt[gt_id].errors.sgunit[xe_hw_error_map[id]];
+ else
+ return xe->gt[gt_id].errors.hw[xe_hw_error_map[id]];
+
return 0;
}
-static int xe_genl_read_error(struct sk_buff *msg, struct genl_info *info)
+static struct xe_device *genl_to_xe(struct genl_info *info)
+{
+ return xa_load(&xe_xarray, info->nlhdr->nlmsg_type);
+}
+
+static int xe_genl_send(struct sk_buff *msg, struct genl_info *info, void *usrhdr)
{
+ int ret;
+
+ genlmsg_end(msg, usrhdr);
+
+ ret = genlmsg_reply(msg, info);
+ if (ret)
+ nlmsg_free(msg);
+
+ return ret;
+}
+
+static struct sk_buff *
+xe_genl_alloc_msg(struct xe_device *xe,
+ struct genl_info *info,
+ size_t msg_size, void **usrhdr)
+{
+ struct sk_buff *new_msg;
+
+ new_msg = genlmsg_new(msg_size, GFP_KERNEL);
+ if (!new_msg)
+ return new_msg;
+
+ *usrhdr = genlmsg_put_reply(new_msg, info, &xe->xe_genl_family, 0, info->genlhdr->cmd);
+ if (!*usrhdr) {
+ nlmsg_free(new_msg);
+ new_msg = NULL;
+ }
+
+ return new_msg;
+}
+
+int fill_error_details(struct genl_info *info, struct sk_buff *new_msg)
+{
+ struct xe_device *xe = genl_to_xe(info);
+ struct nlattr *entry_attr;
+ struct xe_gt *gt;
+ int i, j;
+ bool counter = false;
+
+ if (info->genlhdr->cmd == DRM_CMD_READ_ALL)
+ counter = true;
+
+ entry_attr = nla_nest_start(new_msg, DRM_ATTR_QUERY_REPLY);
+ if (!entry_attr)
+ return -EMSGSIZE;
+
+ for_each_gt(gt, xe, j) {
+ char str[MAX_ERROR_NAME];
+ u64 val;
+
+ for (i = 0; i < ARRAY_SIZE(xe_hw_error_events); i++) {
+ u64 config = XE_HW_ERROR(j, i);
+
+ if (config_status(xe, config))
+ continue;
+
+ /* should this be cleared everytime */
+ snprintf(str, sizeof(str), "error-gt%d-%s", j, xe_hw_error_events[i]);
+
+ if (nla_put_string(new_msg, DRM_ATTR_ERROR_NAME, str))
+ goto err;
+ if (nla_put_u64_64bit(new_msg, DRM_ATTR_ERROR_ID, config, DRM_ATTR_PAD))
+ goto err;
+ if (counter) {
+ val = get_counter_value(xe, config);
+ if (nla_put_u64_64bit(new_msg, DRM_ATTR_ERROR_VALUE, val, DRM_ATTR_PAD))
+ goto err;
+ }
+ }
+ }
+
+ nla_nest_end(new_msg, entry_attr);
+
return 0;
+err:
+ drm_dbg_driver(&xe->drm, "msg buff is small\n");
+ nla_nest_cancel(new_msg, entry_attr);
+ nlmsg_free(new_msg);
+
+ return -EMSGSIZE;
+}
+
+static int xe_genl_list_errors(struct sk_buff *msg, struct genl_info *info)
+{
+ struct xe_device *xe = genl_to_xe(info);
+ size_t msg_size = NLMSG_DEFAULT_SIZE;
+ struct sk_buff *new_msg;
+ void *usrhdr;
+ int ret = 0;
+ int retries = 2;
+
+ if (GENL_REQ_ATTR_CHECK(info, DRM_ATTR_REQUEST))
+ return -EINVAL;
+
+ do {
+ new_msg = xe_genl_alloc_msg(xe, info, msg_size, &usrhdr);
+ if (!new_msg)
+ return -ENOMEM;
+
+ ret = fill_error_details(info, new_msg);
+ if (!ret)
+ break;
+
+ msg_size += NLMSG_DEFAULT_SIZE;
+ } while (retries--);
+
+ if (!ret)
+ ret = xe_genl_send(new_msg, info, usrhdr);
+
+ return ret;
+}
+
+static int xe_genl_read_error(struct sk_buff *msg, struct genl_info *info)
+{
+ struct xe_device *xe = genl_to_xe(info);
+ size_t msg_size = NLMSG_DEFAULT_SIZE;
+ struct sk_buff *new_msg;
+ void *usrhdr;
+ int ret = 0;
+ int retries = 2;
+ u64 config, val;
+
+ if (GENL_REQ_ATTR_CHECK(info, DRM_ATTR_ERROR_ID))
+ return -EINVAL;
+
+ config = nla_get_u64(info->attrs[DRM_ATTR_ERROR_ID]);
+ ret = config_status(xe, config);
+ if (ret)
+ return ret;
+ do {
+ new_msg = xe_genl_alloc_msg(xe, info, msg_size, &usrhdr);
+ if (!new_msg)
+ return -ENOMEM;
+
+ val = get_counter_value(xe, config);
+ if (nla_put_u64_64bit(new_msg, DRM_ATTR_ERROR_VALUE, val, DRM_ATTR_PAD)) {
+ msg_size += NLMSG_DEFAULT_SIZE;
+ continue;
+ }
+
+ break;
+ } while (retries--);
+
+ ret = xe_genl_send(new_msg, info, usrhdr);
+
+ return ret;
}
/* operations definition */
@@ -65,6 +497,9 @@ int xe_genl_register(struct xe_device *xe)
{
int ret;
+ BUILD_BUG_ON(ARRAY_SIZE(xe_hw_error_events) !=
+ ARRAY_SIZE(xe_hw_error_map));
+
xe_genl_family_init(xe);
ret = genl_register_family(&xe->xe_genl_family);
diff --git a/include/uapi/drm/xe_drm.h b/include/uapi/drm/xe_drm.h
index b0b80aae3ee8..a2ea238096df 100644
--- a/include/uapi/drm/xe_drm.h
+++ b/include/uapi/drm/xe_drm.h
@@ -801,6 +801,70 @@ struct drm_xe_vm_madvise {
__u64 reserved[2];
};
+/*
+ * HW error IDs
+ */
+
+#define __XE_GT_SHIFT (60)
+
+#define XE_HW_ERROR(gt, id) \
+ ((id) | ((__u64)(gt) << __XE_GT_SHIFT))
+
+#define XE_GT_ERROR_CORRECTABLE_L3_SNG (0)
+#define XE_GT_ERROR_CORRECTABLE_GUC (1)
+#define XE_GT_ERROR_CORRECTABLE_SAMPLER (2)
+#define XE_GT_ERROR_CORRECTABLE_SLM (3)
+#define XE_GT_ERROR_CORRECTABLE_EU_IC (4)
+#define XE_GT_ERROR_CORRECTABLE_EU_GRF (5)
+#define XE_GT_ERROR_FATAL_ARR_BIST (6)
+#define XE_GT_ERROR_FATAL_L3_DOUB (7)
+#define XE_GT_ERROR_FATAL_L3_ECC_CHK (8)
+#define XE_GT_ERROR_FATAL_GUC (9)
+#define XE_GT_ERROR_FATAL_IDI_PAR (10)
+#define XE_GT_ERROR_FATAL_SQIDI (11)
+#define XE_GT_ERROR_FATAL_SAMPLER (12)
+#define XE_GT_ERROR_FATAL_SLM (13)
+#define XE_GT_ERROR_FATAL_EU_IC (14)
+#define XE_GT_ERROR_FATAL_EU_GRF (15)
+#define XE_GT_ERROR_FATAL_FPU (16)
+#define XE_GT_ERROR_FATAL_TLB (17)
+#define XE_GT_ERROR_FATAL_L3_FABRIC (18)
+#define XE_GT_ERROR_CORRECTABLE_SUBSLICE (19)
+#define XE_GT_ERROR_CORRECTABLE_L3BANK (20)
+#define XE_GT_ERROR_FATAL_SUBSLICE (21)
+#define XE_GT_ERROR_FATAL_L3BANK (22)
+#define XE_SGUNIT_ERROR_CORRECTABLE (23)
+#define XE_SGUNIT_ERROR_NONFATAL (24)
+#define XE_SGUNIT_ERROR_FATAL (25)
+#define XE_SOC_ERROR_FATAL_PSF_CSC_0 (26)
+#define XE_SOC_ERROR_FATAL_PSF_CSC_1 (27)
+#define XE_SOC_ERROR_FATAL_PSF_CSC_2 (28)
+#define XE_SOC_ERROR_FATAL_PUNIT (29)
+#define XE_PVC_SOC_ERROR_FATAL_PSF_0 (30)
+#define XE_PVC_SOC_ERROR_FATAL_PSF_1 (31)
+#define XE_PVC_SOC_ERROR_FATAL_PSF_2 (32)
+#define XE_PVC_SOC_ERROR_FATAL_CD0 (33)
+#define XE_PVC_SOC_ERROR_FATAL_CD0_MDFI (34)
+#define XE_PVC_SOC_ERROR_FATAL_MDFI_EAST (35)
+#define XE_PVC_SOC_ERROR_FATAL_MDFI_SOUTH (36)
+
+#define XE_PVC_SOC_ERROR_FATAL_HBM(ss, n)\
+ (XE_PVC_SOC_ERROR_FATAL_MDFI_SOUTH + 0x1 + (ss) * 0x10 + (n))
+
+/* 68 is the last ID used by SOC errors */
+#define XE_GSC_ERROR_CORRECTABLE_SRAM_ECC (69)
+#define XE_GSC_ERROR_NONFATAL_MIA_SHUTDOWN (70)
+#define XE_GSC_ERROR_NONFATAL_MIA_INT (71)
+#define XE_GSC_ERROR_NONFATAL_SRAM_ECC (72)
+#define XE_GSC_ERROR_NONFATAL_WDG_TIMEOUT (73)
+#define XE_GSC_ERROR_NONFATAL_ROM_PARITY (74)
+#define XE_GSC_ERROR_NONFATAL_UCODE_PARITY (75)
+#define XE_GSC_ERROR_NONFATAL_GLITCH_DET (76)
+#define XE_GSC_ERROR_NONFATAL_FUSE_PULL (77)
+#define XE_GSC_ERROR_NONFATAL_FUSE_CRC_CHECK (78)
+#define XE_GSC_ERROR_NONFATAL_FUSE_SELFMBIST (79)
+#define XE_GSC_ERROR_NONFATAL_AON_PARITY (80)
+
#if defined(__cplusplus)
}
#endif
--
2.25.1
More information about the Intel-xe
mailing list