[Intel-xe] [RFC 3/5] drm/xe/RAS: Expose the error counters

Aravind Iddamsetty aravind.iddamsetty at intel.com
Fri May 26 16:20:14 UTC 2023


We expose the various error counters supported on a hardware via genl
subsystem through the registered commands to userspace.
The DRM_CMD_QUERY lists the error names with config id, DRM_CMD_READ_ONE
returns the counter value for the requested config id and the
DRM_CMD_READ_ALL list the counters for all errors along with their names
and config ids.

Signed-off-by: Aravind Iddamsetty <aravind.iddamsetty at intel.com>
---
 drivers/gpu/drm/xe/xe_netlink.c | 439 +++++++++++++++++++++++++++++++-
 include/uapi/drm/xe_drm.h       |  64 +++++
 2 files changed, 501 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/xe/xe_netlink.c b/drivers/gpu/drm/xe/xe_netlink.c
index 63ef238ebc27..2a6965f5cde9 100644
--- a/drivers/gpu/drm/xe/xe_netlink.c
+++ b/drivers/gpu/drm/xe/xe_netlink.c
@@ -4,19 +4,451 @@
  */
 
 #include <drm/drm_managed.h>
+#include <drm/xe_drm.h>
 
 #include "xe_device.h"
 
+#define MAX_ERROR_NAME	50
+
+#define HAS_GT_ERROR_VECTORS(xe)	((xe)->info.has_gt_error_vectors)
+#define HAS_MEM_SPARING_SUPPORT(xe)	((xe)->info.has_mem_sparing)
+
 DEFINE_XARRAY(xe_xarray);
 
-static int xe_genl_list_errors(struct sk_buff *msg, struct genl_info *info)
+static const char * const xe_hw_error_events[] = {
+		[XE_GT_ERROR_CORRECTABLE_L3_SNG] = "correctable-l3-sng",
+		[XE_GT_ERROR_CORRECTABLE_GUC] = "correctable-guc",
+		[XE_GT_ERROR_CORRECTABLE_SAMPLER] = "correctable-sampler",
+		[XE_GT_ERROR_CORRECTABLE_SLM] = "correctable-slm",
+		[XE_GT_ERROR_CORRECTABLE_EU_IC] = "correctable-eu-ic",
+		[XE_GT_ERROR_CORRECTABLE_EU_GRF] = "correctable-eu-grf",
+		[XE_GT_ERROR_FATAL_ARR_BIST] = "fatal-array-bist",
+		[XE_GT_ERROR_FATAL_L3_DOUB] = "fatal-l3-double",
+		[XE_GT_ERROR_FATAL_L3_ECC_CHK] = "fatal-l3-ecc-checker",
+		[XE_GT_ERROR_FATAL_GUC] = "fatal-guc",
+		[XE_GT_ERROR_FATAL_IDI_PAR] = "fatal-idi-parity",
+		[XE_GT_ERROR_FATAL_SQIDI] = "fatal-sqidi",
+		[XE_GT_ERROR_FATAL_SAMPLER] = "fatal-sampler",
+		[XE_GT_ERROR_FATAL_SLM] = "fatal-slm",
+		[XE_GT_ERROR_FATAL_EU_IC] = "fatal-eu-ic",
+		[XE_GT_ERROR_FATAL_EU_GRF] = "fatal-eu-grf",
+		[XE_GT_ERROR_FATAL_FPU] = "fatal-fpu",
+		[XE_GT_ERROR_FATAL_TLB] = "fatal-tlb",
+		[XE_GT_ERROR_FATAL_L3_FABRIC] = "fatal-l3-fabric",
+		[XE_GT_ERROR_CORRECTABLE_SUBSLICE] = "correctable-subslice",
+		[XE_GT_ERROR_CORRECTABLE_L3BANK] = "correctable-l3bank",
+		[XE_GT_ERROR_FATAL_SUBSLICE] = "fatal-subslice",
+		[XE_GT_ERROR_FATAL_L3BANK] = "fatal-l3bank",
+		[XE_SGUNIT_ERROR_CORRECTABLE] = "sgunit-correctable",
+		[XE_SGUNIT_ERROR_NONFATAL] = "sgunit-nonfatal",
+		[XE_SGUNIT_ERROR_FATAL] = "sgunit-fatal",
+		[XE_SOC_ERROR_FATAL_PSF_CSC_0] = "soc-fatal-psf-csc-0",
+		[XE_SOC_ERROR_FATAL_PSF_CSC_1] = "soc-fatal-psf-csc-1",
+		[XE_SOC_ERROR_FATAL_PSF_CSC_2] = "soc-fatal-psf-csc-2",
+		[XE_SOC_ERROR_FATAL_PUNIT] = "soc-fatal-punit",
+		[XE_PVC_SOC_ERROR_FATAL_PSF_0] = "soc-fatal-psf-0",
+		[XE_PVC_SOC_ERROR_FATAL_PSF_1] = "soc-fatal-psf-1",
+		[XE_PVC_SOC_ERROR_FATAL_PSF_2] = "soc-fatal-psf-2",
+		[XE_PVC_SOC_ERROR_FATAL_CD0] = "soc-fatal-cd0",
+		[XE_PVC_SOC_ERROR_FATAL_CD0_MDFI] = "soc-fatal-cd0-mdfi",
+		[XE_PVC_SOC_ERROR_FATAL_MDFI_EAST] = "soc-fatal-mdfi-east",
+		[XE_PVC_SOC_ERROR_FATAL_MDFI_SOUTH] = "soc-fatal-mdfi-south",
+		[XE_PVC_SOC_ERROR_FATAL_HBM(0, 0)] = "soc-fatal-hbm-ss0-0",
+		[XE_PVC_SOC_ERROR_FATAL_HBM(0, 1)] = "soc-fatal-hbm-ss0-1",
+		[XE_PVC_SOC_ERROR_FATAL_HBM(0, 2)] = "soc-fatal-hbm-ss0-2",
+		[XE_PVC_SOC_ERROR_FATAL_HBM(0, 3)] = "soc-fatal-hbm-ss0-3",
+		[XE_PVC_SOC_ERROR_FATAL_HBM(0, 4)] = "soc-fatal-hbm-ss0-4",
+		[XE_PVC_SOC_ERROR_FATAL_HBM(0, 5)] = "soc-fatal-hbm-ss0-5",
+		[XE_PVC_SOC_ERROR_FATAL_HBM(0, 6)] = "soc-fatal-hbm-ss0-6",
+		[XE_PVC_SOC_ERROR_FATAL_HBM(0, 7)] = "soc-fatal-hbm-ss0-7",
+		[XE_PVC_SOC_ERROR_FATAL_HBM(0, 8)] = "soc-fatal-hbm-ss1-0",
+		[XE_PVC_SOC_ERROR_FATAL_HBM(0, 9)] = "soc-fatal-hbm-ss1-1",
+		[XE_PVC_SOC_ERROR_FATAL_HBM(0, 10)] = "soc-fatal-hbm-ss1-2",
+		[XE_PVC_SOC_ERROR_FATAL_HBM(0, 11)] = "soc-fatal-hbm-ss1-3",
+		[XE_PVC_SOC_ERROR_FATAL_HBM(0, 12)] = "soc-fatal-hbm-ss1-4",
+		[XE_PVC_SOC_ERROR_FATAL_HBM(0, 13)] = "soc-fatal-hbm-ss1-5",
+		[XE_PVC_SOC_ERROR_FATAL_HBM(0, 14)] = "soc-fatal-hbm-ss1-6",
+		[XE_PVC_SOC_ERROR_FATAL_HBM(0, 15)] = "soc-fatal-hbm-ss1-7",
+		[XE_PVC_SOC_ERROR_FATAL_HBM(1, 0)] = "soc-fatal-hbm-ss2-0",
+		[XE_PVC_SOC_ERROR_FATAL_HBM(1, 1)] = "soc-fatal-hbm-ss2-1",
+		[XE_PVC_SOC_ERROR_FATAL_HBM(1, 2)] = "soc-fatal-hbm-ss2-2",
+		[XE_PVC_SOC_ERROR_FATAL_HBM(1, 3)] = "soc-fatal-hbm-ss2-3",
+		[XE_PVC_SOC_ERROR_FATAL_HBM(1, 4)] = "soc-fatal-hbm-ss2-4",
+		[XE_PVC_SOC_ERROR_FATAL_HBM(1, 5)] = "soc-fatal-hbm-ss2-5",
+		[XE_PVC_SOC_ERROR_FATAL_HBM(1, 6)] = "soc-fatal-hbm-ss2-6",
+		[XE_PVC_SOC_ERROR_FATAL_HBM(1, 7)] = "soc-fatal-hbm-ss2-7",
+		[XE_PVC_SOC_ERROR_FATAL_HBM(1, 8)] = "soc-fatal-hbm-ss3-0",
+		[XE_PVC_SOC_ERROR_FATAL_HBM(1, 9)] = "soc-fatal-hbm-ss3-1",
+		[XE_PVC_SOC_ERROR_FATAL_HBM(1, 10)] = "soc-fatal-hbm-ss3-2",
+		[XE_PVC_SOC_ERROR_FATAL_HBM(1, 11)] = "soc-fatal-hbm-ss3-3",
+		[XE_PVC_SOC_ERROR_FATAL_HBM(1, 12)] = "soc-fatal-hbm-ss3-4",
+		[XE_PVC_SOC_ERROR_FATAL_HBM(1, 13)] = "soc-fatal-hbm-ss3-5",
+		[XE_PVC_SOC_ERROR_FATAL_HBM(1, 14)] = "soc-fatal-hbm-ss3-6",
+		[XE_PVC_SOC_ERROR_FATAL_HBM(1, 15)] = "soc-fatal-hbm-ss3-7",
+		[XE_GSC_ERROR_CORRECTABLE_SRAM_ECC] = "gsc-correctable-sram-ecc",
+		[XE_GSC_ERROR_NONFATAL_MIA_SHUTDOWN] = "gsc-nonfatal-mia-shutdown",
+		[XE_GSC_ERROR_NONFATAL_MIA_INT] = "gsc-nonfatal-mia-int",
+		[XE_GSC_ERROR_NONFATAL_SRAM_ECC] = "gsc-nonfatal-sram-ecc",
+		[XE_GSC_ERROR_NONFATAL_WDG_TIMEOUT] = "gsc-nonfatal-wdg-timeout",
+		[XE_GSC_ERROR_NONFATAL_ROM_PARITY] = "gsc-nonfatal-rom-parity",
+		[XE_GSC_ERROR_NONFATAL_UCODE_PARITY] = "gsc-nonfatal-ucode-parity",
+		[XE_GSC_ERROR_NONFATAL_GLITCH_DET] = "gsc-nonfatal-glitch-det",
+		[XE_GSC_ERROR_NONFATAL_FUSE_PULL] = "gsc-nonfatal-fuse-pull",
+		[XE_GSC_ERROR_NONFATAL_FUSE_CRC_CHECK] = "gsc-nonfatal-fuse-crc-check",
+		[XE_GSC_ERROR_NONFATAL_FUSE_SELFMBIST] = "gsc-nonfatal-selfmbist",
+		[XE_GSC_ERROR_NONFATAL_AON_PARITY] = "gsc-nonfatal-aon-parity",
+};
+
+static const unsigned long xe_hw_error_map[] = {
+	[XE_GT_ERROR_CORRECTABLE_L3_SNG] = INTEL_GT_HW_ERROR_COR_L3_SNG,
+	[XE_GT_ERROR_CORRECTABLE_GUC] = INTEL_GT_HW_ERROR_COR_GUC,
+	[XE_GT_ERROR_CORRECTABLE_SAMPLER] = INTEL_GT_HW_ERROR_COR_SAMPLER,
+	[XE_GT_ERROR_CORRECTABLE_SLM] = INTEL_GT_HW_ERROR_COR_SLM,
+	[XE_GT_ERROR_CORRECTABLE_EU_IC] = INTEL_GT_HW_ERROR_COR_EU_IC,
+	[XE_GT_ERROR_CORRECTABLE_EU_GRF] = INTEL_GT_HW_ERROR_COR_EU_GRF,
+	[XE_GT_ERROR_FATAL_ARR_BIST] = INTEL_GT_HW_ERROR_FAT_ARR_BIST,
+	[XE_GT_ERROR_FATAL_L3_DOUB] = INTEL_GT_HW_ERROR_FAT_L3_DOUB,
+	[XE_GT_ERROR_FATAL_L3_ECC_CHK] = INTEL_GT_HW_ERROR_FAT_L3_ECC_CHK,
+	[XE_GT_ERROR_FATAL_GUC] = INTEL_GT_HW_ERROR_FAT_GUC,
+	[XE_GT_ERROR_FATAL_IDI_PAR] = INTEL_GT_HW_ERROR_FAT_IDI_PAR,
+	[XE_GT_ERROR_FATAL_SQIDI] = INTEL_GT_HW_ERROR_FAT_SQIDI,
+	[XE_GT_ERROR_FATAL_SAMPLER] = INTEL_GT_HW_ERROR_FAT_SAMPLER,
+	[XE_GT_ERROR_FATAL_SLM] = INTEL_GT_HW_ERROR_FAT_SLM,
+	[XE_GT_ERROR_FATAL_EU_IC] = INTEL_GT_HW_ERROR_FAT_EU_IC,
+	[XE_GT_ERROR_FATAL_EU_GRF] = INTEL_GT_HW_ERROR_FAT_EU_GRF,
+	[XE_GT_ERROR_FATAL_FPU] = INTEL_GT_HW_ERROR_FAT_FPU,
+	[XE_GT_ERROR_FATAL_TLB] = INTEL_GT_HW_ERROR_FAT_TLB,
+	[XE_GT_ERROR_FATAL_L3_FABRIC] = INTEL_GT_HW_ERROR_FAT_L3_FABRIC,
+	[XE_GT_ERROR_CORRECTABLE_SUBSLICE] = INTEL_GT_HW_ERROR_COR_SUBSLICE,
+	[XE_GT_ERROR_CORRECTABLE_L3BANK] = INTEL_GT_HW_ERROR_COR_L3BANK,
+	[XE_GT_ERROR_FATAL_SUBSLICE] = INTEL_GT_HW_ERROR_FAT_SUBSLICE,
+	[XE_GT_ERROR_FATAL_L3BANK] = INTEL_GT_HW_ERROR_FAT_L3BANK,
+	[XE_SGUNIT_ERROR_CORRECTABLE] = HARDWARE_ERROR_CORRECTABLE,
+	[XE_SGUNIT_ERROR_NONFATAL] = HARDWARE_ERROR_NONFATAL,
+	[XE_SGUNIT_ERROR_FATAL] = HARDWARE_ERROR_FATAL,
+	[XE_SOC_ERROR_FATAL_PSF_CSC_0] = SOC_ERR_INDEX(INTEL_GT_SOC_IEH0, INTEL_SOC_REG_LOCAL, HARDWARE_ERROR_FATAL, SOC_PSF_CSC_0),
+	[XE_SOC_ERROR_FATAL_PSF_CSC_1] = SOC_ERR_INDEX(INTEL_GT_SOC_IEH0, INTEL_SOC_REG_LOCAL, HARDWARE_ERROR_FATAL, SOC_PSF_CSC_1),
+	[XE_SOC_ERROR_FATAL_PSF_CSC_2] = SOC_ERR_INDEX(INTEL_GT_SOC_IEH0, INTEL_SOC_REG_LOCAL, HARDWARE_ERROR_FATAL, SOC_PSF_CSC_2),
+	[XE_SOC_ERROR_FATAL_PUNIT] = SOC_ERR_INDEX(INTEL_GT_SOC_IEH0, INTEL_SOC_REG_GLOBAL, HARDWARE_ERROR_FATAL, SOC_PUNIT),
+	[XE_PVC_SOC_ERROR_FATAL_PSF_0] = SOC_ERR_INDEX(INTEL_GT_SOC_IEH0, INTEL_SOC_REG_LOCAL, HARDWARE_ERROR_FATAL, PVC_SOC_PSF_0),
+	[XE_PVC_SOC_ERROR_FATAL_PSF_1] = SOC_ERR_INDEX(INTEL_GT_SOC_IEH0, INTEL_SOC_REG_LOCAL, HARDWARE_ERROR_FATAL, PVC_SOC_PSF_1),
+	[XE_PVC_SOC_ERROR_FATAL_PSF_2] = SOC_ERR_INDEX(INTEL_GT_SOC_IEH0, INTEL_SOC_REG_LOCAL, HARDWARE_ERROR_FATAL, PVC_SOC_PSF_2),
+	[XE_PVC_SOC_ERROR_FATAL_CD0] = SOC_ERR_INDEX(INTEL_GT_SOC_IEH0, INTEL_SOC_REG_GLOBAL, HARDWARE_ERROR_FATAL, PVC_SOC_CD0),
+	[XE_PVC_SOC_ERROR_FATAL_CD0_MDFI] = SOC_ERR_INDEX(INTEL_GT_SOC_IEH0, INTEL_SOC_REG_GLOBAL, HARDWARE_ERROR_FATAL, PVC_SOC_CD0_MDFI),
+	[XE_PVC_SOC_ERROR_FATAL_MDFI_EAST] = SOC_ERR_INDEX(INTEL_GT_SOC_IEH0, INTEL_SOC_REG_LOCAL, HARDWARE_ERROR_FATAL, PVC_SOC_MDFI_EAST),
+	[XE_PVC_SOC_ERROR_FATAL_MDFI_SOUTH] = SOC_ERR_INDEX(INTEL_GT_SOC_IEH0, INTEL_SOC_REG_LOCAL, HARDWARE_ERROR_FATAL, PVC_SOC_MDFI_SOUTH),
+	[XE_PVC_SOC_ERROR_FATAL_HBM(0, 0)] = SOC_ERR_INDEX(INTEL_GT_SOC_IEH0, INTEL_SOC_REG_GLOBAL, HARDWARE_ERROR_FATAL, SOC_HBM_SS0_0),
+	[XE_PVC_SOC_ERROR_FATAL_HBM(0, 1)] = SOC_ERR_INDEX(INTEL_GT_SOC_IEH0, INTEL_SOC_REG_GLOBAL, HARDWARE_ERROR_FATAL, SOC_HBM_SS0_1),
+	[XE_PVC_SOC_ERROR_FATAL_HBM(0, 2)] = SOC_ERR_INDEX(INTEL_GT_SOC_IEH0, INTEL_SOC_REG_GLOBAL, HARDWARE_ERROR_FATAL, SOC_HBM_SS0_2),
+	[XE_PVC_SOC_ERROR_FATAL_HBM(0, 3)] = SOC_ERR_INDEX(INTEL_GT_SOC_IEH0, INTEL_SOC_REG_GLOBAL, HARDWARE_ERROR_FATAL, SOC_HBM_SS0_3),
+	[XE_PVC_SOC_ERROR_FATAL_HBM(0, 4)] = SOC_ERR_INDEX(INTEL_GT_SOC_IEH0, INTEL_SOC_REG_GLOBAL, HARDWARE_ERROR_FATAL, SOC_HBM_SS0_4),
+	[XE_PVC_SOC_ERROR_FATAL_HBM(0, 5)] = SOC_ERR_INDEX(INTEL_GT_SOC_IEH0, INTEL_SOC_REG_GLOBAL, HARDWARE_ERROR_FATAL, SOC_HBM_SS0_5),
+	[XE_PVC_SOC_ERROR_FATAL_HBM(0, 6)] = SOC_ERR_INDEX(INTEL_GT_SOC_IEH0, INTEL_SOC_REG_GLOBAL, HARDWARE_ERROR_FATAL, SOC_HBM_SS0_6),
+	[XE_PVC_SOC_ERROR_FATAL_HBM(0, 7)] = SOC_ERR_INDEX(INTEL_GT_SOC_IEH0, INTEL_SOC_REG_GLOBAL, HARDWARE_ERROR_FATAL, SOC_HBM_SS0_7),
+	[XE_PVC_SOC_ERROR_FATAL_HBM(0, 8)] = SOC_ERR_INDEX(INTEL_GT_SOC_IEH0, INTEL_SOC_REG_GLOBAL, HARDWARE_ERROR_FATAL, PVC_SOC_HBM_SS1_0),
+	[XE_PVC_SOC_ERROR_FATAL_HBM(0, 9)] = SOC_ERR_INDEX(INTEL_GT_SOC_IEH0, INTEL_SOC_REG_GLOBAL, HARDWARE_ERROR_FATAL, PVC_SOC_HBM_SS1_1),
+	[XE_PVC_SOC_ERROR_FATAL_HBM(0, 10)] = SOC_ERR_INDEX(INTEL_GT_SOC_IEH0, INTEL_SOC_REG_GLOBAL, HARDWARE_ERROR_FATAL, PVC_SOC_HBM_SS1_2),
+	[XE_PVC_SOC_ERROR_FATAL_HBM(0, 11)] = SOC_ERR_INDEX(INTEL_GT_SOC_IEH0, INTEL_SOC_REG_GLOBAL, HARDWARE_ERROR_FATAL, PVC_SOC_HBM_SS1_3),
+	[XE_PVC_SOC_ERROR_FATAL_HBM(0, 12)] = SOC_ERR_INDEX(INTEL_GT_SOC_IEH0, INTEL_SOC_REG_GLOBAL, HARDWARE_ERROR_FATAL, PVC_SOC_HBM_SS1_4),
+	[XE_PVC_SOC_ERROR_FATAL_HBM(0, 13)] = SOC_ERR_INDEX(INTEL_GT_SOC_IEH0, INTEL_SOC_REG_GLOBAL, HARDWARE_ERROR_FATAL, PVC_SOC_HBM_SS1_5),
+	[XE_PVC_SOC_ERROR_FATAL_HBM(0, 14)] = SOC_ERR_INDEX(INTEL_GT_SOC_IEH0, INTEL_SOC_REG_GLOBAL, HARDWARE_ERROR_FATAL, PVC_SOC_HBM_SS1_6),
+	[XE_PVC_SOC_ERROR_FATAL_HBM(0, 15)] = SOC_ERR_INDEX(INTEL_GT_SOC_IEH0, INTEL_SOC_REG_GLOBAL, HARDWARE_ERROR_FATAL, PVC_SOC_HBM_SS1_7),
+	[XE_PVC_SOC_ERROR_FATAL_HBM(1, 0)] = SOC_ERR_INDEX(INTEL_GT_SOC_IEH1, INTEL_SOC_REG_GLOBAL, HARDWARE_ERROR_FATAL, PVC_SOC_HBM_SS2_0),
+	[XE_PVC_SOC_ERROR_FATAL_HBM(1, 1)] = SOC_ERR_INDEX(INTEL_GT_SOC_IEH1, INTEL_SOC_REG_GLOBAL, HARDWARE_ERROR_FATAL, PVC_SOC_HBM_SS2_1),
+	[XE_PVC_SOC_ERROR_FATAL_HBM(1, 2)] = SOC_ERR_INDEX(INTEL_GT_SOC_IEH1, INTEL_SOC_REG_GLOBAL, HARDWARE_ERROR_FATAL, PVC_SOC_HBM_SS2_2),
+	[XE_PVC_SOC_ERROR_FATAL_HBM(1, 3)] = SOC_ERR_INDEX(INTEL_GT_SOC_IEH1, INTEL_SOC_REG_GLOBAL, HARDWARE_ERROR_FATAL, PVC_SOC_HBM_SS2_3),
+	[XE_PVC_SOC_ERROR_FATAL_HBM(1, 4)] = SOC_ERR_INDEX(INTEL_GT_SOC_IEH1, INTEL_SOC_REG_GLOBAL, HARDWARE_ERROR_FATAL, PVC_SOC_HBM_SS2_4),
+	[XE_PVC_SOC_ERROR_FATAL_HBM(1, 5)] = SOC_ERR_INDEX(INTEL_GT_SOC_IEH1, INTEL_SOC_REG_GLOBAL, HARDWARE_ERROR_FATAL, PVC_SOC_HBM_SS2_5),
+	[XE_PVC_SOC_ERROR_FATAL_HBM(1, 6)] = SOC_ERR_INDEX(INTEL_GT_SOC_IEH1, INTEL_SOC_REG_GLOBAL, HARDWARE_ERROR_FATAL, PVC_SOC_HBM_SS2_6),
+	[XE_PVC_SOC_ERROR_FATAL_HBM(1, 7)] = SOC_ERR_INDEX(INTEL_GT_SOC_IEH1, INTEL_SOC_REG_GLOBAL, HARDWARE_ERROR_FATAL, PVC_SOC_HBM_SS2_7),
+	[XE_PVC_SOC_ERROR_FATAL_HBM(1, 8)] = SOC_ERR_INDEX(INTEL_GT_SOC_IEH1, INTEL_SOC_REG_GLOBAL, HARDWARE_ERROR_FATAL, PVC_SOC_HBM_SS3_0),
+	[XE_PVC_SOC_ERROR_FATAL_HBM(1, 9)] = SOC_ERR_INDEX(INTEL_GT_SOC_IEH1, INTEL_SOC_REG_GLOBAL, HARDWARE_ERROR_FATAL, PVC_SOC_HBM_SS3_1),
+	[XE_PVC_SOC_ERROR_FATAL_HBM(1, 10)] = SOC_ERR_INDEX(INTEL_GT_SOC_IEH1, INTEL_SOC_REG_GLOBAL, HARDWARE_ERROR_FATAL, PVC_SOC_HBM_SS3_2),
+	[XE_PVC_SOC_ERROR_FATAL_HBM(1, 11)] = SOC_ERR_INDEX(INTEL_GT_SOC_IEH1, INTEL_SOC_REG_GLOBAL, HARDWARE_ERROR_FATAL, PVC_SOC_HBM_SS3_3),
+	[XE_PVC_SOC_ERROR_FATAL_HBM(1, 12)] = SOC_ERR_INDEX(INTEL_GT_SOC_IEH1, INTEL_SOC_REG_GLOBAL, HARDWARE_ERROR_FATAL, PVC_SOC_HBM_SS3_4),
+	[XE_PVC_SOC_ERROR_FATAL_HBM(1, 13)] = SOC_ERR_INDEX(INTEL_GT_SOC_IEH1, INTEL_SOC_REG_GLOBAL, HARDWARE_ERROR_FATAL, PVC_SOC_HBM_SS3_5),
+	[XE_PVC_SOC_ERROR_FATAL_HBM(1, 14)] = SOC_ERR_INDEX(INTEL_GT_SOC_IEH1, INTEL_SOC_REG_GLOBAL, HARDWARE_ERROR_FATAL, PVC_SOC_HBM_SS3_6),
+	[XE_PVC_SOC_ERROR_FATAL_HBM(1, 15)] = SOC_ERR_INDEX(INTEL_GT_SOC_IEH1, INTEL_SOC_REG_GLOBAL, HARDWARE_ERROR_FATAL, PVC_SOC_HBM_SS3_7),
+	[XE_GSC_ERROR_CORRECTABLE_SRAM_ECC] = INTEL_GSC_HW_ERROR_COR_SRAM_ECC,
+	[XE_GSC_ERROR_NONFATAL_MIA_SHUTDOWN] = INTEL_GSC_HW_ERROR_UNCOR_MIA_SHUTDOWN,
+	[XE_GSC_ERROR_NONFATAL_MIA_INT] = INTEL_GSC_HW_ERROR_UNCOR_MIA_INT,
+	[XE_GSC_ERROR_NONFATAL_SRAM_ECC] = INTEL_GSC_HW_ERROR_UNCOR_SRAM_ECC,
+	[XE_GSC_ERROR_NONFATAL_WDG_TIMEOUT] = INTEL_GSC_HW_ERROR_UNCOR_WDG_TIMEOUT,
+	[XE_GSC_ERROR_NONFATAL_ROM_PARITY] = INTEL_GSC_HW_ERROR_UNCOR_ROM_PARITY,
+	[XE_GSC_ERROR_NONFATAL_UCODE_PARITY] = INTEL_GSC_HW_ERROR_UNCOR_UCODE_PARITY,
+	[XE_GSC_ERROR_NONFATAL_GLITCH_DET] = INTEL_GSC_HW_ERROR_UNCOR_GLITCH_DET,
+	[XE_GSC_ERROR_NONFATAL_FUSE_PULL] = INTEL_GSC_HW_ERROR_UNCOR_FUSE_PULL,
+	[XE_GSC_ERROR_NONFATAL_FUSE_CRC_CHECK] = INTEL_GSC_HW_ERROR_UNCOR_FUSE_CRC_CHECK,
+	[XE_GSC_ERROR_NONFATAL_FUSE_SELFMBIST] = INTEL_GSC_HW_ERROR_UNCOR_SELFMBIST,
+	[XE_GSC_ERROR_NONFATAL_AON_PARITY] = INTEL_GSC_HW_ERROR_UNCOR_AON_PARITY,
+};
+
+static unsigned int config_gt_id(const u64 config)
+{
+	return config >> __XE_GT_SHIFT;
+}
+
+static u64 config_counter(const u64 config)
+{
+	return config & ~(~0ULL << __XE_GT_SHIFT);
+}
+
+static bool is_gt_vector_error(const u64 config)
 {
+	unsigned int error;
+
+	error = config_counter(config);
+	if (error >= XE_GT_ERROR_FATAL_TLB &&
+	    error <= XE_GT_ERROR_FATAL_L3BANK)
+		return true;
+
+	return false;
+}
+
+static bool is_pvc_invalid_gt_errors(const u64 config)
+{
+	switch (config_counter(config)) {
+	case XE_GT_ERROR_CORRECTABLE_L3_SNG:
+	case XE_GT_ERROR_CORRECTABLE_SAMPLER:
+	case XE_GT_ERROR_FATAL_ARR_BIST:
+	case XE_GT_ERROR_FATAL_L3_DOUB:
+	case XE_GT_ERROR_FATAL_L3_ECC_CHK:
+	case XE_GT_ERROR_FATAL_IDI_PAR:
+	case XE_GT_ERROR_FATAL_SQIDI:
+	case XE_GT_ERROR_FATAL_SAMPLER:
+	case XE_GT_ERROR_FATAL_EU_IC:
+		return true;
+	default:
+		return false;
+	}
+}
+
+static bool is_gsc_hw_error(const u64 config)
+{
+	if (config_counter(config) >= XE_GSC_ERROR_CORRECTABLE_SRAM_ECC &&
+	    config_counter(config) <= XE_GSC_ERROR_NONFATAL_AON_PARITY)
+		return true;
+
+	return false;
+}
+
+static bool is_soc_error(const u64 config)
+{
+	if (config_counter(config) >= XE_SOC_ERROR_FATAL_PSF_CSC_0 &&
+	    config_counter(config) <= XE_PVC_SOC_ERROR_FATAL_HBM(1, 15))
+		return true;
+
+	return false;
+}
+
+static int
+config_status(struct xe_device *xe, u64 config)
+{
+	unsigned int gt_id = config_gt_id(config);
+
+	if (!IS_DGFX(xe))
+		return -ENODEV;
+
+	if (xe->gt[gt_id].info.type == XE_GT_TYPE_UNINITIALIZED)
+		return -ENOENT;
+
+	/* GSC HW ERRORS are present on root tile of
+	 * platform supporting MEMORY SPARING only
+	 */
+	if (is_gsc_hw_error(config) && !(HAS_MEM_SPARING_SUPPORT(xe) && gt_id == 0))
+		return -ENODEV;
+
+	/* GT vectors error  are valid on Platforms supporting error vectors only */
+	if (is_gt_vector_error(config) && !HAS_GT_ERROR_VECTORS(xe))
+		return -ENODEV;
+
+	/* Skip gt errors not supported on pvc */
+	if (is_pvc_invalid_gt_errors(config) && (xe->info.platform == XE_PVC))
+		return  -ENODEV;
+
+	/* FATAL FPU error is valid on PVC only */
+	if (config_counter(config) == XE_GT_ERROR_FATAL_FPU &&
+	    !(xe->info.platform == XE_PVC))
+		return -ENODEV;
+
+	if (is_soc_error(config) && !(xe->info.platform == XE_PVC))
+		return -ENODEV;
+
+	return (config_counter(config) >=
+			ARRAY_SIZE(xe_hw_error_map)) ? -ENOENT : 0;
+}
+
+static u64 get_counter_value(struct xe_device *xe, u64 config)
+{
+	const unsigned int gt_id = config_gt_id(config);
+	unsigned int id = config_counter(config);
+
+	if (is_soc_error(config))
+		return xa_to_value(xa_load(&xe->gt[gt_id].errors.soc, xe_hw_error_map[id]));
+	else if (is_gsc_hw_error(config))
+		return xe->gt[gt_id].errors.gsc_hw[xe_hw_error_map[id]];
+	else if (id >= XE_SGUNIT_ERROR_CORRECTABLE &&
+		 id <= XE_SGUNIT_ERROR_FATAL)
+		return xe->gt[gt_id].errors.sgunit[xe_hw_error_map[id]];
+	else
+		return xe->gt[gt_id].errors.hw[xe_hw_error_map[id]];
+
 	return 0;
 }
 
-static int xe_genl_read_error(struct sk_buff *msg, struct genl_info *info)
+static struct xe_device *genl_to_xe(struct genl_info *info)
+{
+	return xa_load(&xe_xarray, info->nlhdr->nlmsg_type);
+}
+
+static int xe_genl_send(struct sk_buff *msg, struct genl_info *info, void *usrhdr)
 {
+	int ret;
+
+	genlmsg_end(msg, usrhdr);
+
+	ret = genlmsg_reply(msg, info);
+	if (ret)
+		nlmsg_free(msg);
+
+	return ret;
+}
+
+static struct sk_buff *
+xe_genl_alloc_msg(struct xe_device *xe,
+		  struct genl_info *info,
+		  size_t msg_size, void **usrhdr)
+{
+	struct sk_buff *new_msg;
+
+	new_msg = genlmsg_new(msg_size, GFP_KERNEL);
+	if (!new_msg)
+		return new_msg;
+
+	*usrhdr = genlmsg_put_reply(new_msg, info, &xe->xe_genl_family, 0, info->genlhdr->cmd);
+	if (!*usrhdr) {
+		nlmsg_free(new_msg);
+		new_msg = NULL;
+	}
+
+	return new_msg;
+}
+
+int fill_error_details(struct genl_info *info, struct sk_buff *new_msg)
+{
+	struct xe_device *xe = genl_to_xe(info);
+	struct nlattr *entry_attr;
+	struct xe_gt *gt;
+	int i, j;
+	bool counter = false;
+
+	if (info->genlhdr->cmd == DRM_CMD_READ_ALL)
+		counter = true;
+
+	entry_attr = nla_nest_start(new_msg, DRM_ATTR_QUERY_REPLY);
+	if (!entry_attr)
+		return -EMSGSIZE;
+
+	for_each_gt(gt, xe, j) {
+		char str[MAX_ERROR_NAME];
+		u64 val;
+
+		for (i = 0; i < ARRAY_SIZE(xe_hw_error_events); i++) {
+			u64 config = XE_HW_ERROR(j, i);
+
+			if (config_status(xe, config))
+				continue;
+
+			/* should this be cleared everytime */
+			snprintf(str, sizeof(str), "error-gt%d-%s", j, xe_hw_error_events[i]);
+
+			if (nla_put_string(new_msg, DRM_ATTR_ERROR_NAME, str))
+				goto err;
+			if (nla_put_u64_64bit(new_msg, DRM_ATTR_ERROR_ID, config, DRM_ATTR_PAD))
+				goto err;
+			if (counter) {
+				val = get_counter_value(xe, config);
+				if (nla_put_u64_64bit(new_msg, DRM_ATTR_ERROR_VALUE, val, DRM_ATTR_PAD))
+					goto err;
+			}
+		}
+	}
+
+	nla_nest_end(new_msg, entry_attr);
+
 	return 0;
+err:
+	drm_dbg_driver(&xe->drm, "msg buff is small\n");
+	nla_nest_cancel(new_msg, entry_attr);
+	nlmsg_free(new_msg);
+
+	return -EMSGSIZE;
+}
+
+static int xe_genl_list_errors(struct sk_buff *msg, struct genl_info *info)
+{
+	struct xe_device *xe = genl_to_xe(info);
+	size_t msg_size = NLMSG_DEFAULT_SIZE;
+	struct sk_buff *new_msg;
+	void *usrhdr;
+	int ret = 0;
+	int retries = 2;
+
+	if (GENL_REQ_ATTR_CHECK(info, DRM_ATTR_REQUEST))
+		return -EINVAL;
+
+	do {
+		new_msg = xe_genl_alloc_msg(xe, info, msg_size, &usrhdr);
+		if (!new_msg)
+			return -ENOMEM;
+
+		ret = fill_error_details(info, new_msg);
+		if (!ret)
+			break;
+
+		msg_size += NLMSG_DEFAULT_SIZE;
+	} while (retries--);
+
+	if (!ret)
+		ret = xe_genl_send(new_msg, info, usrhdr);
+
+	return ret;
+}
+
+static int xe_genl_read_error(struct sk_buff *msg, struct genl_info *info)
+{
+	struct xe_device *xe = genl_to_xe(info);
+	size_t msg_size = NLMSG_DEFAULT_SIZE;
+	struct sk_buff *new_msg;
+	void *usrhdr;
+	int ret = 0;
+	int retries = 2;
+	u64 config, val;
+
+	if (GENL_REQ_ATTR_CHECK(info, DRM_ATTR_ERROR_ID))
+		return -EINVAL;
+
+	config = nla_get_u64(info->attrs[DRM_ATTR_ERROR_ID]);
+	ret = config_status(xe, config);
+	if (ret)
+		return ret;
+	do {
+		new_msg = xe_genl_alloc_msg(xe, info, msg_size, &usrhdr);
+		if (!new_msg)
+			return -ENOMEM;
+
+		val = get_counter_value(xe, config);
+		if (nla_put_u64_64bit(new_msg, DRM_ATTR_ERROR_VALUE, val, DRM_ATTR_PAD)) {
+			msg_size += NLMSG_DEFAULT_SIZE;
+			continue;
+		}
+
+		break;
+	} while (retries--);
+
+	ret = xe_genl_send(new_msg, info, usrhdr);
+
+	return ret;
 }
 
 /* operations definition */
@@ -65,6 +497,9 @@ int xe_genl_register(struct xe_device *xe)
 {
 	int ret;
 
+	BUILD_BUG_ON(ARRAY_SIZE(xe_hw_error_events) !=
+		     ARRAY_SIZE(xe_hw_error_map));
+
 	xe_genl_family_init(xe);
 
 	ret = genl_register_family(&xe->xe_genl_family);
diff --git a/include/uapi/drm/xe_drm.h b/include/uapi/drm/xe_drm.h
index b0b80aae3ee8..a2ea238096df 100644
--- a/include/uapi/drm/xe_drm.h
+++ b/include/uapi/drm/xe_drm.h
@@ -801,6 +801,70 @@ struct drm_xe_vm_madvise {
 	__u64 reserved[2];
 };
 
+/*
+ * HW error IDs
+ */
+
+#define __XE_GT_SHIFT	(60)
+
+#define XE_HW_ERROR(gt, id) \
+	((id) | ((__u64)(gt) << __XE_GT_SHIFT))
+
+#define XE_GT_ERROR_CORRECTABLE_L3_SNG		(0)
+#define XE_GT_ERROR_CORRECTABLE_GUC		(1)
+#define XE_GT_ERROR_CORRECTABLE_SAMPLER		(2)
+#define XE_GT_ERROR_CORRECTABLE_SLM		(3)
+#define XE_GT_ERROR_CORRECTABLE_EU_IC		(4)
+#define XE_GT_ERROR_CORRECTABLE_EU_GRF		(5)
+#define XE_GT_ERROR_FATAL_ARR_BIST		(6)
+#define XE_GT_ERROR_FATAL_L3_DOUB		(7)
+#define XE_GT_ERROR_FATAL_L3_ECC_CHK		(8)
+#define XE_GT_ERROR_FATAL_GUC			(9)
+#define XE_GT_ERROR_FATAL_IDI_PAR		(10)
+#define XE_GT_ERROR_FATAL_SQIDI			(11)
+#define XE_GT_ERROR_FATAL_SAMPLER		(12)
+#define XE_GT_ERROR_FATAL_SLM			(13)
+#define XE_GT_ERROR_FATAL_EU_IC			(14)
+#define XE_GT_ERROR_FATAL_EU_GRF		(15)
+#define XE_GT_ERROR_FATAL_FPU			(16)
+#define XE_GT_ERROR_FATAL_TLB			(17)
+#define XE_GT_ERROR_FATAL_L3_FABRIC		(18)
+#define XE_GT_ERROR_CORRECTABLE_SUBSLICE	(19)
+#define XE_GT_ERROR_CORRECTABLE_L3BANK		(20)
+#define XE_GT_ERROR_FATAL_SUBSLICE		(21)
+#define XE_GT_ERROR_FATAL_L3BANK		(22)
+#define XE_SGUNIT_ERROR_CORRECTABLE		(23)
+#define XE_SGUNIT_ERROR_NONFATAL		(24)
+#define XE_SGUNIT_ERROR_FATAL			(25)
+#define XE_SOC_ERROR_FATAL_PSF_CSC_0		(26)
+#define XE_SOC_ERROR_FATAL_PSF_CSC_1		(27)
+#define XE_SOC_ERROR_FATAL_PSF_CSC_2		(28)
+#define XE_SOC_ERROR_FATAL_PUNIT		(29)
+#define XE_PVC_SOC_ERROR_FATAL_PSF_0		(30)
+#define XE_PVC_SOC_ERROR_FATAL_PSF_1		(31)
+#define XE_PVC_SOC_ERROR_FATAL_PSF_2		(32)
+#define XE_PVC_SOC_ERROR_FATAL_CD0		(33)
+#define XE_PVC_SOC_ERROR_FATAL_CD0_MDFI		(34)
+#define XE_PVC_SOC_ERROR_FATAL_MDFI_EAST	(35)
+#define XE_PVC_SOC_ERROR_FATAL_MDFI_SOUTH	(36)
+
+#define XE_PVC_SOC_ERROR_FATAL_HBM(ss, n)\
+		(XE_PVC_SOC_ERROR_FATAL_MDFI_SOUTH + 0x1 + (ss) * 0x10 + (n))
+
+/* 68 is the last ID used by SOC errors */
+#define XE_GSC_ERROR_CORRECTABLE_SRAM_ECC	(69)
+#define XE_GSC_ERROR_NONFATAL_MIA_SHUTDOWN	(70)
+#define XE_GSC_ERROR_NONFATAL_MIA_INT		(71)
+#define XE_GSC_ERROR_NONFATAL_SRAM_ECC		(72)
+#define XE_GSC_ERROR_NONFATAL_WDG_TIMEOUT	(73)
+#define XE_GSC_ERROR_NONFATAL_ROM_PARITY	(74)
+#define XE_GSC_ERROR_NONFATAL_UCODE_PARITY	(75)
+#define XE_GSC_ERROR_NONFATAL_GLITCH_DET	(76)
+#define XE_GSC_ERROR_NONFATAL_FUSE_PULL		(77)
+#define XE_GSC_ERROR_NONFATAL_FUSE_CRC_CHECK	(78)
+#define XE_GSC_ERROR_NONFATAL_FUSE_SELFMBIST	(79)
+#define XE_GSC_ERROR_NONFATAL_AON_PARITY	(80)
+
 #if defined(__cplusplus)
 }
 #endif
-- 
2.25.1



More information about the Intel-xe mailing list