<!DOCTYPE html><html><head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
  </head>
  <body>
    <p><br>
    </p>
    <div class="moz-cite-prefix">On 10-08-2023 15:01, Ghimiray, Himal
      Prasad wrote:<br>
    </div>
    <blockquote type="cite" cite="mid:ac574fd9-dce4-4f1e-a209-f4a400263273@intel.com">
      
      <p><br>
      </p>
      <div class="moz-cite-prefix">On 10-08-2023 13:24, Jani Nikula
        wrote:<br>
      </div>
      <blockquote type="cite" cite="mid:877cq3s567.fsf@intel.com">
        <pre class="moz-quote-pre" wrap="">On Thu, 10 Aug 2023, Himal Prasad Ghimiray <a class="moz-txt-link-rfc2396E" href="mailto:himal.prasad.ghimiray@intel.com" moz-do-not-send="true">&lt;himal.prasad.ghimiray@intel.com&gt;</a> wrote:
</pre>
        <blockquote type="cite">
          <pre class="moz-quote-pre" wrap="">The GFX device can generate a number of classes of errors under the new
infrastructure: correctable, non-fatal, and fatal errors.

The non-fatal and fatal error classes distinguish between levels of
severity for uncorrectable errors. Driver will only handle logging
of errors and updating counters from various components within the
graphics device. Anything more will be handled at system level.

For errors that will route as interrupts, three bits in the Master
Interrupt Register will be used to convey the class of error.

For each class of error: Determine source of error (IP block) by reading
the Device Error Source Register (RW1C) that
corresponds to the class of error being serviced.

Bspec: 50875, 53073, 53074, 53075

Cc: Rodrigo Vivi <a class="moz-txt-link-rfc2396E" href="mailto:rodrigo.vivi@intel.com" moz-do-not-send="true">&lt;rodrigo.vivi@intel.com&gt;</a>
Cc: Aravind Iddamsetty <a class="moz-txt-link-rfc2396E" href="mailto:aravind.iddamsetty@intel.com" moz-do-not-send="true">&lt;aravind.iddamsetty@intel.com&gt;</a>
Cc: Matthew Brost <a class="moz-txt-link-rfc2396E" href="mailto:matthew.brost@intel.com" moz-do-not-send="true">&lt;matthew.brost@intel.com&gt;</a>
Cc: Matt Roper <a class="moz-txt-link-rfc2396E" href="mailto:matthew.d.roper@intel.com" moz-do-not-send="true">&lt;matthew.d.roper@intel.com&gt;</a>
Cc: Joonas Lahtinen <a class="moz-txt-link-rfc2396E" href="mailto:joonas.lahtinen@linux.intel.com" moz-do-not-send="true">&lt;joonas.lahtinen@linux.intel.com&gt;</a>
Cc: Jani Nikula <a class="moz-txt-link-rfc2396E" href="mailto:jani.nikula@intel.com" moz-do-not-send="true">&lt;jani.nikula@intel.com&gt;</a>
Signed-off-by: Himal Prasad Ghimiray <a class="moz-txt-link-rfc2396E" href="mailto:himal.prasad.ghimiray@intel.com" moz-do-not-send="true">&lt;himal.prasad.ghimiray@intel.com&gt;</a>
---
 drivers/gpu/drm/xe/regs/xe_regs.h            |   7 +
 drivers/gpu/drm/xe/regs/xe_tile_error_regs.h | 108 +++++++++
 drivers/gpu/drm/xe/xe_device_types.h         |   6 +
 drivers/gpu/drm/xe/xe_irq.c                  | 220 +++++++++++++++++++
 4 files changed, 341 insertions(+)
 create mode 100644 drivers/gpu/drm/xe/regs/xe_tile_error_regs.h

diff --git a/drivers/gpu/drm/xe/regs/xe_regs.h b/drivers/gpu/drm/xe/regs/xe_regs.h
index ec45b1ba9db1..9901e55fc89c 100644
--- a/drivers/gpu/drm/xe/regs/xe_regs.h
+++ b/drivers/gpu/drm/xe/regs/xe_regs.h
@@ -87,7 +87,14 @@
 #define   GU_MISC_IRQ                          REG_BIT(29)
 #define   DISPLAY_IRQ                          REG_BIT(16)
 #define   GT_DW_IRQ(x)                         REG_BIT(x)
+#define   XE_ERROR_IRQ(x)                      REG_BIT(26 + (x))
 
 #define PVC_RP_STATE_CAP                       XE_REG(0x281014)
 
+enum hardware_error {
+       HARDWARE_ERROR_CORRECTABLE = 0,
+       HARDWARE_ERROR_NONFATAL = 1,
+       HARDWARE_ERROR_FATAL = 2,
+       HARDWARE_ERROR_MAX,
+};
</pre>
        </blockquote>
        <pre class="moz-quote-pre" wrap="">This file is about registers. IMO enums belong somewhere else. Define
hardware registers using macros.</pre>
      </blockquote>
      <p>Hmm. Will look for a better place for this enum.<br>
      </p>
      <blockquote type="cite" cite="mid:877cq3s567.fsf@intel.com">
        <blockquote type="cite">
          <pre class="moz-quote-pre" wrap=""> #endif
diff --git a/drivers/gpu/drm/xe/regs/xe_tile_error_regs.h b/drivers/gpu/drm/xe/regs/xe_tile_error_regs.h
new file mode 100644
index 000000000000..fbb794b2f183
--- /dev/null
+++ b/drivers/gpu/drm/xe/regs/xe_tile_error_regs.h
@@ -0,0 +1,108 @@
+/* SPDX-License-Identifier: MIT */
+/*
+ * Copyright © 2023 Intel Corporation
+ */
+#ifndef XE_TILE_ERROR_REGS_H_
+#define XE_TILE_ERROR_REGS_H_
+
+#include &lt;linux/stddef.h&gt;
+
+#define _DEV_ERR_STAT_NONFATAL                         0x100178
+#define _DEV_ERR_STAT_CORRECTABLE                      0x10017c
+#define DEV_ERR_STAT_REG(x)                            XE_REG(_PICK_EVEN((x), \
+                                                               _DEV_ERR_STAT_CORRECTABLE, \
+                                                               _DEV_ERR_STAT_NONFATAL))
+
+#define  DEV_ERR_STAT_MAX_ERROR_BIT         (21)
+
+/* Count of Correctable and Uncorrectable errors reported on tile */
+enum xe_tile_hw_errors {
+       XE_TILE_HW_ERR_GT_FATAL = 0,
+       XE_TILE_HW_ERR_SGGI_FATAL,
+       XE_TILE_HW_ERR_DISPLAY_FATAL,
+       XE_TILE_HW_ERR_SGDI_FATAL,
+       XE_TILE_HW_ERR_SGLI_FATAL,
+       XE_TILE_HW_ERR_SGUNIT_FATAL,
+       XE_TILE_HW_ERR_SGCI_FATAL,
+       XE_TILE_HW_ERR_GSC_FATAL,
+       XE_TILE_HW_ERR_SOC_FATAL,
+       XE_TILE_HW_ERR_MERT_FATAL,
+       XE_TILE_HW_ERR_SGMI_FATAL,
+       XE_TILE_HW_ERR_UNKNOWN_FATAL,
+       XE_TILE_HW_ERR_SGGI_NONFATAL,
+       XE_TILE_HW_ERR_DISPLAY_NONFATAL,
+       XE_TILE_HW_ERR_SGDI_NONFATAL,
+       XE_TILE_HW_ERR_SGLI_NONFATAL,
+       XE_TILE_HW_ERR_GT_NONFATAL,
+       XE_TILE_HW_ERR_SGUNIT_NONFATAL,
+       XE_TILE_HW_ERR_SGCI_NONFATAL,
+       XE_TILE_HW_ERR_GSC_NONFATAL,
+       XE_TILE_HW_ERR_SOC_NONFATAL,
+       XE_TILE_HW_ERR_MERT_NONFATAL,
+       XE_TILE_HW_ERR_SGMI_NONFATAL,
+       XE_TILE_HW_ERR_UNKNOWN_NONFATAL,
+       XE_TILE_HW_ERR_GT_CORR,
+       XE_TILE_HW_ERR_DISPLAY_CORR,
+       XE_TILE_HW_ERR_SGUNIT_CORR,
+       XE_TILE_HW_ERR_GSC_CORR,
+       XE_TILE_HW_ERR_SOC_CORR,
+       XE_TILE_HW_ERR_UNKNOWN_CORR,
+};
</pre>
        </blockquote>
        <pre class="moz-quote-pre" wrap="">Ditto about enums and regs.</pre>
      </blockquote>
      Will address.<br>
      <blockquote type="cite" cite="mid:877cq3s567.fsf@intel.com">
        <blockquote type="cite">
          <pre class="moz-quote-pre" wrap="">+
+#define XE_TILE_HW_ERROR_MAX (XE_TILE_HW_ERR_UNKNOWN_CORR + 1)
</pre>
        </blockquote>
        <pre class="moz-quote-pre" wrap="">If it's an enum, adding that last in the enum does the trick.</pre>
      </blockquote>
      Makes sense.<br>
      <blockquote type="cite" cite="mid:877cq3s567.fsf@intel.com">
        <blockquote type="cite">
          <pre class="moz-quote-pre" wrap="">+
+#define PVC_DEV_ERR_STAT_FATAL_MASK \
+               (REG_BIT(0) | \
+                REG_BIT(1) | \
+                REG_BIT(8) | \
+                REG_BIT(9) | \
+                REG_BIT(13) | \
+                REG_BIT(16) | \
+                REG_BIT(20))
+
+#define PVC_DEV_ERR_STAT_NONFATAL_MASK \
+               (REG_BIT(0) | \
+                REG_BIT(1) | \
+                REG_BIT(8) | \
+                REG_BIT(9) | \
+                REG_BIT(13) | \
+                REG_BIT(16) | \
+                REG_BIT(20))
+
+#define PVC_DEV_ERR_STAT_CORRECTABLE_MASK \
+               (REG_BIT(0) | \
+                REG_BIT(8))
+
+#define DG2_DEV_ERR_STAT_FATAL_MASK \
+               (REG_BIT(0) | \
+                REG_BIT(4) | \
+                REG_BIT(8) | \
+                REG_BIT(12) | \
+                REG_BIT(16))
+
+#define DG2_DEV_ERR_STAT_NONFATAL_MASK \
+               (REG_BIT(0) | \
+                REG_BIT(4) | \
+                REG_BIT(8) | \
+                REG_BIT(12) | \
+                REG_BIT(16) | \
+                REG_BIT(20))
+
+#define DG2_DEV_ERR_STAT_CORRECTABLE_MASK \
+               (REG_BIT(0) | \
+                REG_BIT(4) | \
+                REG_BIT(8) | \
+                REG_BIT(12) | \
+                REG_BIT(16))
</pre>
        </blockquote>
        <pre class="moz-quote-pre" wrap="">Are the above supposed to match what's in xe_tile_hw_errors? Seems
rather unmaintainable.
</pre>
      </blockquote>
      <pre class="moz-quote-pre" wrap="">xe_tile_hw_errors contains the superset of errors across all applicable platforms, and the mask determines
which bits are applicable to a given platform.
</pre>
      <blockquote type="cite" cite="mid:877cq3s567.fsf@intel.com">
        <blockquote type="cite">
          <pre class="moz-quote-pre" wrap="">+
+#define REG_SIZE 32
+
+#define xe_tile_log_hw_err(tile, fmt, ...) \
+       drm_err_ratelimited(&tile_to_xe(tile)->drm, HW_ERR "TILE%d detected " fmt, \
+                           tile->id, ##__VA_ARGS__)
+
+#define xe_tile_log_hw_warn(tile, fmt, ...) \
+       drm_warn(&tile_to_xe(tile)->drm, HW_ERR "TILE%d detected " fmt, \
+                tile->id, ##__VA_ARGS__)
</pre>
        </blockquote>
        <pre class="moz-quote-pre" wrap="">Do we really want to keep adding new macros for all possible scenarios
in the driver? This is getting out of hand.

Where's HW_ERR defined?</pre>
      </blockquote>
      include/linux/printk.h defines HW_ERR.<br>
      <blockquote type="cite" cite="mid:877cq3s567.fsf@intel.com">
        <blockquote type="cite">
          <pre class="moz-quote-pre" wrap="">+
+#endif
diff --git a/drivers/gpu/drm/xe/xe_device_types.h b/drivers/gpu/drm/xe/xe_device_types.h
index f84ecb976f5d..1335ba74981a 100644
--- a/drivers/gpu/drm/xe/xe_device_types.h
+++ b/drivers/gpu/drm/xe/xe_device_types.h
@@ -16,6 +16,7 @@
 #include "xe_gt_types.h"
 #include "xe_platform_types.h"
 #include "xe_step_types.h"
+#include "regs/xe_tile_error_regs.h"
 
 #if IS_ENABLED(CONFIG_DRM_XE_DISPLAY)
 #include "ext/intel_device_info.h"
@@ -166,6 +167,11 @@ struct xe_tile {
 
        /** @sysfs: sysfs' kobj used by xe_tile_sysfs */
        struct kobject *sysfs;
+
+       /** @tile_hw_errors: hardware errors reported for the tile */
+       struct tile_hw_errors {
+               unsigned long hw[XE_TILE_HW_ERROR_MAX];
</pre>
        </blockquote>
        <pre class="moz-quote-pre" wrap="">Even with the documentation comment, I have to look up the source code
to realize this is the *number* of errors for each class.

Maybe "count" is more informative than "hw".</pre>
      </blockquote>
      Ok.<br>
      <blockquote type="cite" cite="mid:877cq3s567.fsf@intel.com">
        <blockquote type="cite">
          <pre class="moz-quote-pre" wrap="">+        } errors;
 };
 
 /**
diff --git a/drivers/gpu/drm/xe/xe_irq.c b/drivers/gpu/drm/xe/xe_irq.c
index 2022a5643e01..04a665faea23 100644
--- a/drivers/gpu/drm/xe/xe_irq.c
+++ b/drivers/gpu/drm/xe/xe_irq.c
@@ -362,6 +362,223 @@ static void dg1_intr_enable(struct xe_device *xe, bool stall)
                xe_mmio_read32(mmio, DG1_MSTR_TILE_INTR);
 }
 
+static const char *
+hardware_error_type_to_str(const enum hardware_error hw_err)
+{
+       switch (hw_err) {
+       case HARDWARE_ERROR_CORRECTABLE:
+               return "CORRECTABLE";
+       case HARDWARE_ERROR_NONFATAL:
+               return "NONFATAL";
+       case HARDWARE_ERROR_FATAL:
+               return "FATAL";
+       default:
+               return "UNKNOWN";
+       }
+}
+
+struct error_msg_counter_pair {
+       const char *errmsg;
+       int errcounter;
</pre>
        </blockquote>
        <pre class="moz-quote-pre" wrap="">Counter? Or type/class/whatever?

</pre>
        <blockquote type="cite">
          <pre class="moz-quote-pre" wrap="">+};
+
+struct error_msg_counter_pair dev_err_stat_fatal_reg[] = {
+       {"GT",                        XE_TILE_HW_ERR_GT_FATAL                 /* Bit Pos 0 */},
</pre>
        </blockquote>
        <pre class="moz-quote-pre" wrap="">Does this again tie the enums and the bit positions together, similar to
how the mask macros also do above?

There needs to be a single point of truth for all of this.

I think this needs a redesign.</pre>
      </blockquote>
      <p>As commented above, this is an array that defines all valid bit
        positions irrespective of platform. The mask is used <br>
      </p>
      <p>to determine platform-specific applicability.<br>
      </p>
      <blockquote type="cite" cite="mid:877cq3s567.fsf@intel.com">
        <pre class="moz-quote-pre" wrap="">BR,
Jani.

</pre>
        <blockquote type="cite">
          <pre class="moz-quote-pre" wrap="">+        {"SGGI Cmd Parity",   XE_TILE_HW_ERR_SGGI_FATAL               /* Bit Pos 1 */},
+       {"Undefined",         XE_TILE_HW_ERR_UNKNOWN_FATAL},
+       {"Undefined",         XE_TILE_HW_ERR_UNKNOWN_FATAL},
+       {"DISPLAY",           XE_TILE_HW_ERR_DISPLAY_FATAL            /* Bit Pos 4 */},
+       {"Undefined",         XE_TILE_HW_ERR_UNKNOWN_FATAL},
+       {"Undefined",         XE_TILE_HW_ERR_UNKNOWN_FATAL},
+       {"Undefined",         XE_TILE_HW_ERR_UNKNOWN_FATAL},
+       {"GSC error",         XE_TILE_HW_ERR_GSC_FATAL                /* Bit Pos 8 */},
+       {"SGLI Cmd Parity",   XE_TILE_HW_ERR_SGLI_FATAL               /* Bit Pos 9 */},
+       {"Undefined",         XE_TILE_HW_ERR_UNKNOWN_FATAL},
+       {"Undefined",         XE_TILE_HW_ERR_UNKNOWN_FATAL},
+       {"SGUNIT",            XE_TILE_HW_ERR_SGUNIT_FATAL             /* Bit Pos 12 */},
+       {"SGCI Cmd Parity",   XE_TILE_HW_ERR_SGCI_FATAL               /* Bit Pos 13 */},
+       {"Undefined",         XE_TILE_HW_ERR_UNKNOWN_FATAL},
+       {"Undefined",         XE_TILE_HW_ERR_UNKNOWN_FATAL},
+       {"SOC ERROR",         XE_TILE_HW_ERR_SOC_FATAL                /* Bit Pos 16 */},
+       {"Undefined",         XE_TILE_HW_ERR_UNKNOWN_FATAL},
+       {"Undefined",         XE_TILE_HW_ERR_UNKNOWN_FATAL},
+       {"Undefined",         XE_TILE_HW_ERR_UNKNOWN_FATAL},
+       {"MERT Cmd Parity",   XE_TILE_HW_ERR_MERT_FATAL               /* Bit Pos 20 */},
+};
+
+struct error_msg_counter_pair dev_err_stat_nonfatal_reg[] = {
+       {"GT",                        XE_TILE_HW_ERR_GT_NONFATAL              /* Bit Pos 0 */},
+       {"SGGI Data Parity",  XE_TILE_HW_ERR_SGGI_NONFATAL            /* Bit Pos 1 */},
+       {"Undefined",         XE_TILE_HW_ERR_UNKNOWN_NONFATAL},
+       {"Undefined",         XE_TILE_HW_ERR_UNKNOWN_NONFATAL},
+       {"DISPLAY",           XE_TILE_HW_ERR_DISPLAY_NONFATAL         /* Bit Pos 4 */},
+       {"SGDI Data Parity",  XE_TILE_HW_ERR_SGDI_NONFATAL            /* Bit Pos 5 */},
+       {"Undefined",         XE_TILE_HW_ERR_UNKNOWN_NONFATAL},
+       {"Undefined",         XE_TILE_HW_ERR_UNKNOWN_NONFATAL},
+       {"GSC",                       XE_TILE_HW_ERR_GSC_NONFATAL             /* Bit Pos 8 */},
+       {"SGLI Data Parity",  XE_TILE_HW_ERR_SGLI_NONFATAL            /* Bit Pos 9 */},
+       {"Undefined",         XE_TILE_HW_ERR_UNKNOWN_NONFATAL},
+       {"Undefined",         XE_TILE_HW_ERR_UNKNOWN_NONFATAL},
+       {"SGUNIT",            XE_TILE_HW_ERR_SGUNIT_NONFATAL          /* Bit Pos 12 */},
+       {"SGCI Data Parity",  XE_TILE_HW_ERR_SGCI_NONFATAL            /* Bit Pos 13 */},
+       {"Undefined",         XE_TILE_HW_ERR_UNKNOWN_NONFATAL},
+       {"Undefined",         XE_TILE_HW_ERR_UNKNOWN_NONFATAL},
+       {"SOC",                       XE_TILE_HW_ERR_SOC_NONFATAL             /* Bit Pos 16 */},
+       {"Undefined",         XE_TILE_HW_ERR_UNKNOWN_NONFATAL         /* Bit Pos 17 */},
+       {"Undefined",         XE_TILE_HW_ERR_UNKNOWN_NONFATAL},
+       {"Undefined",         XE_TILE_HW_ERR_UNKNOWN_NONFATAL},
+       {"MERT Data Parity",  XE_TILE_HW_ERR_MERT_NONFATAL            /* Bit Pos 20 */},
+};
+
+struct error_msg_counter_pair dev_err_stat_correctable_reg[] = {
+       {"GT",                        XE_TILE_HW_ERR_GT_CORR                  /* Bit Pos 0 */},
+       {"Undefined",         XE_TILE_HW_ERR_UNKNOWN_CORR},
+       {"Undefined",         XE_TILE_HW_ERR_UNKNOWN_CORR},
+       {"Undefined",         XE_TILE_HW_ERR_UNKNOWN_CORR},
+       {"DISPLAY",           XE_TILE_HW_ERR_DISPLAY_CORR             /* Bit Pos 4 */},
+       {"Undefined",         XE_TILE_HW_ERR_UNKNOWN_CORR},
+       {"Undefined",         XE_TILE_HW_ERR_UNKNOWN_CORR},
+       {"Undefined",         XE_TILE_HW_ERR_UNKNOWN_CORR},
+       {"GSC",                       XE_TILE_HW_ERR_GSC_CORR                 /* Bit Pos 8 */},
+       {"Undefined",         XE_TILE_HW_ERR_UNKNOWN_CORR},
+       {"Undefined",         XE_TILE_HW_ERR_UNKNOWN_CORR},
+       {"Undefined",         XE_TILE_HW_ERR_UNKNOWN_CORR},
+       {"SGUNIT",            XE_TILE_HW_ERR_SGUNIT_CORR              /* Bit Pos 12 */},
+       {"Undefined",         XE_TILE_HW_ERR_UNKNOWN_CORR},
+       {"Undefined",         XE_TILE_HW_ERR_UNKNOWN_CORR},
+       {"Undefined",         XE_TILE_HW_ERR_UNKNOWN_CORR},
+       {"SOC",                       XE_TILE_HW_ERR_SOC_CORR                 /* Bit Pos 16 */},
+       {"Undefined",         XE_TILE_HW_ERR_UNKNOWN_CORR},
+       {"Undefined",         XE_TILE_HW_ERR_UNKNOWN_CORR},
+       {"Undefined",         XE_TILE_HW_ERR_UNKNOWN_CORR},
+       {"Undefined",         XE_TILE_HW_ERR_UNKNOWN_CORR},
+};
+
+static void update_valid_error_regs(struct xe_device *xe)
+{
+       unsigned long mask = 0;
+
+       u32 i;
+
+       if (xe->info.platform == XE_DG2) {
+               mask = ~(0 | DG2_DEV_ERR_STAT_FATAL_MASK);
+               for_each_set_bit(i, &mask, DEV_ERR_STAT_MAX_ERROR_BIT)
+                       dev_err_stat_fatal_reg[i] = (struct error_msg_counter_pair)
+                       {.errmsg = "Undefined", .errcounter = XE_TILE_HW_ERR_UNKNOWN_FATAL};
</pre>
        </blockquote>
        <pre class="moz-quote-pre" wrap="">Nope. For one thing, the arrays really should be static const, placed in
rodata, and not mutable.</pre>
      </blockquote>
      My idea in keeping it mutable was to avoid multiple
      platform-specific arrays.<br>
      <blockquote type="cite" cite="mid:877cq3s567.fsf@intel.com">
        <pre class="moz-quote-pre" wrap="">For another, if you have a platform with two or more different devices,
whichever gets probed last clobbers the data.</pre>
      </blockquote>
      <p>Thanks for pointing it out. I had missed this particular
        scenario. I believe defining platform-specific arrays is better</p>
      <p>because then we can ensure they are immutable, and we will no
        longer need platform-specific masks either. <br>
      </p>
      <p>I believe this will help with maintainability too.<br>
      </p>
      <p>BR</p>
      <p>Himal Ghimiray <span style="white-space: pre-wrap">
</span><span style="white-space: pre-wrap">
</span></p>
      <blockquote type="cite" cite="mid:877cq3s567.fsf@intel.com">
        <blockquote type="cite">
          <pre class="moz-quote-pre" wrap="">+
+               mask = ~(0 | DG2_DEV_ERR_STAT_NONFATAL_MASK);
+               for_each_set_bit(i, &mask, DEV_ERR_STAT_MAX_ERROR_BIT)
+                       dev_err_stat_nonfatal_reg[i] = (struct error_msg_counter_pair)
+                       {.errmsg = "Undefined", .errcounter = XE_TILE_HW_ERR_UNKNOWN_NONFATAL};
+
+               mask = ~(0 | DG2_DEV_ERR_STAT_CORRECTABLE_MASK);
+               for_each_set_bit(i, &mask, DEV_ERR_STAT_MAX_ERROR_BIT)
+                       dev_err_stat_correctable_reg[i] = (struct error_msg_counter_pair)
+                       {.errmsg = "Undefined", .errcounter = XE_TILE_HW_ERR_UNKNOWN_CORR};
+       } else if (xe->info.platform == XE_PVC) {
+               mask = ~(0 | PVC_DEV_ERR_STAT_FATAL_MASK);
+               for_each_set_bit(i, &mask, DEV_ERR_STAT_MAX_ERROR_BIT)
+                       dev_err_stat_fatal_reg[i] = (struct error_msg_counter_pair)
+                       {.errmsg = "Undefined", .errcounter = XE_TILE_HW_ERR_UNKNOWN_FATAL};
+
+               mask = ~(0 | PVC_DEV_ERR_STAT_NONFATAL_MASK);
+               for_each_set_bit(i, &mask, DEV_ERR_STAT_MAX_ERROR_BIT)
+                       dev_err_stat_nonfatal_reg[i] = (struct error_msg_counter_pair)
+                       {.errmsg = "Undefined", .errcounter = XE_TILE_HW_ERR_UNKNOWN_NONFATAL};
+
+               mask = ~(0 | PVC_DEV_ERR_STAT_CORRECTABLE_MASK);
+               for_each_set_bit(i, &mask, DEV_ERR_STAT_MAX_ERROR_BIT)
+                       dev_err_stat_correctable_reg[i] = (struct error_msg_counter_pair)
+                       {.errmsg = "Undefined", .errcounter = XE_TILE_HW_ERR_UNKNOWN_CORR};
+       }
+}
+
+static void
+xe_hw_error_source_handler(struct xe_tile *tile, const enum hardware_error hw_err)
+{
+       const char *hw_err_str = hardware_error_type_to_str(hw_err);
+       struct error_msg_counter_pair *errstat;
+       unsigned long errsrc;
+       unsigned long flags;
+       const char *errmsg;
+       struct xe_gt *mmio;
+       u32 counter;
+       u32 errcntr;
+       u32 errbit;
+
+       switch (hw_err) {
+       case HARDWARE_ERROR_FATAL:
+               errstat = (struct error_msg_counter_pair *)dev_err_stat_fatal_reg;
</pre>
        </blockquote>
        <pre class="moz-quote-pre" wrap="">Why the casts?

</pre>
        <blockquote type="cite">
          <pre class="moz-quote-pre" wrap="">+                counter = XE_TILE_HW_ERR_UNKNOWN_FATAL;
+               break;
+       case HARDWARE_ERROR_NONFATAL:
+               errstat = (struct error_msg_counter_pair *)dev_err_stat_nonfatal_reg;
+               counter = XE_TILE_HW_ERR_UNKNOWN_NONFATAL;
+               break;
+       case HARDWARE_ERROR_CORRECTABLE:
+               errstat = (struct error_msg_counter_pair *)dev_err_stat_correctable_reg;
+               counter = XE_TILE_HW_ERR_UNKNOWN_CORR;
+               break;
+       default:
+               return;
+       }
+
+       spin_lock_irqsave(&tile_to_xe(tile)->irq.lock, flags);
+       mmio = tile->primary_gt;
+       errsrc = xe_mmio_read32(mmio, DEV_ERR_STAT_REG(hw_err));
+
+       if (!errsrc) {
+               xe_tile_log_hw_err(tile, "DEV_ERR_STAT_REG_%s blank!\n", hw_err_str);
+               goto unlock;
+       }
+
+       for_each_set_bit(errbit, &errsrc, REG_SIZE) {
+               if (errbit < DEV_ERR_STAT_MAX_ERROR_BIT) {
+                       errmsg = errstat[errbit].errmsg;
+                       errcntr = errstat[errbit].errcounter;
+               } else {
+                       errmsg = "Undefined";
+                       errcntr = counter;
+               }
+
+               if (hw_err == HARDWARE_ERROR_CORRECTABLE)
+                       xe_tile_log_hw_warn(tile, "%s %s error bit[%d] is set\n",
+                                           errmsg, hw_err_str, errbit);
+               else
+                       xe_tile_log_hw_err(tile, "%s %s error bit[%d] is set\n",
+                                          errmsg, hw_err_str, errbit);
+
+               tile->errors.hw[errcntr]++;
+       }
+
+       xe_mmio_write32(mmio, DEV_ERR_STAT_REG(hw_err), errsrc);
+unlock:
+       spin_unlock_irqrestore(&tile_to_xe(tile)->irq.lock, flags);
+}
+
+/*
+ * XE platforms add three error bits to the Master Interrupt
+ * Register to support error handling. These three bits are
+ * used to convey the class of error:
+ * FATAL, NONFATAL, or CORRECTABLE.
+ *
+ * To process an interrupt:
+ *       Determine source of error (IP block) by reading
+ *      the Device Error Source Register (RW1C) that
+ *      corresponds to the class of error being serviced
+ *      and log the error.
+ */
+static void
+xe_hw_error_irq_handler(struct xe_tile *tile, const u32 master_ctl)
+{
+       enum hardware_error hw_err;
+
+       for (hw_err = 0; hw_err < HARDWARE_ERROR_MAX; hw_err++) {
+               if (master_ctl & XE_ERROR_IRQ(hw_err))
+                       xe_hw_error_source_handler(tile, hw_err);
+       }
+}
+
 /*
  * Top-level interrupt handler for Xe_LP+ and beyond.  These platforms have
  * a "master tile" interrupt register which must be consulted before the
@@ -413,6 +630,7 @@ static irqreturn_t dg1_irq_handler(int irq, void *arg)
                xe_mmio_write32(mmio, GFX_MSTR_IRQ, master_ctl);
 
                gt_irq_handler(tile, master_ctl, intr_dw, identity);
+               xe_hw_error_irq_handler(tile, master_ctl);
 
                /*
                 * Display interrupts (including display backlight operations
@@ -561,6 +779,8 @@ int xe_irq_install(struct xe_device *xe)
                return -EINVAL;
        }
 
+       update_valid_error_regs(xe);
+
        xe->irq.enabled = true;
 
        xe_irq_reset(xe);
</pre>
        </blockquote>
      </blockquote>
    </blockquote>
  </body>
</html>