amd-gfx Digest, Vol 98, Issue 217

Prosyak, Vitaly Vitaly.Prosyak at amd.com
Thu Aug 8 17:00:55 UTC 2024


[AMD Official Use Only - AMD Internal Distribution Only]

Acked-by: Vitaly Prosyak <vitaly.prosyak at amd.com>
________________________________
From: amd-gfx <amd-gfx-bounces at lists.freedesktop.org> on behalf of amd-gfx-request at lists.freedesktop.org <amd-gfx-request at lists.freedesktop.org>
Sent: Wednesday, July 17, 2024 4:40 PM
To: amd-gfx at lists.freedesktop.org <amd-gfx at lists.freedesktop.org>
Subject: amd-gfx Digest, Vol 98, Issue 217

Send amd-gfx mailing list submissions to
        amd-gfx at lists.freedesktop.org

To subscribe or unsubscribe via the World Wide Web, visit
        https://lists.freedesktop.org/mailman/listinfo/amd-gfx
or, via email, send a message with subject or body 'help' to
        amd-gfx-request at lists.freedesktop.org

You can reach the person managing the list at
        amd-gfx-owner at lists.freedesktop.org

When replying, please edit your Subject line so it is more specific
than "Re: Contents of amd-gfx digest..."


Today's Topics:

   1. [PATCH 1/6] drm/amdgpu/gfx: add bad opcode interrupt
      (Alex Deucher)
   2. [PATCH 5/6] drm/amdgpu/gfx9: Enable bad opcode interrupt
      (Alex Deucher)
   3. [PATCH 3/6] drm/amdgpu/gfx10: Enable bad opcode interrupt
      (Alex Deucher)


----------------------------------------------------------------------

Message: 1
Date: Wed, 17 Jul 2024 16:40:06 -0400
From: Alex Deucher <alexander.deucher at amd.com>
To: <amd-gfx at lists.freedesktop.org>
Cc: Alex Deucher <alexander.deucher at amd.com>
Subject: [PATCH 1/6] drm/amdgpu/gfx: add bad opcode interrupt
Message-ID: <20240717204011.15342-1-alexander.deucher at amd.com>
Content-Type: text/plain

Add the irq source for bad opcodes.

Signed-off-by: Alex Deucher <alexander.deucher at amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
index ddda94e49db4..86d3fa7eef90 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
@@ -391,6 +391,7 @@ struct amdgpu_gfx {
         struct amdgpu_irq_src           eop_irq;
         struct amdgpu_irq_src           priv_reg_irq;
         struct amdgpu_irq_src           priv_inst_irq;
+       struct amdgpu_irq_src           bad_op_irq;
         struct amdgpu_irq_src           cp_ecc_error_irq;
         struct amdgpu_irq_src           sq_irq;
         struct amdgpu_irq_src           rlc_gc_fed_irq;
--
2.45.2



------------------------------

Message: 2
Date: Wed, 17 Jul 2024 16:40:10 -0400
From: Alex Deucher <alexander.deucher at amd.com>
To: <amd-gfx at lists.freedesktop.org>
Cc: Alex Deucher <alexander.deucher at amd.com>
Subject: [PATCH 5/6] drm/amdgpu/gfx9: Enable bad opcode interrupt
Message-ID: <20240717204011.15342-5-alexander.deucher at amd.com>
Content-Type: text/plain

A bad opcode in the command stream will cause a CP/ME hang.
The firmware prevents the ME side from hanging by raising a bad opcode interrupt,
and the driver needs to perform a vmid reset when it receives that interrupt.

Signed-off-by: Alex Deucher <alexander.deucher at amd.com>
---
 drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 65 +++++++++++++++++++++++++++
 1 file changed, 65 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
index 97476fb2ca40..675a1a8e2515 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
@@ -2182,6 +2182,13 @@ static int gfx_v9_0_sw_init(void *handle)
         if (r)
                 return r;

+       /* Bad opcode Event */
+       r = amdgpu_irq_add_id(adev, SOC15_IH_CLIENTID_GRBM_CP,
+                             GFX_9_0__SRCID__CP_BAD_OPCODE_ERROR,
+                             &adev->gfx.bad_op_irq);
+       if (r)
+               return r;
+
         /* Privileged reg */
         r = amdgpu_irq_add_id(adev, SOC15_IH_CLIENTID_GRBM_CP, GFX_9_0__SRCID__CP_PRIV_REG_FAULT,
                               &adev->gfx.priv_reg_irq);
@@ -3937,6 +3944,7 @@ static int gfx_v9_0_hw_fini(void *handle)
                 amdgpu_irq_put(adev, &adev->gfx.cp_ecc_error_irq, 0);
         amdgpu_irq_put(adev, &adev->gfx.priv_reg_irq, 0);
         amdgpu_irq_put(adev, &adev->gfx.priv_inst_irq, 0);
+       amdgpu_irq_put(adev, &adev->gfx.bad_op_irq, 0);

         /* DF freeze and kcq disable will fail */
         if (!amdgpu_ras_intr_triggered())
@@ -4747,6 +4755,10 @@ static int gfx_v9_0_late_init(void *handle)
         if (r)
                 return r;

+       r = amdgpu_irq_get(adev, &adev->gfx.bad_op_irq, 0);
+       if (r)
+               return r;
+
         r = gfx_v9_0_ecc_late_init(handle);
         if (r)
                 return r;
@@ -5990,6 +6002,42 @@ static int gfx_v9_0_set_priv_reg_fault_state(struct amdgpu_device *adev,
         return 0;
 }

+static int gfx_v9_0_set_bad_op_fault_state(struct amdgpu_device *adev,
+                                          struct amdgpu_irq_src *source,
+                                          unsigned type,
+                                          enum amdgpu_interrupt_state state)
+{
+       u32 cp_int_cntl_reg, cp_int_cntl;
+       int i, j;
+
+       switch (state) {
+       case AMDGPU_IRQ_STATE_DISABLE:
+       case AMDGPU_IRQ_STATE_ENABLE:
+               WREG32_FIELD15(GC, 0, CP_INT_CNTL_RING0,
+                              OPCODE_ERROR_INT_ENABLE,
+                              state == AMDGPU_IRQ_STATE_ENABLE ? 1 : 0);
+               for (i = 0; i < adev->gfx.mec.num_mec; i++) {
+                       for (j = 0; j < adev->gfx.mec.num_pipe_per_mec; j++) {
+                               /* MECs start at 1 */
+                               cp_int_cntl_reg = gfx_v9_0_get_cpc_int_cntl(adev, i + 1, j);
+
+                               if (cp_int_cntl_reg) {
+                                       cp_int_cntl = RREG32_SOC15_IP(GC, cp_int_cntl_reg);
+                                       cp_int_cntl = REG_SET_FIELD(cp_int_cntl, CP_ME1_PIPE0_INT_CNTL,
+                                                                   OPCODE_ERROR_INT_ENABLE,
+                                                                   state == AMDGPU_IRQ_STATE_ENABLE ? 1 : 0);
+                                       WREG32_SOC15_IP(GC, cp_int_cntl_reg, cp_int_cntl);
+                               }
+                       }
+               }
+               break;
+       default:
+               break;
+       }
+
+       return 0;
+}
+
 static int gfx_v9_0_set_priv_inst_fault_state(struct amdgpu_device *adev,
                                               struct amdgpu_irq_src *source,
                                               unsigned type,
@@ -6163,6 +6211,15 @@ static int gfx_v9_0_priv_reg_irq(struct amdgpu_device *adev,
         return 0;
 }

+static int gfx_v9_0_bad_op_irq(struct amdgpu_device *adev,
+                              struct amdgpu_irq_src *source,
+                              struct amdgpu_iv_entry *entry)
+{
+       DRM_ERROR("Illegal opcode in command stream\n");
+       gfx_v9_0_fault(adev, entry);
+       return 0;
+}
+
 static int gfx_v9_0_priv_inst_irq(struct amdgpu_device *adev,
                                   struct amdgpu_irq_src *source,
                                   struct amdgpu_iv_entry *entry)
@@ -7346,6 +7403,11 @@ static const struct amdgpu_irq_src_funcs gfx_v9_0_priv_reg_irq_funcs = {
         .process = gfx_v9_0_priv_reg_irq,
 };

+static const struct amdgpu_irq_src_funcs gfx_v9_0_bad_op_irq_funcs = {
+       .set = gfx_v9_0_set_bad_op_fault_state,
+       .process = gfx_v9_0_bad_op_irq,
+};
+
 static const struct amdgpu_irq_src_funcs gfx_v9_0_priv_inst_irq_funcs = {
         .set = gfx_v9_0_set_priv_inst_fault_state,
         .process = gfx_v9_0_priv_inst_irq,
@@ -7365,6 +7427,9 @@ static void gfx_v9_0_set_irq_funcs(struct amdgpu_device *adev)
         adev->gfx.priv_reg_irq.num_types = 1;
         adev->gfx.priv_reg_irq.funcs = &gfx_v9_0_priv_reg_irq_funcs;

+       adev->gfx.bad_op_irq.num_types = 1;
+       adev->gfx.bad_op_irq.funcs = &gfx_v9_0_bad_op_irq_funcs;
+
         adev->gfx.priv_inst_irq.num_types = 1;
         adev->gfx.priv_inst_irq.funcs = &gfx_v9_0_priv_inst_irq_funcs;

--
2.45.2



------------------------------

Message: 3
Date: Wed, 17 Jul 2024 16:40:08 -0400
From: Alex Deucher <alexander.deucher at amd.com>
To: <amd-gfx at lists.freedesktop.org>
Cc: Jesse Zhang <jesse.zhang at amd.com>, Alex Deucher
        <alexander.deucher at amd.com>
Subject: [PATCH 3/6] drm/amdgpu/gfx10: Enable bad opcode interrupt
Message-ID: <20240717204011.15342-3-alexander.deucher at amd.com>
Content-Type: text/plain

From: Jesse Zhang <jesse.zhang at amd.com>

A bad opcode in the command stream will cause a CP/ME hang.
The firmware prevents the ME side from hanging by raising a bad opcode interrupt,
and the driver needs to perform a vmid reset when it receives that interrupt.

v2: update irq naming (drop priv) (Alex)

Signed-off-by: Jesse Zhang <jesse.zhang at amd.com>
Signed-off-by: Alex Deucher <alexander.deucher at amd.com>
---
 drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 74 ++++++++++++++++++++++++++
 1 file changed, 74 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
index 66d80f3dc661..4ce13a4f7a20 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
@@ -4740,6 +4740,13 @@ static int gfx_v10_0_sw_init(void *handle)
         if (r)
                 return r;

+       /* Bad opcode Event */
+       r = amdgpu_irq_add_id(adev, SOC15_IH_CLIENTID_GRBM_CP,
+                             GFX_10_1__SRCID__CP_BAD_OPCODE_ERROR,
+                             &adev->gfx.bad_op_irq);
+       if (r)
+               return r;
+
         /* Privileged reg */
         r = amdgpu_irq_add_id(adev, SOC15_IH_CLIENTID_GRBM_CP, GFX_10_1__SRCID__CP_PRIV_REG_FAULT,
                               &adev->gfx.priv_reg_irq);
@@ -7416,6 +7423,7 @@ static int gfx_v10_0_hw_fini(void *handle)

         amdgpu_irq_put(adev, &adev->gfx.priv_reg_irq, 0);
         amdgpu_irq_put(adev, &adev->gfx.priv_inst_irq, 0);
+       amdgpu_irq_put(adev, &adev->gfx.bad_op_irq, 0);

         /* WA added for Vangogh asic fixing the SMU suspend failure
          * It needs to set power gating again during gfxoff control
@@ -7726,6 +7734,10 @@ static int gfx_v10_0_late_init(void *handle)
         if (r)
                 return r;

+       r = amdgpu_irq_get(adev, &adev->gfx.bad_op_irq, 0);
+       if (r)
+               return r;
+
         return 0;
 }

@@ -9162,6 +9174,51 @@ static int gfx_v10_0_set_priv_reg_fault_state(struct amdgpu_device *adev,
         return 0;
 }

+static int gfx_v10_0_set_bad_op_fault_state(struct amdgpu_device *adev,
+                                           struct amdgpu_irq_src *source,
+                                           unsigned type,
+                                           enum amdgpu_interrupt_state state)
+{
+       u32 cp_int_cntl_reg, cp_int_cntl;
+       int i , j;
+
+       switch (state) {
+       case AMDGPU_IRQ_STATE_DISABLE:
+       case AMDGPU_IRQ_STATE_ENABLE:
+               for (i = 0; i < adev->gfx.me.num_me; i++) {
+                       for (j = 0; j < adev->gfx.me.num_pipe_per_me; j++) {
+                               cp_int_cntl_reg = gfx_v10_0_get_cpg_int_cntl(adev, i, j);
+
+                               if (cp_int_cntl_reg) {
+                                       cp_int_cntl = RREG32_SOC15_IP(GC, cp_int_cntl_reg);
+                                       cp_int_cntl = REG_SET_FIELD(cp_int_cntl, CP_INT_CNTL_RING0,
+                                                                   OPCODE_ERROR_INT_ENABLE,
+                                                                   state == AMDGPU_IRQ_STATE_ENABLE ? 1 : 0);
+                                       WREG32_SOC15_IP(GC, cp_int_cntl_reg, cp_int_cntl);
+                               }
+                       }
+               }
+               for (i = 0; i < adev->gfx.mec.num_mec; i++) {
+                       for (j = 0; j < adev->gfx.mec.num_pipe_per_mec; j++) {
+                               /* MECs start at 1 */
+                               cp_int_cntl_reg = gfx_v10_0_get_cpc_int_cntl(adev, i + 1, j);
+
+                               if (cp_int_cntl_reg) {
+                                       cp_int_cntl = RREG32_SOC15_IP(GC, cp_int_cntl_reg);
+                                       cp_int_cntl = REG_SET_FIELD(cp_int_cntl, CP_ME1_PIPE0_INT_CNTL,
+                                                                   OPCODE_ERROR_INT_ENABLE,
+                                                                   state == AMDGPU_IRQ_STATE_ENABLE ? 1 : 0);
+                                       WREG32_SOC15_IP(GC, cp_int_cntl_reg, cp_int_cntl);
+                               }
+                       }
+               }
+               break;
+       default:
+               break;
+       }
+       return 0;
+}
+
 static int gfx_v10_0_set_priv_inst_fault_state(struct amdgpu_device *adev,
                                                struct amdgpu_irq_src *source,
                                                unsigned int type,
@@ -9237,6 +9294,15 @@ static int gfx_v10_0_priv_reg_irq(struct amdgpu_device *adev,
         return 0;
 }

+static int gfx_v10_0_bad_op_irq(struct amdgpu_device *adev,
+                               struct amdgpu_irq_src *source,
+                               struct amdgpu_iv_entry *entry)
+{
+       DRM_ERROR("Illegal opcode in command stream \n");
+       gfx_v10_0_handle_priv_fault(adev, entry);
+       return 0;
+}
+
 static int gfx_v10_0_priv_inst_irq(struct amdgpu_device *adev,
                                    struct amdgpu_irq_src *source,
                                    struct amdgpu_iv_entry *entry)
@@ -9624,6 +9690,11 @@ static const struct amdgpu_irq_src_funcs gfx_v10_0_priv_reg_irq_funcs = {
         .process = gfx_v10_0_priv_reg_irq,
 };

+static const struct amdgpu_irq_src_funcs gfx_v10_0_bad_op_irq_funcs = {
+       .set = gfx_v10_0_set_bad_op_fault_state,
+       .process = gfx_v10_0_bad_op_irq,
+};
+
 static const struct amdgpu_irq_src_funcs gfx_v10_0_priv_inst_irq_funcs = {
         .set = gfx_v10_0_set_priv_inst_fault_state,
         .process = gfx_v10_0_priv_inst_irq,
@@ -9645,6 +9716,9 @@ static void gfx_v10_0_set_irq_funcs(struct amdgpu_device *adev)
         adev->gfx.priv_reg_irq.num_types = 1;
         adev->gfx.priv_reg_irq.funcs = &gfx_v10_0_priv_reg_irq_funcs;

+       adev->gfx.bad_op_irq.num_types = 1;
+       adev->gfx.bad_op_irq.funcs = &gfx_v10_0_bad_op_irq_funcs;
+
         adev->gfx.priv_inst_irq.num_types = 1;
         adev->gfx.priv_inst_irq.funcs = &gfx_v10_0_priv_inst_irq_funcs;
 }
--
2.45.2



------------------------------

Subject: Digest Footer

_______________________________________________
amd-gfx mailing list
amd-gfx at lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


------------------------------

End of amd-gfx Digest, Vol 98, Issue 217
****************************************
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <https://lists.freedesktop.org/archives/amd-gfx/attachments/20240808/0c207f3e/attachment-0001.htm>


More information about the amd-gfx mailing list