[PATCH 3/4] drm/amdgpu: add UTCL2 RAS poison query for Aldebaran (v2)

Zhang, Hawking Hawking.Zhang at amd.com
Wed Mar 16 13:54:08 UTC 2022


[AMD Official Use Only]

V2 looks good to me

Reviewed-by: Hawking Zhang <Hawking.Zhang at amd.com>

Regards,
Hawking

-----Original Message-----
From: Zhou1, Tao <Tao.Zhou1 at amd.com>
Sent: Wednesday, March 16, 2022 17:26
To: amd-gfx at lists.freedesktop.org; Zhang, Hawking <Hawking.Zhang at amd.com>; Kuehling, Felix <Felix.Kuehling at amd.com>; Yang, Stanley <Stanley.Yang at amd.com>; Chai, Thomas <YiPeng.Chai at amd.com>
Cc: Zhou1, Tao <Tao.Zhou1 at amd.com>
Subject: [PATCH 3/4] drm/amdgpu: add UTCL2 RAS poison query for Aldebaran (v2)

Add helper functions to query and reset the RAS UTCL2 poison status.

v2: implement it on the amdgpu side; kfd only calls it.

Signed-off-by: Tao Zhou <tao.zhou1 at amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c |  8 ++++++++
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h |  1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h    |  1 +
 drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c    | 14 ++++++++++++++
 4 files changed, 24 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
index 6ca1db3c243f..c18c4be1e4ac 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
@@ -724,3 +724,11 @@ void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev, bo
        else if (reset)
                amdgpu_amdkfd_gpu_reset(adev);
 }
+
+bool amdgpu_amdkfd_ras_query_utcl2_poison_status(struct amdgpu_device *adev)
+{
+       /* Ask the GFX RAS hook; report no poison if gfx.ras or the hook is unset. */
+       if (adev->gfx.ras && adev->gfx.ras->query_utcl2_poison_status)
+               return adev->gfx.ras->query_utcl2_poison_status(adev);
+       return false;
+}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
index 4cb14c2fe53f..0838926a8ef0 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
@@ -301,6 +301,7 @@ void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev,
 bool amdgpu_amdkfd_bo_mapped_to_dev(struct amdgpu_device *adev, struct kgd_mem *mem);
 void amdgpu_amdkfd_block_mmu_notifications(void *p);
 int amdgpu_amdkfd_criu_resume(void *p);
+bool amdgpu_amdkfd_ras_query_utcl2_poison_status(struct amdgpu_device *adev);

 #if IS_ENABLED(CONFIG_HSA_AMD)
 void amdgpu_amdkfd_gpuvm_init_mem_limits(void);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
index dcb3c7871c73..5ed9b8a4c571 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
@@ -202,6 +202,7 @@ struct amdgpu_cu_info {
 struct amdgpu_gfx_ras {
        struct amdgpu_ras_block_object  ras_block;
        void (*enable_watchdog_timer)(struct amdgpu_device *adev);
+       bool (*query_utcl2_poison_status)(struct amdgpu_device *adev);
 };

 struct amdgpu_gfx_funcs {
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c
index 7653ebd0e67b..e0890e00eedf 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c
@@ -1930,6 +1930,19 @@ static void gfx_v9_4_2_reset_sq_timeout_status(struct amdgpu_device *adev)
        mutex_unlock(&adev->grbm_idx_mutex);
 }

+static bool gfx_v9_4_2_query_uctl2_poison_status(struct amdgpu_device *adev)
+{
+       struct amdgpu_vmhub *gfxhub = &adev->vmhub[AMDGPU_GFXHUB_0];
+       /* Sample the GFX hub L2 protection fault status before it is reset. */
+       uint32_t fault_status = RREG32(gfxhub->vm_l2_pro_fault_status);
+
+       /* reset page fault status */
+       WREG32_P(gfxhub->vm_l2_pro_fault_cntl, 1, ~1);
+
+       /* Poison status is reported through the FED field of the sampled value. */
+       return REG_GET_FIELD(fault_status, VM_L2_PROTECTION_FAULT_STATUS, FED);
+}
+
 struct amdgpu_ras_block_hw_ops  gfx_v9_4_2_ras_ops = {
                .ras_error_inject = &gfx_v9_4_2_ras_error_inject,
                .query_ras_error_count = &gfx_v9_4_2_query_ras_error_count,
@@ -1943,4 +1956,5 @@ struct amdgpu_gfx_ras gfx_v9_4_2_ras = {
                .hw_ops = &gfx_v9_4_2_ras_ops,
        },
        .enable_watchdog_timer = &gfx_v9_4_2_enable_watchdog_timer,
+       .query_utcl2_poison_status = gfx_v9_4_2_query_uctl2_poison_status,
 };
--
2.35.1



More information about the amd-gfx mailing list