[PATCH 2/4] drm/amdgpu: Add SDMA RAS poison consumption handling
Tao Zhou
tao.zhou1 at amd.com
Wed Jun 11 03:34:51 UTC 2025
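Perform a queue reset for SDMA poison consumption: add amdgpu_ras_poison_queue_reset(), which maps the interrupt's client_id/node_id to an SDMA instance and resets that instance's ring, plus an amdgpu_amdkfd wrapper so KFD can call it.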
Signed-off-by: Tao Zhou <tao.zhou1 at amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c | 8 ++++++
drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h | 3 +++
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 30 ++++++++++++++++++++++
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 4 +++
4 files changed, 45 insertions(+)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
index 6c47f7d9adcd..085bff11319a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
@@ -741,6 +741,14 @@ void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev,
amdgpu_umc_pasid_poison_handler(adev, block, 0, NULL, NULL, reset);
}
+/* KFD-facing wrapper: hand the poison queue reset request to the RAS core. */
+int amdgpu_amdkfd_ras_poison_queue_reset(struct amdgpu_device *adev,
+		enum amdgpu_ras_block block, uint16_t client_id,
+		uint16_t vmid, uint16_t node_id)
+{
+	return amdgpu_ras_poison_queue_reset(adev, block, client_id, vmid, node_id);
+}
+
int amdgpu_amdkfd_send_close_event_drain_irq(struct amdgpu_device *adev,
uint32_t *payload)
{
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
index 3fa951ede37c..f1680027399e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
@@ -404,6 +404,9 @@ void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev,
void amdgpu_amdkfd_ras_pasid_poison_consumption_handler(struct amdgpu_device *adev,
enum amdgpu_ras_block block, uint16_t pasid,
pasid_notify pasid_fn, void *data, uint32_t reset);
+int amdgpu_amdkfd_ras_poison_queue_reset(struct amdgpu_device *adev,
+ enum amdgpu_ras_block block, uint16_t client_id,
+ uint16_t vmid, uint16_t node_id);
bool amdgpu_amdkfd_is_fed(struct amdgpu_device *adev);
bool amdgpu_amdkfd_bo_mapped_to_dev(void *drm_priv, struct kgd_mem *mem);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 6565dc7ff9cd..7e63c2fc1a62 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -5311,3 +5311,33 @@ bool amdgpu_ras_is_rma(struct amdgpu_device *adev)
return con->is_rma;
}
+
+/*
+ * Poison consumption: reset (for vmid) the SDMA ring client_id/node_id map to.
+ * Returns 0, the lookup/reset error, or -EINVAL when no ring can be reset.
+ */
+int amdgpu_ras_poison_queue_reset(struct amdgpu_device *adev,
+		enum amdgpu_ras_block block, uint16_t client_id, uint16_t vmid,
+		uint16_t node_id)
+{
+	struct amdgpu_ring *ring;
+	int sdma_inst, ret;
+
+	if (block != AMDGPU_RAS_BLOCK__SDMA || !adev->sdma.instance[0].funcs ||
+	    !adev->sdma.instance[0].funcs->sdma_irq_id_to_seq)
+		return -EINVAL; /* not SDMA, or no id-to-instance lookup available */
+	sdma_inst = adev->sdma.instance[0].funcs->sdma_irq_id_to_seq(adev, client_id, node_id);
+	if (sdma_inst < 0)
+		return sdma_inst;
+	if (sdma_inst >= adev->sdma.num_instances) /* stay inside instance[] */
+		return -EINVAL;
+
+	ring = &adev->sdma.instance[sdma_inst].ring;
+	if (!ring->funcs->reset)
+		return -EINVAL;
+	ret = amdgpu_ring_reset(ring, vmid);
+	if (ret)
+		dev_warn(adev->dev, "queue reset failed in block%d (ret %d), fallback to gpu reset\n",
+			 block, ret);
+	return ret;
+}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index 927d6bff734a..debc07767b5c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -984,4 +984,8 @@ void amdgpu_ras_event_log_print(struct amdgpu_device *adev, u64 event_id,
const char *fmt, ...);
bool amdgpu_ras_is_rma(struct amdgpu_device *adev);
+
+int amdgpu_ras_poison_queue_reset(struct amdgpu_device *adev,
+ enum amdgpu_ras_block block, uint16_t client_id, uint16_t vmid,
+ uint16_t node_id);
#endif
--
2.34.1
More information about the amd-gfx
mailing list