[PATCH 2/4] drm/amdgpu: Add SDMA RAS poison consumption handling

Tao Zhou tao.zhou1 at amd.com
Wed Jun 11 03:34:51 UTC 2025


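Add amdgpu_ras_poison_queue_reset() and a KFD wrapper that reset the
affected SDMA queue when poison consumption is reported. An error is
returned if the queue cannot be reset, so the caller can fall back to a
GPU reset.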

Signed-off-by: Tao Zhou <tao.zhou1 at amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c |  8 ++++++
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h |  3 +++
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c    | 30 ++++++++++++++++++++++
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h    |  4 +++
 4 files changed, 45 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
index 6c47f7d9adcd..085bff11319a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
@@ -741,6 +741,14 @@ void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev,
 	amdgpu_umc_pasid_poison_handler(adev, block, 0, NULL, NULL, reset);
 }
 
+int amdgpu_amdkfd_ras_poison_queue_reset(struct amdgpu_device *adev,
+	enum amdgpu_ras_block block, uint16_t client_id, uint16_t vmid,
+	uint16_t node_id)
+{
+	/* Thin KFD-side wrapper: forward everything to the RAS helper. */
+	return amdgpu_ras_poison_queue_reset(adev, block, client_id, vmid, node_id);
+}
+
 int amdgpu_amdkfd_send_close_event_drain_irq(struct amdgpu_device *adev,
 					uint32_t *payload)
 {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
index 3fa951ede37c..f1680027399e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
@@ -404,6 +404,9 @@ void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev,
 void amdgpu_amdkfd_ras_pasid_poison_consumption_handler(struct amdgpu_device *adev,
 			enum amdgpu_ras_block block, uint16_t pasid,
 			pasid_notify pasid_fn, void *data, uint32_t reset);
+int amdgpu_amdkfd_ras_poison_queue_reset(struct amdgpu_device *adev,
+			enum amdgpu_ras_block block, uint16_t client_id,
+			uint16_t vmid, uint16_t node_id);
 
 bool amdgpu_amdkfd_is_fed(struct amdgpu_device *adev);
 bool amdgpu_amdkfd_bo_mapped_to_dev(void *drm_priv, struct kgd_mem *mem);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 6565dc7ff9cd..7e63c2fc1a62 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -5311,3 +5311,33 @@ bool amdgpu_ras_is_rma(struct amdgpu_device *adev)
 
 	return con->is_rma;
 }
+
+/*
+ * Reset the queue that consumed poison (SDMA only). Returns 0 on success;
+ * otherwise the lookup/reset error, or -EINVAL if no resettable ring is found.
+ * NOTE(review): funcs is NULL-checked defensively; confirm which ASICs set it.
+ */
+int amdgpu_ras_poison_queue_reset(struct amdgpu_device *adev,
+	enum amdgpu_ras_block block, uint16_t client_id, uint16_t vmid,
+	uint16_t node_id)
+{
+	struct amdgpu_ring *ring;
+	int sdma_inst, ret;
+
+	if (block != AMDGPU_RAS_BLOCK__SDMA || !adev->sdma.instance[0].funcs ||
+	    !adev->sdma.instance[0].funcs->sdma_irq_id_to_seq)
+		return -EINVAL;
+
+	sdma_inst = adev->sdma.instance[0].funcs->sdma_irq_id_to_seq(adev,
+				client_id, node_id);
+	if (sdma_inst < 0)
+		return sdma_inst;
+	ring = &adev->sdma.instance[sdma_inst].ring;
+	if (!ring->funcs->reset)
+		return -EINVAL;
+	ret = amdgpu_ring_reset(ring, vmid);
+	if (ret)
+		dev_warn(adev->dev, "queue reset failed in block%d (ret %d), fallback to gpu reset\n",
+			 block, ret);
+	return ret;
+}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index 927d6bff734a..debc07767b5c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -984,4 +984,8 @@ void amdgpu_ras_event_log_print(struct amdgpu_device *adev, u64 event_id,
 				const char *fmt, ...);
 
 bool amdgpu_ras_is_rma(struct amdgpu_device *adev);
+
+int amdgpu_ras_poison_queue_reset(struct amdgpu_device *adev,
+	enum amdgpu_ras_block block, uint16_t client_id, uint16_t vmid,
+	uint16_t node_id);
 #endif
-- 
2.34.1



More information about the amd-gfx mailing list