[PATCH 3/5] drm/amdgpu: refine poison consumption interrupt handler

YiPeng Chai YiPeng.Chai at amd.com
Tue Jun 18 06:33:57 UTC 2024


1. The poison fifo is only used for poison consumption
   requests.
2. Merge reset requests when the poison fifo caches multiple
   poison consumption messages.

Signed-off-by: YiPeng Chai <YiPeng.Chai at amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 58 +++++++++++++++++--------
 drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c | 12 ++---
 2 files changed, 46 insertions(+), 24 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 13cd6a9234f2..898889600771 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -2881,22 +2881,40 @@ static int amdgpu_ras_poison_creation_handler(struct amdgpu_device *adev,
 }
 
 static int amdgpu_ras_poison_consumption_handler(struct amdgpu_device *adev,
-			struct ras_poison_msg *poison_msg)
+			uint32_t msg_count, uint32_t *gpu_reset)
 {
 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
-	uint32_t reset = poison_msg->reset;
-	uint16_t pasid = poison_msg->pasid;
+	uint32_t reset_flags = 0, reset = 0;
+	struct ras_poison_msg msg;
+	int ret, i;
 
 	kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
 
-	if (poison_msg->pasid_fn)
-		poison_msg->pasid_fn(adev, pasid, poison_msg->data);
+	for (i = 0; i < msg_count; i++) {
+		ret = amdgpu_ras_get_poison_req(adev, &msg);
+		if (!ret)
+			continue;
+
+		if (msg.pasid_fn)
+			msg.pasid_fn(adev, msg.pasid, msg.data);
+
+		reset_flags |= msg.reset;
+	}
+
+	if (reset_flags) {
+		if (reset_flags & AMDGPU_RAS_GPU_RESET_MODE1_RESET)
+			reset = AMDGPU_RAS_GPU_RESET_MODE1_RESET;
+		else if (reset_flags & AMDGPU_RAS_GPU_RESET_MODE2_RESET)
+			reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET;
+		else
+			reset = reset_flags;
 
-	if (reset) {
 		flush_delayed_work(&con->page_retirement_dwork);
 
 		con->gpu_reset_flags |= reset;
 		amdgpu_ras_reset_gpu(adev);
+
+		*gpu_reset = reset;
 	}
 
 	return 0;
@@ -2906,11 +2924,9 @@ static int amdgpu_ras_page_retirement_thread(void *param)
 {
 	struct amdgpu_device *adev = (struct amdgpu_device *)param;
 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
-	uint32_t poison_creation_count;
+	uint32_t poison_creation_count, msg_count;
+	uint32_t gpu_reset;
 	int ret;
-	struct ras_poison_msg poison_msg;
-	enum amdgpu_ras_block ras_block;
-	bool poison_creation_is_handled = false;
 
 	while (!kthread_should_stop()) {
 
@@ -2921,6 +2937,7 @@ static int amdgpu_ras_page_retirement_thread(void *param)
 		if (kthread_should_stop())
 			break;
 
+		gpu_reset = 0;
 
 		do {
 			poison_creation_count = atomic_read(&con->poison_creation_count);
@@ -2937,16 +2954,19 @@ static int amdgpu_ras_page_retirement_thread(void *param)
 		} while (atomic_read(&con->poison_creation_count));
 
 #ifdef HAVE_KFIFO_PUT_NON_POINTER
-		if (!amdgpu_ras_get_poison_req(adev, &poison_msg))
-			continue;
-
-		ras_block = poison_msg.block;
-
-		dev_dbg(adev->dev, "Start processing ras block %s(%d)\n",
-				ras_block_str(ras_block), ras_block);
-
+		if (ret != -EIO) {
+			msg_count = kfifo_len(&con->poison_fifo);
+			if (msg_count) {
+				ret = amdgpu_ras_poison_consumption_handler(adev,
+						msg_count, &gpu_reset);
+				if ((ret != -EIO) &&
+				    (gpu_reset != AMDGPU_RAS_GPU_RESET_MODE1_RESET)) {
+					while (msg_count--)
+						atomic_dec(&con->page_retirement_req_cnt);
+				}
+			}
+		}
 
-			amdgpu_ras_poison_consumption_handler(adev, &poison_msg);
 #else
         dev_info(adev->dev, "Start processing page retirement. request:%d\n",
                     atomic_read(&con->page_retirement_req_cnt));
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
index 1dbe69eabb9a..47a46bf49a06 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
@@ -296,13 +296,15 @@ int amdgpu_umc_pasid_poison_handler(struct amdgpu_device *adev,
 				struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
 
 #ifdef HAVE_KFIFO_PUT_NON_POINTER
-				amdgpu_ras_put_poison_req(adev,
+				int ret;
+
+				ret = amdgpu_ras_put_poison_req(adev,
 					block, pasid, pasid_fn, data, reset);
+				if (!ret) {
+					atomic_inc(&con->page_retirement_req_cnt);
+					wake_up(&con->page_retirement_wq);
+				}
 #endif
-
-				atomic_inc(&con->page_retirement_req_cnt);
-
-				wake_up(&con->page_retirement_wq);
 		}
 	} else {
 		if (adev->virt.ops && adev->virt.ops->ras_poison_handler)
-- 
2.34.1



More information about the amd-gfx mailing list