[PATCH V2 4/4] drm/amdgpu: add gpu reset check and exception handling

Thu Jun 20 05:40:20 UTC 2024

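Add GPU reset checks and exception handling for page retirement:
do not query RAS error status or trigger another GPU reset while a
reset is already in progress, and discard the poison requests that
a reset has made stale.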

v2:
  Clear poison consumption messages cached in the fifo after a
  non mode-1 reset.

Signed-off-by: YiPeng Chai <YiPeng.Chai at amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 52 +++++++++++++++++++++++++
 1 file changed, 52 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index a4030dc12a1c..ce7c7723e626 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -1384,10 +1384,15 @@ int amdgpu_ras_query_error_status(struct amdgpu_device *adev, struct ras_query_i
 	memset(&qctx, 0, sizeof(qctx));
 	qctx.event_id = amdgpu_ras_acquire_event_id(adev, amdgpu_ras_intr_triggered() ?
 						   RAS_EVENT_TYPE_ISR : RAS_EVENT_TYPE_INVALID);
+
+	if (!down_read_trylock(&adev->reset_domain->sem))
+		return -EIO;
+
 	ret = amdgpu_ras_query_error_status_helper(adev, info,
 						   &err_data,
 						   &qctx,
 						   error_query_mode);
+	up_read(&adev->reset_domain->sem);
 	if (ret)
 		goto out_fini_err_data;
 
@@ -2916,6 +2921,14 @@ static int amdgpu_ras_poison_creation_handler(struct amdgpu_device *adev,
 	return 0;
 }
 
+static void amdgpu_ras_clear_poison_fifo(struct amdgpu_device *adev)
+{
+	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+	struct ras_poison_msg msg; /* scratch slot; dequeued entries are discarded */
+
+	while (kfifo_get(&con->poison_fifo, &msg)); /* empty body: pop until kfifo_get() returns 0 */
+}
+
 static int amdgpu_ras_poison_consumption_handler(struct amdgpu_device *adev,
 			uint32_t msg_count, uint32_t *gpu_reset)
 {
@@ -2946,12 +2959,20 @@ static int amdgpu_ras_poison_consumption_handler(struct amdgpu_device *adev,
 		else
 			reset = reset_flags;
 
+		/* If a gpu reset is ongoing, there is no need to reset the gpu again */
+		if (!down_read_trylock(&adev->reset_domain->sem))
+			return -EIO;
+		up_read(&adev->reset_domain->sem);
+
 		flush_delayed_work(&con->page_retirement_dwork);
 
 		con->gpu_reset_flags |= reset;
 		amdgpu_ras_reset_gpu(adev);
 
 		*gpu_reset = reset;
+
+		/* Wait for gpu recovery to complete */
+		flush_work(&con->recovery_work);
 	}
 
 	return 0;
@@ -3000,6 +3021,37 @@ static int amdgpu_ras_page_retirement_thread(void *param)
 			}
 		}
 
+		if ((ret == -EIO) || (gpu_reset == AMDGPU_RAS_GPU_RESET_MODE1_RESET)) {
+			/* gpu mode-1 reset is ongoing or just completed ras mode-1 reset */
+			/* Clear poison creation request */
+			atomic_set(&con->poison_creation_count, 0);
+
+			/* Clear poison fifo */
+			amdgpu_ras_clear_poison_fifo(adev);
+
+			/* Clear all poison requests */
+			atomic_set(&con->page_retirement_req_cnt, 0);
+
+			if (ret == -EIO) {
+				/* Wait for mode-1 reset to complete */
+				down_read(&adev->reset_domain->sem);
+				up_read(&adev->reset_domain->sem);
+			}
+
+			/* Wake up work to save bad pages to eeprom */
+			schedule_delayed_work(&con->page_retirement_dwork, 0);
+		} else if (gpu_reset) {
+			/* gpu just completed mode-2 reset or other reset */
+			/* Clear poison consumption messages cached in fifo */
+			msg_count = kfifo_len(&con->poison_fifo);
+			if (msg_count) {
+				amdgpu_ras_clear_poison_fifo(adev);
+				atomic_sub(msg_count, &con->page_retirement_req_cnt);
+			}
+
+			/* Wake up work to save bad pages to eeprom */
+			schedule_delayed_work(&con->page_retirement_dwork, 0);
+		}
 #else
         dev_info(adev->dev, "Start processing page retirement. request:%d\n",
                     atomic_read(&con->page_retirement_req_cnt));
-- 
2.34.1