[PATCH 1/1] drm/amdgpu: Poll RAS errors asynchronously

Luben Tuikov <luben.tuikov@amd.com>
Thu May 6 00:10:38 UTC 2021


On Vega 20 with RAS supported and enabled, system
interactivity is extremely slow, to the point of
being unusable. Debugging showed that this is due
to the polling loop performed for
AMDGPU_CTX_OP_QUERY_STATE2 under
amdgpu_ctx_ioctl(), which seems to be executed on
every ioctl coming from X/Mesa.

That path calls amdgpu_ctx_query2(), which calls
amdgpu_ras_query_error_count() twice, once for
ue_count and once for ce_count. This is
unnecessarily wasteful:
amdgpu_ras_query_error_count() computes both
counts on every call, but its current interface
returns only one of them, selected by a Boolean
argument, even though it could return both in a
single invocation with a better interface.
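
For illustration, an interface returning both
counts from a single walk over the RAS objects
could look roughly like the hypothetical sketch
below (this patch instead caches the counts in
struct amdgpu_ras, as the diff shows):

    /* Hypothetical sketch only: accumulate both counters in one pass. */
    static void ras_count_ce_ue(struct amdgpu_device *adev,
                                unsigned long *ce_count,
                                unsigned long *ue_count)
    {
        struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
        struct ras_manager *obj;

        *ce_count = 0;
        *ue_count = 0;

        list_for_each_entry(obj, &con->head, node) {
            struct ras_query_if info = {
                .head = obj->head,
            };

            if (amdgpu_ras_query_error_status(adev, &info))
                return;

            *ce_count += info.ce_count;
            *ue_count += info.ue_count;
        }
    }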

Further down, each such query invokes the
query_ras_error_count() callback, which can be a
large and very time-consuming polling loop. For
instance, gfx_v9_0_query_ras_error_count() is at
least O(n^3), and umc_v6_1_query_ras_error_count()
is similarly expensive.

This commit makes RAS error count polling
asynchronous with respect to the ioctl: a kernel
thread polls the RAS error counters periodically
and caches the results, and the ioctl merely
reads the cached values. The poll period is a
module parameter, with range [500, 10000]
milliseconds, default 3000.
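
With the default, the cached counters are
refreshed every 3 seconds. To poll every 5
seconds instead, for example, the driver can be
loaded with the parameter added below:

    modprobe amdgpu ras_thread_poll=5000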

Cc: Alexander Deucher <Alexander.Deucher@amd.com>
Cc: John Clements <john.clements@amd.com>
Cc: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Luben Tuikov <luben.tuikov@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h     |  1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c | 32 +++++++------
 drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c |  9 ++++
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 61 +++++++++++++++++++++----
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h |  8 ++--
 5 files changed, 85 insertions(+), 26 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index b292c1e3abc5..c29f1a24296b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -196,6 +196,7 @@ extern struct amdgpu_mgpu_info mgpu_info;
 extern int amdgpu_ras_enable;
 extern uint amdgpu_ras_mask;
 extern int amdgpu_bad_page_threshold;
+extern uint amdgpu_ras_thread_poll_ms;
 extern struct amdgpu_watchdog_timer amdgpu_watchdog_timer;
 extern int amdgpu_async_gfx_ring;
 extern int amdgpu_mcbp;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
index 0350205c4897..6b2f29ae032c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
@@ -332,12 +332,12 @@ static int amdgpu_ctx_query(struct amdgpu_device *adev,
 }
 
 static int amdgpu_ctx_query2(struct amdgpu_device *adev,
-	struct amdgpu_fpriv *fpriv, uint32_t id,
-	union drm_amdgpu_ctx_out *out)
+			     struct amdgpu_fpriv *fpriv, uint32_t id,
+			     union drm_amdgpu_ctx_out *out)
 {
+	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
 	struct amdgpu_ctx *ctx;
 	struct amdgpu_ctx_mgr *mgr;
-	unsigned long ras_counter;
 
 	if (!fpriv)
 		return -EINVAL;
@@ -362,19 +362,21 @@ static int amdgpu_ctx_query2(struct amdgpu_device *adev,
 	if (atomic_read(&ctx->guilty))
 		out->state.flags |= AMDGPU_CTX_QUERY2_FLAGS_GUILTY;
 
-	/*query ue count*/
-	ras_counter = amdgpu_ras_query_error_count(adev, false);
-	/*ras counter is monotonic increasing*/
-	if (ras_counter != ctx->ras_counter_ue) {
-		out->state.flags |= AMDGPU_CTX_QUERY2_FLAGS_RAS_UE;
-		ctx->ras_counter_ue = ras_counter;
-	}
+	if (con) {
+		int ce_count, ue_count;
+
+		ce_count = atomic_read(&con->ras_ce_count);
+		ue_count = atomic_read(&con->ras_ue_count);
 
-	/*query ce count*/
-	ras_counter = amdgpu_ras_query_error_count(adev, true);
-	if (ras_counter != ctx->ras_counter_ce) {
-		out->state.flags |= AMDGPU_CTX_QUERY2_FLAGS_RAS_CE;
-		ctx->ras_counter_ce = ras_counter;
+		if (ce_count != ctx->ras_counter_ce) {
+			out->state.flags |= AMDGPU_CTX_QUERY2_FLAGS_RAS_CE;
+			ctx->ras_counter_ce = ce_count;
+		}
+
+		if (ue_count != ctx->ras_counter_ue) {
+			out->state.flags |= AMDGPU_CTX_QUERY2_FLAGS_RAS_UE;
+			ctx->ras_counter_ue = ue_count;
+		}
 	}
 
 	mutex_unlock(&mgr->lock);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index 27153ac384f1..44fc5827e6e5 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -183,6 +183,7 @@ struct amdgpu_mgpu_info mgpu_info = {
 int amdgpu_ras_enable = -1;
 uint amdgpu_ras_mask = 0xffffffff;
 int amdgpu_bad_page_threshold = -1;
+uint amdgpu_ras_thread_poll_ms = 3000;
 struct amdgpu_watchdog_timer amdgpu_watchdog_timer = {
 	.timeout_fatal_disable = false,
 	.period = 0x0, /* default to 0x0 (timeout disable) */
@@ -534,6 +535,14 @@ module_param_named(emu_mode, amdgpu_emu_mode, int, 0444);
 MODULE_PARM_DESC(ras_enable, "Enable RAS features on the GPU (0 = disable, 1 = enable, -1 = auto (default))");
 module_param_named(ras_enable, amdgpu_ras_enable, int, 0444);
 
+/**
+ * DOC: ras_thread_poll (uint)
+ * Number of milliseconds between RAS polls for errors. Valid range [500,10000].
+ * Default: 3000.
+ */
+MODULE_PARM_DESC(ras_thread_poll, "Number of milliseconds between RAS polls for errors. Valid range [500,10000]. Default: 3000.");
+module_param_named(ras_thread_poll, amdgpu_ras_thread_poll_ms, uint, 0444);
+
 /**
  * DOC: ras_mask (uint)
  * Mask of RAS features to enable (default 0xffffffff), only valid when ras_enable == 1
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 609530c4a599..05cac55297d7 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -1065,15 +1065,17 @@ int amdgpu_ras_error_inject(struct amdgpu_device *adev,
 }
 
 /* get the total error counts on all IPs */
-unsigned long amdgpu_ras_query_error_count(struct amdgpu_device *adev,
-		bool is_ce)
+static void amdgpu_ras_query_error_count(struct amdgpu_device *adev)
 {
 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
 	struct ras_manager *obj;
-	struct ras_err_data data = {0, 0};
+	int ce_count, ue_count;
 
 	if (!adev->ras_enabled || !con)
-		return 0;
+		return;
+
+	ce_count = 0;
+	ue_count = 0;
 
 	list_for_each_entry(obj, &con->head, node) {
 		struct ras_query_if info = {
@@ -1081,13 +1083,14 @@ unsigned long amdgpu_ras_query_error_count(struct amdgpu_device *adev,
 		};
 
 		if (amdgpu_ras_query_error_status(adev, &info))
-			return 0;
+			return;
 
-		data.ce_count += info.ce_count;
-		data.ue_count += info.ue_count;
+		ce_count += info.ce_count;
+		ue_count += info.ue_count;
 	}
 
-	return is_ce ? data.ce_count : data.ue_count;
+	atomic_set(&con->ras_ce_count, ce_count);
+	atomic_set(&con->ras_ue_count, ue_count);
 }
 /* query/inject/cure end */
 
@@ -2131,6 +2134,44 @@ static void amdgpu_ras_check_supported(struct amdgpu_device *adev)
 		adev->ras_hw_enabled & amdgpu_ras_mask;
 }
 
+static int amdgpu_ras_thread(void *data)
+{
+	struct amdgpu_device *adev = data;
+	struct amdgpu_ras    *con  = amdgpu_ras_get_context(adev);
+
+	con->ras_thread_poll_ms = amdgpu_ras_thread_poll_ms;
+	if (con->ras_thread_poll_ms < 500 ||
+	    con->ras_thread_poll_ms > 10000)
+		con->ras_thread_poll_ms = 3000;
+
+	while (1) {
+		if (kthread_should_stop())
+			break;
+		if (kthread_should_park())
+			kthread_parkme();
+
+		amdgpu_ras_query_error_count(adev);
+		msleep_interruptible(con->ras_thread_poll_ms);
+	}
+
+	return 0;
+}
+
+static int amdgpu_ras_thread_start(struct amdgpu_device *adev)
+{
+	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+	struct task_struct *kt;
+
+	kt = kthread_run(amdgpu_ras_thread, adev,
+			 "amdras:%s", pci_name(adev->pdev));
+	if (IS_ERR(kt))
+		return PTR_ERR(kt);
+
+	con->ras_thread = kt;
+
+	return 0;
+}
+
 int amdgpu_ras_init(struct amdgpu_device *adev)
 {
 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
@@ -2204,6 +2245,10 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
 		goto release_con;
 	}
 
+	r = amdgpu_ras_thread_start(adev);
+	if (r)
+		goto release_con;
+
 	dev_info(adev->dev, "RAS INFO: ras initialized successfully, "
 		 "hardware ability[%x] ras_mask[%x]\n",
 		 adev->ras_hw_enabled, adev->ras_enabled);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index 201fbdee1d09..fb9e4c7ab054 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -340,6 +340,11 @@ struct amdgpu_ras {
 
 	/* disable ras error count harvest in recovery */
 	bool disable_ras_err_cnt_harvest;
+
+	struct task_struct *ras_thread;
+	unsigned int        ras_thread_poll_ms;
+	atomic_t            ras_ue_count;
+	atomic_t            ras_ce_count;
 };
 
 struct ras_fs_data {
@@ -485,9 +490,6 @@ int amdgpu_ras_request_reset_on_boot(struct amdgpu_device *adev,
 void amdgpu_ras_resume(struct amdgpu_device *adev);
 void amdgpu_ras_suspend(struct amdgpu_device *adev);
 
-unsigned long amdgpu_ras_query_error_count(struct amdgpu_device *adev,
-		bool is_ce);
-
 /* error handling functions */
 int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
 		struct eeprom_table_record *bps, int pages);
-- 
2.31.0.97.g1424303384