[PATCH] drm/amdgpu: correct ras query as part of ctx query

Li, Dennis Dennis.Li at amd.com
Thu Jun 11 10:56:58 UTC 2020


[AMD Official Use Only - Internal Distribution Only]

Hi, Guchun,
     The ras_manager object saves the error counters on every query, so a previous query should not affect the result of the current one. Please check the function amdgpu_ras_error_query.
 
Best Regards
Dennis Li
-----Original Message-----
From: Chen, Guchun <Guchun.Chen at amd.com> 
Sent: Thursday, June 11, 2020 6:24 PM
To: amd-gfx at lists.freedesktop.org; Zhang, Hawking <Hawking.Zhang at amd.com>; Zhou1, Tao <Tao.Zhou1 at amd.com>; Pan, Xinhui <Xinhui.Pan at amd.com>; Li, Dennis <Dennis.Li at amd.com>; Clements, John <John.Clements at amd.com>
Cc: Chen, Guchun <Guchun.Chen at amd.com>
Subject: [PATCH] drm/amdgpu: correct ras query as part of ctx query

Almost all error count registers are automatically cleared after being read once, so both the CE and UE counts need to be read in one loop.

Signed-off-by: Guchun Chen <guchun.chen at amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c | 16 +++++++---------
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 14 +++++++++-----
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h |  4 ++--
 3 files changed, 18 insertions(+), 16 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
index c06cb06398b1..29fa6b6b9d3e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
@@ -335,7 +335,7 @@ static int amdgpu_ctx_query2(struct amdgpu_device *adev,
 {
 	struct amdgpu_ctx *ctx;
 	struct amdgpu_ctx_mgr *mgr;
-	unsigned long ras_counter;
+	unsigned long ras_counter_ue, ras_counter_ce;
 
 	if (!fpriv)
 		return -EINVAL;
@@ -360,19 +360,17 @@ static int amdgpu_ctx_query2(struct amdgpu_device *adev,
 	if (atomic_read(&ctx->guilty))
 		out->state.flags |= AMDGPU_CTX_QUERY2_FLAGS_GUILTY;
 
-	/*query ue count*/
-	ras_counter = amdgpu_ras_query_error_count(adev, false);
+	/*query both ue and ce count*/
+	amdgpu_ras_query_error_count(adev, &ras_counter_ue, &ras_counter_ce);
 	/*ras counter is monotonic increasing*/
-	if (ras_counter != ctx->ras_counter_ue) {
+	if (ras_counter_ue != ctx->ras_counter_ue) {
 		out->state.flags |= AMDGPU_CTX_QUERY2_FLAGS_RAS_UE;
-		ctx->ras_counter_ue = ras_counter;
+		ctx->ras_counter_ue = ras_counter_ue;
 	}
 
-	/*query ce count*/
-	ras_counter = amdgpu_ras_query_error_count(adev, true);
-	if (ras_counter != ctx->ras_counter_ce) {
+	if (ras_counter_ce != ctx->ras_counter_ce) {
 		out->state.flags |= AMDGPU_CTX_QUERY2_FLAGS_RAS_CE;
-		ctx->ras_counter_ce = ras_counter;
+		ctx->ras_counter_ce = ras_counter_ce;
 	}
 
 	mutex_unlock(&mgr->lock);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 337bf2da7bdc..109eff2869b9 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -861,15 +861,18 @@ int amdgpu_ras_error_cure(struct amdgpu_device *adev,
 }
 
 /* get the total error counts on all IPs */
-unsigned long amdgpu_ras_query_error_count(struct amdgpu_device *adev,
-		bool is_ce)
+void amdgpu_ras_query_error_count(struct amdgpu_device *adev,
+		unsigned long *ue_cnt, unsigned long *ce_cnt)
 {
 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
 	struct ras_manager *obj;
 	struct ras_err_data data = {0, 0};
 
+	*ue_cnt = 0;
+	*ce_cnt = 0;
+
 	if (!con)
-		return 0;
+		return;
 
 	list_for_each_entry(obj, &con->head, node) {
 		struct ras_query_if info = {
@@ -877,13 +880,14 @@ unsigned long amdgpu_ras_query_error_count(struct amdgpu_device *adev,
 		};
 
 		if (amdgpu_ras_error_query(adev, &info))
-			return 0;
+			continue;
 
 		data.ce_count += info.ce_count;
 		data.ue_count += info.ue_count;
 	}
 
-	return is_ce ? data.ce_count : data.ue_count;
+	*ue_cnt = data.ue_count;
+	*ce_cnt = data.ce_count;
 }
 /* query/inject/cure end */
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index e7df5d8429f8..733eab5bc512 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -487,8 +487,8 @@ int amdgpu_ras_request_reset_on_boot(struct amdgpu_device *adev,
 void amdgpu_ras_resume(struct amdgpu_device *adev);
 void amdgpu_ras_suspend(struct amdgpu_device *adev);
 
-unsigned long amdgpu_ras_query_error_count(struct amdgpu_device *adev,
-		bool is_ce);
+void amdgpu_ras_query_error_count(struct amdgpu_device *adev,
+		unsigned long *ue_cnt, unsigned long *ce_cnt);
 
 /* error handling functions */
 int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
--
2.17.1


More information about the amd-gfx mailing list