[PATCH] drm/amdgpu: correct ras query as part of ctx query
Guchun Chen
guchun.chen at amd.com
Thu Jun 11 10:24:23 UTC 2020
Almost all error count registers are automatically cleared
after being read once, so both the CE and UE counts need to be
read in one loop.
Signed-off-by: Guchun Chen <guchun.chen at amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c | 16 +++++++---------
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 14 +++++++++-----
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 4 ++--
3 files changed, 18 insertions(+), 16 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
index c06cb06398b1..29fa6b6b9d3e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
@@ -335,7 +335,7 @@ static int amdgpu_ctx_query2(struct amdgpu_device *adev,
{
struct amdgpu_ctx *ctx;
struct amdgpu_ctx_mgr *mgr;
- unsigned long ras_counter;
+ unsigned long ras_counter_ue, ras_counter_ce;
if (!fpriv)
return -EINVAL;
@@ -360,19 +360,17 @@ static int amdgpu_ctx_query2(struct amdgpu_device *adev,
if (atomic_read(&ctx->guilty))
out->state.flags |= AMDGPU_CTX_QUERY2_FLAGS_GUILTY;
- /*query ue count*/
- ras_counter = amdgpu_ras_query_error_count(adev, false);
+ /*query both ue and ce count*/
+ amdgpu_ras_query_error_count(adev, &ras_counter_ue, &ras_counter_ce);
/*ras counter is monotonic increasing*/
- if (ras_counter != ctx->ras_counter_ue) {
+ if (ras_counter_ue != ctx->ras_counter_ue) {
out->state.flags |= AMDGPU_CTX_QUERY2_FLAGS_RAS_UE;
- ctx->ras_counter_ue = ras_counter;
+ ctx->ras_counter_ue = ras_counter_ue;
}
- /*query ce count*/
- ras_counter = amdgpu_ras_query_error_count(adev, true);
- if (ras_counter != ctx->ras_counter_ce) {
+ if (ras_counter_ce != ctx->ras_counter_ce) {
out->state.flags |= AMDGPU_CTX_QUERY2_FLAGS_RAS_CE;
- ctx->ras_counter_ce = ras_counter;
+ ctx->ras_counter_ce = ras_counter_ce;
}
mutex_unlock(&mgr->lock);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 337bf2da7bdc..109eff2869b9 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -861,15 +861,18 @@ int amdgpu_ras_error_cure(struct amdgpu_device *adev,
}
/* get the total error counts on all IPs */
-unsigned long amdgpu_ras_query_error_count(struct amdgpu_device *adev,
- bool is_ce)
+void amdgpu_ras_query_error_count(struct amdgpu_device *adev,
+ unsigned long *ue_cnt, unsigned long *ce_cnt)
{
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
struct ras_manager *obj;
struct ras_err_data data = {0, 0};
+ *ue_cnt = 0;
+ *ce_cnt = 0;
+
if (!con)
- return 0;
+ return;
list_for_each_entry(obj, &con->head, node) {
struct ras_query_if info = {
@@ -877,13 +880,14 @@ unsigned long amdgpu_ras_query_error_count(struct amdgpu_device *adev,
};
if (amdgpu_ras_error_query(adev, &info))
- return 0;
+ continue;
data.ce_count += info.ce_count;
data.ue_count += info.ue_count;
}
- return is_ce ? data.ce_count : data.ue_count;
+ *ue_cnt = data.ue_count;
+ *ce_cnt = data.ce_count;
}
/* query/inject/cure end */
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index e7df5d8429f8..733eab5bc512 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -487,8 +487,8 @@ int amdgpu_ras_request_reset_on_boot(struct amdgpu_device *adev,
void amdgpu_ras_resume(struct amdgpu_device *adev);
void amdgpu_ras_suspend(struct amdgpu_device *adev);
-unsigned long amdgpu_ras_query_error_count(struct amdgpu_device *adev,
- bool is_ce);
+void amdgpu_ras_query_error_count(struct amdgpu_device *adev,
+ unsigned long *ue_cnt, unsigned long *ce_cnt);
/* error handling functions */
int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
--
2.17.1
More information about the amd-gfx
mailing list