<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
<style type="text/css" style="display:none;"> P {margin-top:0;margin-bottom:0;} </style>
</head>
<body dir="ltr">
<p style="font-family:Arial;font-size:10pt;color:#008000;margin:15pt;" align="Left">
[Public]<br>
</p>
<br>
<div>
<div style="font-family: Calibri, Arial, Helvetica, sans-serif; font-size: 12pt; color: rgb(0, 0, 0);">
Reviewed-by: Alex Deucher &lt;alexander.deucher@amd.com&gt;<br>
</div>
<div id="appendonsend"></div>
<hr style="display:inline-block;width:98%" tabindex="-1">
<div id="divRplyFwdMsg" dir="ltr"><font face="Calibri, sans-serif" style="font-size:11pt" color="#000000"><b>From:</b> Tuikov, Luben &lt;Luben.Tuikov@amd.com&gt;<br>
<b>Sent:</b> Wednesday, May 26, 2021 12:43 PM<br>
<b>To:</b> amd-gfx@lists.freedesktop.org &lt;amd-gfx@lists.freedesktop.org&gt;<br>
<b>Cc:</b> Tuikov, Luben &lt;Luben.Tuikov@amd.com&gt;; Deucher, Alexander &lt;Alexander.Deucher@amd.com&gt;; Koenig, Christian &lt;Christian.Koenig@amd.com&gt;; Clements, John &lt;John.Clements@amd.com&gt;; Zhang, Hawking &lt;Hawking.Zhang@amd.com&gt;<br>
<b>Subject:</b> [PATCH 3/3] drm/amdgpu: Use delayed work to collect RAS error counters</font>
<div> </div>
</div>
<div class="BodyFragment"><font size="2"><span style="font-size:11pt;">
<div class="PlainText">On Context Query2 IOCTL return the correctable and<br>
uncorrectable errors in O(1) fashion, from cached<br>
values, and schedule a delayed work function to<br>
calculate and cache them for the next such IOCTL.<br>
<br>
v2: Cancel pending delayed work at ras_fini().<br>
v3: Remove conditionals when dealing with delayed<br>
work manipulation as they're inherently racy.<br>
<br>
Cc: Alexander Deucher &lt;Alexander.Deucher@amd.com&gt;<br>
Cc: Christian König &lt;christian.koenig@amd.com&gt;<br>
Cc: John Clements &lt;john.clements@amd.com&gt;<br>
Cc: Hawking Zhang &lt;Hawking.Zhang@amd.com&gt;<br>
Signed-off-by: Luben Tuikov &lt;luben.tuikov@amd.com&gt;<br>
---<br>
drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c | 31 +++++++++++++++++--<br>
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 40 +++++++++++++++++++++++++<br>
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 5 ++++<br>
3 files changed, 74 insertions(+), 2 deletions(-)<br>
<br>
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c<br>
index bb0cfe871aba..e7a010b7ca1f 100644<br>
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c<br>
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c<br>
@@ -331,10 +331,13 @@ static int amdgpu_ctx_query(struct amdgpu_device *adev,<br>
return 0;<br>
}<br>
<br>
+#define AMDGPU_RAS_COUNTE_DELAY_MS 3000<br>
+<br>
static int amdgpu_ctx_query2(struct amdgpu_device *adev,<br>
- struct amdgpu_fpriv *fpriv, uint32_t id,<br>
- union drm_amdgpu_ctx_out *out)<br>
+ struct amdgpu_fpriv *fpriv, uint32_t id,<br>
+ union drm_amdgpu_ctx_out *out)<br>
{<br>
+ struct amdgpu_ras *con = amdgpu_ras_get_context(adev);<br>
struct amdgpu_ctx *ctx;<br>
struct amdgpu_ctx_mgr *mgr;<br>
<br>
@@ -361,6 +364,30 @@ static int amdgpu_ctx_query2(struct amdgpu_device *adev,<br>
if (atomic_read(&ctx->guilty))<br>
out->state.flags |= AMDGPU_CTX_QUERY2_FLAGS_GUILTY;<br>
<br>
+ if (adev->ras_enabled && con) {<br>
+ /* Return the cached values in O(1),<br>
+ * and schedule delayed work to cache<br>
+ * new values.<br>
+ */<br>
+ int ce_count, ue_count;<br>
+<br>
+ ce_count = atomic_read(&con->ras_ce_count);<br>
+ ue_count = atomic_read(&con->ras_ue_count);<br>
+<br>
+ if (ce_count != ctx->ras_counter_ce) {<br>
+ ctx->ras_counter_ce = ce_count;<br>
+ out->state.flags |= AMDGPU_CTX_QUERY2_FLAGS_RAS_CE;<br>
+ }<br>
+<br>
+ if (ue_count != ctx->ras_counter_ue) {<br>
+ ctx->ras_counter_ue = ue_count;<br>
+ out->state.flags |= AMDGPU_CTX_QUERY2_FLAGS_RAS_UE;<br>
+ }<br>
+<br>
+ schedule_delayed_work(&con->ras_counte_delay_work,<br>
+ msecs_to_jiffies(AMDGPU_RAS_COUNTE_DELAY_MS));<br>
+ }<br>
+<br>
mutex_unlock(&mgr->lock);<br>
return 0;<br>
}<br>
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c<br>
index ed3c43e8b0b5..ec936cde2726 100644<br>
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c<br>
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c<br>
@@ -27,6 +27,7 @@<br>
#include &lt;linux/uaccess.h&gt;<br>
#include &lt;linux/reboot.h&gt;<br>
#include &lt;linux/syscalls.h&gt;<br>
+#include &lt;linux/pm_runtime.h&gt;<br>
<br>
#include "amdgpu.h"<br>
#include "amdgpu_ras.h"<br>
@@ -2116,6 +2117,30 @@ static void amdgpu_ras_check_supported(struct amdgpu_device *adev)<br>
adev->ras_hw_enabled & amdgpu_ras_mask;<br>
}<br>
<br>
+static void amdgpu_ras_counte_dw(struct work_struct *work)<br>
+{<br>
+ struct amdgpu_ras *con = container_of(work, struct amdgpu_ras,<br>
+ ras_counte_delay_work.work);<br>
+ struct amdgpu_device *adev = con->adev;<br>
+ struct drm_device *dev = &adev->ddev;<br>
+ unsigned long ce_count, ue_count;<br>
+ int res;<br>
+<br>
+ res = pm_runtime_get_sync(dev->dev);<br>
+ if (res < 0)<br>
+ goto Out;<br>
+<br>
+ /* Cache new values.<br>
+ */<br>
+ amdgpu_ras_query_error_count(adev, &ce_count, &ue_count);<br>
+ atomic_set(&con->ras_ce_count, ce_count);<br>
+ atomic_set(&con->ras_ue_count, ue_count);<br>
+<br>
+ pm_runtime_mark_last_busy(dev->dev);<br>
+Out:<br>
+ pm_runtime_put_autosuspend(dev->dev);<br>
+}<br>
+<br>
int amdgpu_ras_init(struct amdgpu_device *adev)<br>
{<br>
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);<br>
@@ -2130,6 +2155,11 @@ int amdgpu_ras_init(struct amdgpu_device *adev)<br>
if (!con)<br>
return -ENOMEM;<br>
<br>
+ con->adev = adev;<br>
+ INIT_DELAYED_WORK(&con->ras_counte_delay_work, amdgpu_ras_counte_dw);<br>
+ atomic_set(&con->ras_ce_count, 0);<br>
+ atomic_set(&con->ras_ue_count, 0);<br>
+<br>
con->objs = (struct ras_manager *)(con + 1);<br>
<br>
amdgpu_ras_set_context(adev, con);<br>
@@ -2233,6 +2263,8 @@ int amdgpu_ras_late_init(struct amdgpu_device *adev,<br>
struct ras_fs_if *fs_info,<br>
struct ras_ih_if *ih_info)<br>
{<br>
+ struct amdgpu_ras *con = amdgpu_ras_get_context(adev);<br>
+ unsigned long ue_count, ce_count;<br>
int r;<br>
<br>
/* disable RAS feature per IP block if it is not supported */<br>
@@ -2273,6 +2305,12 @@ int amdgpu_ras_late_init(struct amdgpu_device *adev,<br>
if (r)<br>
goto sysfs;<br>
<br>
+ /* Those are the cached values at init.<br>
+ */<br>
+ amdgpu_ras_query_error_count(adev, &ce_count, &ue_count);<br>
+ atomic_set(&con->ras_ce_count, ce_count);<br>
+ atomic_set(&con->ras_ue_count, ue_count);<br>
+<br>
return 0;<br>
cleanup:<br>
amdgpu_ras_sysfs_remove(adev, ras_block);<br>
@@ -2390,6 +2428,8 @@ int amdgpu_ras_fini(struct amdgpu_device *adev)<br>
if (con->features)<br>
amdgpu_ras_disable_all_features(adev, 1);<br>
<br>
+ cancel_delayed_work_sync(&con->ras_counte_delay_work);<br>
+<br>
amdgpu_ras_set_context(adev, NULL);<br>
kfree(con);<br>
<br>
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h<br>
index 10fca0393106..256cea5d34f2 100644<br>
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h<br>
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h<br>
@@ -340,6 +340,11 @@ struct amdgpu_ras {<br>
<br>
/* disable ras error count harvest in recovery */<br>
bool disable_ras_err_cnt_harvest;<br>
+<br>
+ /* RAS count errors delayed work */<br>
+ struct delayed_work ras_counte_delay_work;<br>
+ atomic_t ras_ue_count;<br>
+ atomic_t ras_ce_count;<br>
};<br>
<br>
struct ras_fs_data {<br>
-- <br>
2.31.1.527.g2d677e5b15<br>
<br>
</div>
</span></font></div>
</div>
</body>
</html>