<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=us-ascii">
<style type="text/css" style="display:none;"> P {margin-top:0;margin-bottom:0;} </style>
</head>
<body dir="ltr">
<p style="font-family:Arial;font-size:10pt;color:#0000FF;margin:5pt;font-style:normal;font-weight:normal;text-decoration:none;" align="Left">
[AMD Official Use Only - General]<br>
</p>
<br>
<div>
<div style="font-family: Aptos, Aptos_EmbeddedFont, Aptos_MSFontService, Calibri, Helvetica, sans-serif; font-size: 12pt; color: rgb(0, 0, 0);" class="elementToProof">
Ping?</div>
<div style="font-family: Aptos, Aptos_EmbeddedFont, Aptos_MSFontService, Calibri, Helvetica, sans-serif; font-size: 12pt; color: rgb(0, 0, 0);" class="elementToProof">
<br>
</div>
<div id="appendonsend"></div>
<hr style="display:inline-block;width:98%" tabindex="-1">
<div id="divRplyFwdMsg" dir="ltr"><font face="Calibri, sans-serif" style="font-size:11pt" color="#000000"><b>From:</b> Deucher, Alexander &lt;Alexander.Deucher@amd.com&gt;<br>
<b>Sent:</b> Thursday, January 25, 2024 11:15 AM<br>
<b>To:</b> amd-gfx@lists.freedesktop.org &lt;amd-gfx@lists.freedesktop.org&gt;<br>
<b>Cc:</b> Ma, Jun &lt;Jun.Ma2@amd.com&gt;; Deucher, Alexander &lt;Alexander.Deucher@amd.com&gt;; Prosyak, Vitaly &lt;Vitaly.Prosyak@amd.com&gt;<br>
<b>Subject:</b> [PATCH] drm/amdgpu: Fix the warning info in mode1 reset</font>
<div> </div>
</div>
<div class="BodyFragment"><font size="2"><span style="font-size:11pt;">
<div class="PlainText">From: Ma Jun &lt;Jun.Ma2@amd.com&gt;<br>
<br>
Fix the warning info below during mode1 reset.<br>
[ +0.000004] Call Trace:<br>
[ +0.000004] &lt;TASK&gt;<br>
[ +0.000006] ? show_regs+0x6e/0x80<br>
[ +0.000011] ? __flush_work.isra.0+0x2e8/0x390<br>
[ +0.000005] ? __warn+0x91/0x150<br>
[ +0.000009] ? __flush_work.isra.0+0x2e8/0x390<br>
[ +0.000006] ? report_bug+0x19d/0x1b0<br>
[ +0.000013] ? handle_bug+0x46/0x80<br>
[ +0.000012] ? exc_invalid_op+0x1d/0x80<br>
[ +0.000011] ? asm_exc_invalid_op+0x1f/0x30<br>
[ +0.000014] ? __flush_work.isra.0+0x2e8/0x390<br>
[ +0.000007] ? __flush_work.isra.0+0x208/0x390<br>
[ +0.000007] ? _prb_read_valid+0x216/0x290<br>
[ +0.000008] __cancel_work_timer+0x11d/0x1a0<br>
[ +0.000007] ? try_to_grab_pending+0xe8/0x190<br>
[ +0.000012] cancel_work_sync+0x14/0x20<br>
[ +0.000008] amddrm_sched_stop+0x3c/0x1d0 [amd_sched]<br>
[ +0.000032] amdgpu_device_gpu_recover+0x29a/0xe90 [amdgpu]<br>
<br>
This warning info was printed after applying the patch<br>
"drm/sched: Convert drm scheduler to use a work queue rather than kthread".<br>
The root cause is that the amdgpu driver tries to use the uninitialized<br>
work_struct in struct drm_gpu_scheduler.<br>
<br>
v2:<br>
- Rename the function to amdgpu_ring_sched_ready and move it to<br>
amdgpu_ring.c (Alex)<br>
v3:<br>
- Fix a few more checks based on Vitaly's patch (Alex)<br>
<br>
Fixes: 11b3b9f461c5 ("drm/sched: Check scheduler ready before calling timeout handling")<br>
Reviewed-by: Alex Deucher &lt;alexander.deucher@amd.com&gt;<br>
Signed-off-by: Vitaly Prosyak &lt;vitaly.prosyak@amd.com&gt;<br>
Signed-off-by: Ma Jun &lt;Jun.Ma2@amd.com&gt;<br>
Signed-off-by: Alex Deucher &lt;alexander.deucher@amd.com&gt;<br>
---<br>
.../gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c | 2 +-<br>
drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 8 ++++----<br>
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 12 ++++++------<br>
drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c | 14 +++++++++++++-<br>
drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h | 2 +-<br>
5 files changed, 25 insertions(+), 13 deletions(-)<br>
<br>
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c<br>
index 899e31e3a5e8..3a3f3ce09f00 100644<br>
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c<br>
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c<br>
@@ -290,7 +290,7 @@ static int suspend_resume_compute_scheduler(struct amdgpu_device *adev, bool sus<br>
for (i = 0; i < adev->gfx.num_compute_rings; i++) {<br>
struct amdgpu_ring *ring = &adev->gfx.compute_ring[i];<br>
<br>
- if (!(ring && drm_sched_wqueue_ready(&ring->sched)))<br>
+ if (!amdgpu_ring_sched_ready(ring))<br>
continue;<br>
<br>
/* stop secheduler and drain ring. */<br>
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c<br>
index e485dd3357c6..1afbb2e932c6 100644<br>
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c<br>
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c<br>
@@ -1678,7 +1678,7 @@ static int amdgpu_debugfs_test_ib_show(struct seq_file *m, void *unused)<br>
for (i = 0; i < AMDGPU_MAX_RINGS; i++) {<br>
struct amdgpu_ring *ring = adev->rings[i];<br>
<br>
- if (!ring || !drm_sched_wqueue_ready(&ring->sched))<br>
+ if (!amdgpu_ring_sched_ready(ring))<br>
continue;<br>
drm_sched_wqueue_stop(&ring->sched);<br>
}<br>
@@ -1694,7 +1694,7 @@ static int amdgpu_debugfs_test_ib_show(struct seq_file *m, void *unused)<br>
for (i = 0; i < AMDGPU_MAX_RINGS; i++) {<br>
struct amdgpu_ring *ring = adev->rings[i];<br>
<br>
- if (!ring || !drm_sched_wqueue_ready(&ring->sched))<br>
+ if (!amdgpu_ring_sched_ready(ring))<br>
continue;<br>
drm_sched_wqueue_start(&ring->sched);<br>
}<br>
@@ -1916,8 +1916,8 @@ static int amdgpu_debugfs_ib_preempt(void *data, u64 val)<br>
<br>
ring = adev->rings[val];<br>
<br>
- if (!ring || !ring->funcs->preempt_ib ||<br>
- !drm_sched_wqueue_ready(&ring->sched))<br>
+ if (!amdgpu_ring_sched_ready(ring) ||<br>
+ !ring->funcs->preempt_ib)<br>
return -EINVAL;<br>
<br>
/* the last preemption failed */<br>
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c<br>
index 1a04ccba9542..7ff17df7a5ce 100644<br>
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c<br>
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c<br>
@@ -5042,7 +5042,7 @@ bool amdgpu_device_has_job_running(struct amdgpu_device *adev)<br>
for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {<br>
struct amdgpu_ring *ring = adev->rings[i];<br>
<br>
- if (!ring || !drm_sched_wqueue_ready(&ring->sched))<br>
+ if (!amdgpu_ring_sched_ready(ring))<br>
continue;<br>
<br>
spin_lock(&ring->sched.job_list_lock);<br>
@@ -5181,7 +5181,7 @@ int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,<br>
for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {<br>
struct amdgpu_ring *ring = adev->rings[i];<br>
<br>
- if (!ring || !drm_sched_wqueue_ready(&ring->sched))<br>
+ if (!amdgpu_ring_sched_ready(ring))<br>
continue;<br>
<br>
/* Clear job fence from fence drv to avoid force_completion<br>
@@ -5648,7 +5648,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,<br>
for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {<br>
struct amdgpu_ring *ring = tmp_adev->rings[i];<br>
<br>
- if (!ring || !drm_sched_wqueue_ready(&ring->sched))<br>
+ if (!amdgpu_ring_sched_ready(ring))<br>
continue;<br>
<br>
drm_sched_stop(&ring->sched, job ? &job->base : NULL);<br>
@@ -5717,7 +5717,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,<br>
for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {<br>
struct amdgpu_ring *ring = tmp_adev->rings[i];<br>
<br>
- if (!ring || !drm_sched_wqueue_ready(&ring->sched))<br>
+ if (!amdgpu_ring_sched_ready(ring))<br>
continue;<br>
<br>
drm_sched_start(&ring->sched, true);<br>
@@ -6072,7 +6072,7 @@ pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_sta<br>
for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {<br>
struct amdgpu_ring *ring = adev->rings[i];<br>
<br>
- if (!ring || !drm_sched_wqueue_ready(&ring->sched))<br>
+ if (!amdgpu_ring_sched_ready(ring))<br>
continue;<br>
<br>
drm_sched_stop(&ring->sched, NULL);<br>
@@ -6214,7 +6214,7 @@ void amdgpu_pci_resume(struct pci_dev *pdev)<br>
for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {<br>
struct amdgpu_ring *ring = adev->rings[i];<br>
<br>
- if (!ring || !drm_sched_wqueue_ready(&ring->sched))<br>
+ if (!amdgpu_ring_sched_ready(ring))<br>
continue;<br>
<br>
drm_sched_start(&ring->sched, true);<br>
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c<br>
index 45424ebf9681..9ae386e9d41d 100644<br>
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c<br>
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c<br>
@@ -634,7 +634,8 @@ int amdgpu_ring_test_helper(struct amdgpu_ring *ring)<br>
DRM_DEV_DEBUG(adev->dev, "ring test on %s succeeded\n",<br>
ring->name);<br>
<br>
- ring->sched.ready = !r;<br>
+ if (!ring->no_scheduler)<br>
+ ring->sched.ready = !r;<br>
return r;<br>
}<br>
<br>
@@ -717,3 +718,14 @@ void amdgpu_ring_ib_on_emit_de(struct amdgpu_ring *ring)<br>
if (ring->is_sw_ring)<br>
amdgpu_sw_ring_ib_mark_offset(ring, AMDGPU_MUX_OFFSET_TYPE_DE);<br>
}<br>
+<br>
+bool amdgpu_ring_sched_ready(struct amdgpu_ring *ring)<br>
+{<br>
+ if (!ring)<br>
+ return false;<br>
+<br>
+ if (ring->no_scheduler || !drm_sched_wqueue_ready(&ring->sched))<br>
+ return false;<br>
+<br>
+ return true;<br>
+}<br>
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h<br>
index bbb53720a018..fe1a61eb6e4c 100644<br>
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h<br>
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h<br>
@@ -450,5 +450,5 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,<br>
int amdgpu_ib_pool_init(struct amdgpu_device *adev);<br>
void amdgpu_ib_pool_fini(struct amdgpu_device *adev);<br>
int amdgpu_ib_ring_tests(struct amdgpu_device *adev);<br>
-<br>
+bool amdgpu_ring_sched_ready(struct amdgpu_ring *ring);<br>
#endif<br>
-- <br>
2.42.0<br>
<br>
</div>
</span></font></div>
</div>
</body>
</html>