[PATCH] drm/amdgpu: fix refcount underflow in device reset
Yiqing Yao
yiqing.yao at amd.com
Wed Jun 1 13:35:23 UTC 2022
[why]
A gfx job may be processed but not finished when reset begin from
compute job timeout. drm_sched_resubmit_jobs_ext in sched_main
assume submitted job unsignaled and always put parent fence.
Resubmission for that job cause underflow. This fix is done in
device reset to avoid changing drm sched_main.
[how]
Check if the job to submit has signaled and avoid submission if
signaled in device reset for both advanced TDR and normal job
resume.
Signed-off-by: Yiqing Yao <yiqing.yao at amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 72 ++++++++++++----------
1 file changed, 41 insertions(+), 31 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index f16f105a737b..29b307af97eb 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -4980,39 +4980,43 @@ static void amdgpu_device_recheck_guilty_jobs(
/* for the real bad job, it will be resubmitted twice, adding a dma_fence_get
* to make sure fence is balanced */
dma_fence_get(s_job->s_fence->parent);
- drm_sched_resubmit_jobs_ext(&ring->sched, 1);
- ret = dma_fence_wait_timeout(s_job->s_fence->parent, false, ring->sched.timeout);
- if (ret == 0) { /* timeout */
- DRM_ERROR("Found the real bad job! ring:%s, job_id:%llx\n",
- ring->sched.name, s_job->id);
+ /* avoid submission for signaled hw fence */
+ if(!dma_fence_is_signaled(s_job->s_fence->parent)){
- /* set guilty */
- drm_sched_increase_karma(s_job);
+ drm_sched_resubmit_jobs_ext(&ring->sched, 1);
+
+ ret = dma_fence_wait_timeout(s_job->s_fence->parent, false, ring->sched.timeout);
+ if (ret == 0) { /* timeout */
+ DRM_ERROR("Found the real bad job! ring:%s, job_id:%llx\n",
+ ring->sched.name, s_job->id);
+
+ /* set guilty */
+ drm_sched_increase_karma(s_job);
retry:
- /* do hw reset */
- if (amdgpu_sriov_vf(adev)) {
- amdgpu_virt_fini_data_exchange(adev);
- r = amdgpu_device_reset_sriov(adev, false);
- if (r)
- adev->asic_reset_res = r;
- } else {
- clear_bit(AMDGPU_SKIP_HW_RESET,
- &reset_context->flags);
- r = amdgpu_do_asic_reset(device_list_handle,
- reset_context);
- if (r && r == -EAGAIN)
- goto retry;
- }
+ /* do hw reset */
+ if (amdgpu_sriov_vf(adev)) {
+ amdgpu_virt_fini_data_exchange(adev);
+ r = amdgpu_device_reset_sriov(adev, false);
+ if (r)
+ adev->asic_reset_res = r;
+ } else {
+ clear_bit(AMDGPU_SKIP_HW_RESET,
+ &reset_context->flags);
+ r = amdgpu_do_asic_reset(device_list_handle,
+ reset_context);
+ if (r && r == -EAGAIN)
+ goto retry;
+ }
- /*
- * add reset counter so that the following
- * resubmitted job could flush vmid
- */
- atomic_inc(&adev->gpu_reset_counter);
- continue;
+ /*
+ * add reset counter so that the following
+ * resubmitted job could flush vmid
+ */
+ atomic_inc(&adev->gpu_reset_counter);
+ continue;
+ }
}
-
/* got the hw fence, signal finished fence */
atomic_dec(ring->sched.score);
dma_fence_put(s_job->s_fence->parent);
@@ -5221,13 +5225,19 @@ int amdgpu_device_gpu_recover_imp(struct amdgpu_device *adev,
for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
struct amdgpu_ring *ring = tmp_adev->rings[i];
-
+ struct drm_sched_job *s_job;
if (!ring || !ring->sched.thread)
continue;
+ s_job = list_first_entry_or_null(&ring->sched.pending_list,
+ struct drm_sched_job, list);
+
+ if(s_job){
/* No point to resubmit jobs if we didn't HW reset*/
- if (!tmp_adev->asic_reset_res && !job_signaled)
- drm_sched_resubmit_jobs(&ring->sched);
+ if (!tmp_adev->asic_reset_res && !job_signaled
+ && !dma_fence_is_signaled(s_job->s_fence->parent))
+ drm_sched_resubmit_jobs(&ring->sched);
+ }
drm_sched_start(&ring->sched, !tmp_adev->asic_reset_res);
}
--
2.25.1
More information about the amd-gfx
mailing list