[PATCH] drm/amdgpu: fix refcount underflow in device reset

Mon Jun 6 17:18:12 UTC 2022

On 2022-06-06 03:43, Yiqing Yao wrote:
> [why]
> A gfx job may have been processed but not yet finished when a reset begins
> from a compute job timeout. drm_sched_resubmit_jobs_ext in sched_main
> assumes the submitted job is unsignaled and always puts the parent fence.
> Resubmitting that job causes a refcount underflow. This fix is done in
> device reset to avoid changing drm sched_main.

Are we talking about amdgpu_fence_process sneaking in here just before you
call drm_sched_resubmit_jobs_ext->dma_fence_put(parent) and doing an extra put?

How about first remove the fence in question from &drv->fences like
it's done in

>
> [how]
> Check whether the job to be submitted has signaled, and skip submission
> if it has, in device reset for both advanced TDR and normal job
> resume.

If what I said above is the problem, then this is a racy solution, no?
The fence can signal right after your check anyway.

How about first removing this fence from the drv->fences array and only then
checking whether it has signaled? If it has signaled, you can just skip
resubmission; otherwise, you can go ahead and call resubmit without worrying
about a double put.

Andrey

>
> Signed-off-by: Yiqing Yao <yiqing.yao at amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 72 ++++++++++++----------
>   1 file changed, 41 insertions(+), 31 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index f16f105a737b..29b307af97eb 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -4980,39 +4980,43 @@ static void amdgpu_device_recheck_guilty_jobs(
>   		/* for the real bad job, it will be resubmitted twice, adding a dma_fence_get
>   		 * to make sure fence is balanced */
>   		dma_fence_get(s_job->s_fence->parent);
> -		drm_sched_resubmit_jobs_ext(&ring->sched, 1);
>   
> -		ret = dma_fence_wait_timeout(s_job->s_fence->parent, false, ring->sched.timeout);
> -		if (ret == 0) { /* timeout */
> -			DRM_ERROR("Found the real bad job! ring:%s, job_id:%llx\n",
> -						ring->sched.name, s_job->id);
> +		/* avoid submission for signaled hw fence */
> +		if(!dma_fence_is_signaled(s_job->s_fence->parent)){
>   
> -			/* set guilty */
> -			drm_sched_increase_karma(s_job);
> +			drm_sched_resubmit_jobs_ext(&ring->sched, 1);
> +
> +			ret = dma_fence_wait_timeout(s_job->s_fence->parent, false, ring->sched.timeout);
> +			if (ret == 0) { /* timeout */
> +				DRM_ERROR("Found the real bad job! ring:%s, job_id:%llx\n",
> +							ring->sched.name, s_job->id);
> +
> +				/* set guilty */
> +				drm_sched_increase_karma(s_job);
>   retry:
> -			/* do hw reset */
> -			if (amdgpu_sriov_vf(adev)) {
> -				amdgpu_virt_fini_data_exchange(adev);
> -				r = amdgpu_device_reset_sriov(adev, false);
> -				if (r)
> -					adev->asic_reset_res = r;
> -			} else {
> -				clear_bit(AMDGPU_SKIP_HW_RESET,
> -					  &reset_context->flags);
> -				r = amdgpu_do_asic_reset(device_list_handle,
> -							 reset_context);
> -				if (r && r == -EAGAIN)
> -					goto retry;
> -			}
> +				/* do hw reset */
> +				if (amdgpu_sriov_vf(adev)) {
> +					amdgpu_virt_fini_data_exchange(adev);
> +					r = amdgpu_device_reset_sriov(adev, false);
> +					if (r)
> +						adev->asic_reset_res = r;
> +				} else {
> +					clear_bit(AMDGPU_SKIP_HW_RESET,
> +						&reset_context->flags);
> +					r = amdgpu_do_asic_reset(device_list_handle,
> +								reset_context);
> +					if (r && r == -EAGAIN)
> +						goto retry;
> +				}
>   
> -			/*
> -			 * add reset counter so that the following
> -			 * resubmitted job could flush vmid
> -			 */
> -			atomic_inc(&adev->gpu_reset_counter);
> -			continue;
> +				/*
> +				* add reset counter so that the following
> +				* resubmitted job could flush vmid
> +				*/
> +				atomic_inc(&adev->gpu_reset_counter);
> +				continue;
> +			}
>   		}
> -
>   		/* got the hw fence, signal finished fence */
>   		atomic_dec(ring->sched.score);
>   		dma_fence_put(s_job->s_fence->parent);
> @@ -5221,13 +5225,19 @@ int amdgpu_device_gpu_recover_imp(struct amdgpu_device *adev,
>   
>   		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>   			struct amdgpu_ring *ring = tmp_adev->rings[i];
> -
> +			struct drm_sched_job *s_job;
>   			if (!ring || !ring->sched.thread)
>   				continue;
>   
> +			s_job = list_first_entry_or_null(&ring->sched.pending_list,
> +				struct drm_sched_job, list);
> +
> +			if(s_job){
>   			/* No point to resubmit jobs if we didn't HW reset*/
> -			if (!tmp_adev->asic_reset_res && !job_signaled)
> -				drm_sched_resubmit_jobs(&ring->sched);
> +				if (!tmp_adev->asic_reset_res && !job_signaled
> +					&& !dma_fence_is_signaled(s_job->s_fence->parent))
> +					drm_sched_resubmit_jobs(&ring->sched);
> +			}
>   
>   			drm_sched_start(&ring->sched, !tmp_adev->asic_reset_res);
>   		}