[PATCH 5/5] drm/amdgpu: Follow up change to previous drm scheduler change.

Christian König ckoenig.leichtzumerken at gmail.com
Tue Jun 21 07:28:25 UTC 2022


Am 21.06.22 um 00:03 schrieb Andrey Grodzovsky:
> Align refcount behaviour for amdgpu_job embedded HW fence with
> classic pointer style HW fences by increasing refcount each
> time emit is called so amdgpu code doesn't need to make workarounds
> using amdgpu_job.job_run_counter to keep the HW fence refcount balanced.

Could we now also remove job_run_counter?

Christian.

>
> Also since in the previous patch we resumed setting s_fence->parent to NULL
> in drm_sched_stop switch to directly checking if job->hw_fence is
> signaled to short circuit reset if already signed.
>
> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky at amd.com>
> Tested-by: Yiqing Yao <yiqing.yao at amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c |  2 ++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 23 ++++++++++++++++------
>   drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c  |  7 ++++++-
>   drivers/gpu/drm/amd/amdgpu/amdgpu_job.c    |  4 ----
>   4 files changed, 25 insertions(+), 11 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> index 513c57f839d8..447bd92c4856 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> @@ -684,6 +684,8 @@ int amdgpu_amdkfd_submit_ib(struct amdgpu_device *adev,
>   		goto err_ib_sched;
>   	}
>   
> +	/* Drop the initial kref_init count (see drm_sched_main as example) */
> +	dma_fence_put(f);
>   	ret = dma_fence_wait(f, false);
>   
>   err_ib_sched:
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index c99541685804..f9718119834f 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -5009,16 +5009,28 @@ static void amdgpu_device_recheck_guilty_jobs(
>   
>   		/* clear job's guilty and depend the folowing step to decide the real one */
>   		drm_sched_reset_karma(s_job);
> -		/* for the real bad job, it will be resubmitted twice, adding a dma_fence_get
> -		 * to make sure fence is balanced */
> -		dma_fence_get(s_job->s_fence->parent);
>   		drm_sched_resubmit_jobs_ext(&ring->sched, 1);
>   
> +		if (!s_job->s_fence->parent) {
> +			DRM_WARN("Failed to get a HW fence for job!");
> +			continue;
> +		}
> +
>   		ret = dma_fence_wait_timeout(s_job->s_fence->parent, false, ring->sched.timeout);
>   		if (ret == 0) { /* timeout */
>   			DRM_ERROR("Found the real bad job! ring:%s, job_id:%llx\n",
>   						ring->sched.name, s_job->id);
>   
> +
> +			/* Clear this failed job from fence array */
> +			amdgpu_fence_driver_clear_job_fences(ring);
> +
> +			/* Since the job won't signal and we go for
> +			 * another resubmit drop this parent pointer
> +			 */
> +			dma_fence_put(s_job->s_fence->parent);
> +			s_job->s_fence->parent = NULL;
> +
>   			/* set guilty */
>   			drm_sched_increase_karma(s_job);
>   retry:
> @@ -5047,7 +5059,6 @@ static void amdgpu_device_recheck_guilty_jobs(
>   
>   		/* got the hw fence, signal finished fence */
>   		atomic_dec(ring->sched.score);
> -		dma_fence_put(s_job->s_fence->parent);
>   		dma_fence_get(&s_job->s_fence->finished);
>   		dma_fence_signal(&s_job->s_fence->finished);
>   		dma_fence_put(&s_job->s_fence->finished);
> @@ -5220,8 +5231,8 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>   	 *
>   	 * job->base holds a reference to parent fence
>   	 */
> -	if (job && job->base.s_fence->parent &&
> -	    dma_fence_is_signaled(job->base.s_fence->parent)) {
> +	if (job && (job->hw_fence.ops != NULL) &&
> +	    dma_fence_is_signaled(&job->hw_fence)) {
>   		job_signaled = true;
>   		dev_info(adev->dev, "Guilty job already signaled, skipping HW reset");
>   		goto skip_hw_reset;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
> index d6d54ba4c185..9bd4e18212fc 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
> @@ -164,11 +164,16 @@ int amdgpu_fence_emit(struct amdgpu_ring *ring, struct dma_fence **f, struct amd
>   	if (job && job->job_run_counter) {
>   		/* reinit seq for resubmitted jobs */
>   		fence->seqno = seq;
> +		/* TO be inline with external fence creation and other drivers */
> +		dma_fence_get(fence);
>   	} else {
> -		if (job)
> +		if (job) {
>   			dma_fence_init(fence, &amdgpu_job_fence_ops,
>   				       &ring->fence_drv.lock,
>   				       adev->fence_context + ring->idx, seq);
> +			/* Against remove in amdgpu_job_{free, free_cb} */
> +			dma_fence_get(fence);
> +		}
>   		else
>   			dma_fence_init(fence, &amdgpu_fence_ops,
>   				       &ring->fence_drv.lock,
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> index 58568fdde2d0..638e1d600258 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> @@ -267,10 +267,6 @@ static struct dma_fence *amdgpu_job_run(struct drm_sched_job *sched_job)
>   			DRM_ERROR("Error scheduling IBs (%d)\n", r);
>   	}
>   
> -	if (!job->job_run_counter)
> -		dma_fence_get(fence);
> -	else if (finished->error < 0)
> -		dma_fence_put(&job->hw_fence);
>   	job->job_run_counter++;
>   	amdgpu_job_free_resources(job);
>   



More information about the dri-devel mailing list