[PATCHv2 2/2] drm/amd/amdgpu: add tdr support for embedded hw_fence

Andrey Grodzovsky andrey.grodzovsky at amd.com
Mon Aug 9 16:24:37 UTC 2021


On 2021-08-05 4:31 a.m., Jingwen Chen wrote:
> [Why]
> After embedding the hw_fence into amdgpu_job, we need to add tdr support
> for this feature.
>
> [How]
> 1. Add a resubmit_flag for resubmit jobs.
> 2. Clear job fences from the RCU-protected fence array and force-complete
>     vm flush fences in pre_asic_reset.
> 3. Skip dma_fence_get for resubmit jobs and add a dma_fence_put
>     for guilty jobs.
> v2:
> Use a job_run_counter in amdgpu_job to replace the resubmit_flag in
> drm_sched_job. When job_run_counter >= 1, the job is a resubmitted
> job.
>
> Signed-off-by: Jack Zhang <Jack.Zhang7 at hotmail.com>
> Signed-off-by: Jingwen Chen <Jingwen.Chen2 at amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 12 +++++++++++-
>   drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c  | 13 +++++++++----
>   drivers/gpu/drm/amd/amdgpu/amdgpu_job.c    |  5 ++++-
>   drivers/gpu/drm/amd/amdgpu/amdgpu_job.h    |  3 +++
>   4 files changed, 27 insertions(+), 6 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 9e53ff851496..ade2fa07a50a 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -4447,7 +4447,7 @@ int amdgpu_device_mode1_reset(struct amdgpu_device *adev)
>   int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
>   				 struct amdgpu_reset_context *reset_context)
>   {
> -	int i, r = 0;
> +	int i, j, r = 0;
>   	struct amdgpu_job *job = NULL;
>   	bool need_full_reset =
>   		test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
> @@ -4471,6 +4471,16 @@ int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
>   		if (!ring || !ring->sched.thread)
>   			continue;
>   
> +		/*clear job fence from fence drv to avoid force_completion
> +		 *leave NULL and vm flush fence in fence drv */
> +		for (j = 0; j <= ring->fence_drv.num_fences_mask; j ++) {
> +			struct dma_fence *old,**ptr;
> +			ptr = &ring->fence_drv.fences[j];
> +			old = rcu_dereference_protected(*ptr, 1);
> +			if (old && test_bit(AMDGPU_FENCE_FLAG_EMBED_IN_JOB_BIT, &old->flags)) {
> +				RCU_INIT_POINTER(*ptr, NULL);
> +			}
> +		}
>   		/* after all hw jobs are reset, hw fence is meaningless, so force_completion */
>   		amdgpu_fence_driver_force_completion(ring);
>   	}
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
> index 5e29d797a265..c9752cf794fb 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
> @@ -159,10 +159,15 @@ int amdgpu_fence_emit(struct amdgpu_ring *ring, struct dma_fence **f, struct amd
>   	}
>   
>   	seq = ++ring->fence_drv.sync_seq;
> -	dma_fence_init(fence, &amdgpu_fence_ops,
> -		       &ring->fence_drv.lock,
> -		       adev->fence_context + ring->idx,
> -		       seq);
> +	if (job != NULL && job->job_run_counter) {
> +		/* reinit seq for resubmitted jobs */
> +		fence->seqno = seq;
> +	} else {
> +		dma_fence_init(fence, &amdgpu_fence_ops,
> +				&ring->fence_drv.lock,
> +				adev->fence_context + ring->idx,
> +				seq);
> +	}


I think this should actually be in the first patch (and the counter too);
without it the first patch is buggy.
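
To illustrate (my reading of patch 1, untested sketch): with the hw_fence
embedded in the job, a resubmitted job comes back into amdgpu_fence_emit()
with the same, still-referenced fence object, so patch 1 on its own ends up
doing

	/* patch 1 alone: this also runs for resubmitted jobs */
	dma_fence_init(fence, &amdgpu_fence_ops,
		       &ring->fence_drv.lock,
		       adev->fence_context + ring->idx,
		       seq);
	/* the refcount drops back to 1 and the fence state is wiped even
	 * though the scheduler still holds references from the first
	 * submission */

which is why the seqno-only path (and the counter) belongs in patch 1.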


>   
>   	if (job != NULL) {
>   		/* mark this fence has a parent job */
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> index 65a395060de2..19b13a65c73b 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> @@ -254,6 +254,7 @@ static struct dma_fence *amdgpu_job_run(struct drm_sched_job *sched_job)
>   		dma_fence_set_error(finished, -ECANCELED);/* skip IB as well if VRAM lost */
>   
>   	if (finished->error < 0) {
> +		dma_fence_put(&job->hw_fence);


I would put this check below, together with the job_run_counter check.
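
Something along these lines is what I have in mind (untested, just to
illustrate the placement):

	if (!job->job_run_counter)
		dma_fence_get(fence);
	else if (finished->error < 0)
		/* resubmitted job that got skipped as guilty/canceled */
		dma_fence_put(&job->hw_fence);
	job->job_run_counter++;

That keeps all of the hw_fence refcount handling in one place.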


>   		DRM_INFO("Skip scheduling IBs!\n");
>   	} else {
>   		r = amdgpu_ib_schedule(ring, job->num_ibs, job->ibs, job,
> @@ -262,7 +263,9 @@ static struct dma_fence *amdgpu_job_run(struct drm_sched_job *sched_job)
>   			DRM_ERROR("Error scheduling IBs (%d)\n", r);
>   	}
>   
> -	dma_fence_get(fence);
> +	if (!job->job_run_counter)
> +		dma_fence_get(fence);
> +	job->job_run_counter ++;
>   	amdgpu_job_free_resources(job);


Here you modify code you already changed in patch 1. It looks to me
like these two patches should be squashed into one, as the changes are
directly dependent and hard to follow otherwise.

Andrey


>   
>   	fence = r ? ERR_PTR(r) : fence;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
> index 92324c978534..1fa667f245e1 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
> @@ -64,6 +64,9 @@ struct amdgpu_job {
>   	/* user fence handling */
>   	uint64_t		uf_addr;
>   	uint64_t		uf_sequence;
> +
> +	/* job_run_counter >= 1 means a resubmit job */
> +	uint32_t		job_run_counter;
>   };
>   
>   int amdgpu_job_alloc(struct amdgpu_device *adev, unsigned num_ibs,

