[PATCH 2/2] drm: add tdr support for embedded hw_fence

Christian König ckoenig.leichtzumerken at gmail.com
Fri Jul 23 08:45:49 UTC 2021


Am 23.07.21 um 09:07 schrieb Jingwen Chen:
> [SNIP]
> Hi Christian,
>
> The thing is, the VM flush fence has no job passed to amdgpu_fence_emit,
> so going through the jobs cannot help find the VM flush fence. And
> keeping the rest of the fences in the RCU array will lead to them being
> signaled in the ib_test right after the ASIC reset, and then signaled
> again during resubmission. What I'm doing here is just trying to clean
> up the fences without a parent job and make sure the rest of the fences
> won't be signaled twice.

It took me a moment to realize what you are talking about here.

This is for the KIQ! You need different handling for the KIQ than for 
the scheduler-controlled rings.

It is not only the flush jobs: the KIQ needs a complete reset as well, 
because of the register writes pushed through it.
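
Concretely, the reset path could special-case the KIQ instead of walking 
jobs; a rough sketch against the amdgpu reset loop (the surrounding code 
is assumed, names as in amdgpu):

    for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
        struct amdgpu_ring *ring = adev->rings[i];

        if (!ring || !ring->fence_drv.initialized)
            continue;

        /* The KIQ carries register writes and VM flushes without a
         * parent job, so nothing on it can be resubmitted; force
         * complete everything and bring the ring back up after the
         * ASIC reset instead.
         */
        if (ring->funcs->type == AMDGPU_RING_TYPE_KIQ) {
            amdgpu_fence_driver_force_completion(ring);
            continue;
        }

        /* scheduler-controlled rings are handled as before ... */
    }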

>> And please drop any usage of DMA_FENCE_FLAG_USER_BITS. That is not something
>> which should be abused for reset handling.
>>
> The DMA_FENCE_FLAG_USER_BITS here is to mark that this fence has a
> parent job. If this is not a proper usage, do you have any suggestions
> about how to identify whether a fence has a parent job?

You don't need to mark the fences at all. Everything on the KIQ ring 
needs to be force-completed, since none of the fences on that ring have 
a parent job.
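
Note that force completion already signals every outstanding fence on a 
ring by pretending the hardware reached the last emitted sequence 
number, so no per-fence bookkeeping is needed. From memory, the existing 
helper is essentially:

    void amdgpu_fence_driver_force_completion(struct amdgpu_ring *ring)
    {
        /* Write the last emitted seqno as if the hardware had
         * processed it ...
         */
        amdgpu_fence_write(ring, ring->fence_drv.sync_seq);
        /* ... and let the normal processing path signal all fences. */
        amdgpu_fence_process(ring);
    }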

Christian.

>
> Best Regards,
> JingWen Chen
>> Regards,
>> Christian.
>>
>>>
>>> Best Regards,
>>> JingWen Chen
>>>>> Andrey
>>>>>
>>>>>
>>>>>>>> +        }
>>>>>>>>              /* after all hw jobs are reset, hw fence is
>>>>>>>> meaningless, so force_completion */
>>>>>>>>              amdgpu_fence_driver_force_completion(ring);
>>>>>>>>          }
>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
>>>>>>>> index eecf21d8ec33..815776c9a013 100644
>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
>>>>>>>> @@ -156,11 +156,17 @@ int amdgpu_fence_emit(struct
>>>>>>>> amdgpu_ring *ring, struct dma_fence **f, struct amd
>>>>>>>>              job->ring = ring;
>>>>>>>>          }
>>>>>>>> -    seq = ++ring->fence_drv.sync_seq;
>>>>>>>> -    dma_fence_init(fence, &amdgpu_fence_ops,
>>>>>>>> -               &ring->fence_drv.lock,
>>>>>>>> -               adev->fence_context + ring->idx,
>>>>>>>> -               seq);
>>>>>>>> +    if (job != NULL && job->base.resubmit_flag == 1) {
>>>>>>>> +        /* reinit seq for resubmitted jobs */
>>>>>>>> +        seq = ++ring->fence_drv.sync_seq;
>>>>>>>> +        fence->seqno = seq;
>>>>>>>> +    } else {
>>>>>>>> +        seq = ++ring->fence_drv.sync_seq;
>>>>>>> Seems like you could do the above line only once above if-else
>>>>>>> as it was
>>>>>>> before
>>>>>> Sure, I will modify this.
>>>>>>
>>>>>>
>>>>>> Best Regards,
>>>>>> JingWen Chen
>>>>>>>> +        dma_fence_init(fence, &amdgpu_fence_ops,
>>>>>>>> +                &ring->fence_drv.lock,
>>>>>>>> +                adev->fence_context + ring->idx,
>>>>>>>> +                seq);
>>>>>>>> +    }
>>>>>>>>          if (job != NULL) {
>>>>>>>>              /* mark this fence has a parent job */
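
For reference, the refactor suggested above (hoisting the sequence bump 
out of the branch) would look roughly like this; a sketch against the 
quoted hunk, not a final patch:

    seq = ++ring->fence_drv.sync_seq;
    if (job != NULL && job->base.resubmit_flag == 1) {
        /* The embedded fence was already initialized on first
         * submission; only refresh its sequence number.
         */
        fence->seqno = seq;
    } else {
        dma_fence_init(fence, &amdgpu_fence_ops,
                       &ring->fence_drv.lock,
                       adev->fence_context + ring->idx,
                       seq);
    }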
>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>>>>>>>> index 7c426e225b24..d6f848adc3f4 100644
>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>>>>>>>> @@ -241,6 +241,7 @@ static struct dma_fence
>>>>>>>> *amdgpu_job_run(struct drm_sched_job *sched_job)
>>>>>>>>              dma_fence_set_error(finished, -ECANCELED);/* skip
>>>>>>>> IB as well if VRAM lost */
>>>>>>>>          if (finished->error < 0) {
>>>>>>>> +        dma_fence_put(&job->hw_fence);
>>>>>>>>              DRM_INFO("Skip scheduling IBs!\n");
>>>>>>>>          } else {
>>>>>>>>              r = amdgpu_ib_schedule(ring, job->num_ibs, job->ibs, job,
>>>>>>>> @@ -249,7 +250,8 @@ static struct dma_fence
>>>>>>>> *amdgpu_job_run(struct drm_sched_job *sched_job)
>>>>>>>>                  DRM_ERROR("Error scheduling IBs (%d)\n", r);
>>>>>>>>          }
>>>>>>>> -    dma_fence_get(fence);
>>>>>>>> +    if (!job->base.resubmit_flag)
>>>>>>>> +        dma_fence_get(fence);
>>>>>>>>          amdgpu_job_free_resources(job);
>>>>>>>>          fence = r ? ERR_PTR(r) : fence;
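
The refcount handling in this hunk relies on the embedded fence 
surviving the reset: the reference taken on first submission is still 
held across it, so a resubmission must not take another one. Spelled 
out with a comment:

    /* The reference taken on first submission is still held by the
     * embedded hw_fence across the reset; taking another one on
     * resubmission would leak it.
     */
    if (!job->base.resubmit_flag)
        dma_fence_get(fence);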
>>>>>>>> diff --git a/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>>> b/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>>> index f4f474944169..5a36ab5aea2d 100644
>>>>>>>> --- a/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>>> +++ b/drivers/gpu/drm/scheduler/sched_main.c
>>>>>>>> @@ -544,6 +544,7 @@ void drm_sched_resubmit_jobs_ext(struct
>>>>>>>> drm_gpu_scheduler *sched, int max)
>>>>>>>> dma_fence_set_error(&s_fence->finished, -ECANCELED);
>>>>>>>>              dma_fence_put(s_job->s_fence->parent);
>>>>>>>> +        s_job->resubmit_flag = 1;
>>>>>>>>              fence = sched->ops->run_job(s_job);
>>>>>>>>              i++;
>>>>>>>> diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h
>>>>>>>> index 4ea8606d91fe..06c101af1f71 100644
>>>>>>>> --- a/include/drm/gpu_scheduler.h
>>>>>>>> +++ b/include/drm/gpu_scheduler.h
>>>>>>>> @@ -198,6 +198,7 @@ struct drm_sched_job {
>>>>>>>>          enum drm_sched_priority        s_priority;
>>>>>>>>          struct drm_sched_entity         *entity;
>>>>>>>>          struct dma_fence_cb        cb;
>>>>>>>> +    int resubmit_flag;
>>>>>>>>      };
>>>>>>>>      static inline bool drm_sched_invalidate_job(struct
>>>>>>>> drm_sched_job *s_job,


