[PATCH 5/5] drm/amdgpu: Follow up change to previous drm scheduler change.

Tue Jun 21 20:00:19 UTC 2022

On 2022-06-21 03:28, Christian König wrote:
> Am 21.06.22 um 00:03 schrieb Andrey Grodzovsky:
>> Align refcount behaviour for amdgpu_job embedded HW fence with
>> classic pointer style HW fences by increasing refcount each
>> time emit is called so amdgpu code doesn't need to make workarounds
>> using amdgpu_job.job_run_counter to keep the HW fence refcount balanced.
>
> Could we now also remove job_run_counter?
>
> Christian.

I am afraid not, job counter is needed since at all times the refcount 
on the
embedded fence cannot drop to zero because this will free the job itself 
before
the end of it's life cycle. We have to be able to differentiate in 
amdgpu_fence_emit
between first ever call where we init the embedded fence's refcount from 
scratch using kref_init
to repeating calls when refcount already > 0 and we just fake increase 
the refcount to align
behavior with pointer style fences in other drivers.

I guess we could assume that embedded fence is all zeroes before first 
dma_fence_init  if assuming the job itself
was allocated using kzalloc and so u can look at dma_fence_ops == NULL 
or maybe seqno == 0
as a hint if that the fist call or not but it's a risky assumption in my 
opinion.

Andrey

>
>>
>> Also since in the previous patch we resumed setting s_fence->parent 
>> to NULL
>> in drm_sched_stop switch to directly checking if job->hw_fence is
>> signaled to short circuit reset if already signed.
>>
>> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky at amd.com>
>> Tested-by: Yiqing Yao <yiqing.yao at amd.com>
>> ---
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c |  2 ++
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 23 ++++++++++++++++------
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c  |  7 ++++++-
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_job.c    |  4 ----
>>   4 files changed, 25 insertions(+), 11 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
>> index 513c57f839d8..447bd92c4856 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
>> @@ -684,6 +684,8 @@ int amdgpu_amdkfd_submit_ib(struct amdgpu_device 
>> *adev,
>>           goto err_ib_sched;
>>       }
>>   +    /* Drop the initial kref_init count (see drm_sched_main as 
>> example) */
>> +    dma_fence_put(f);
>>       ret = dma_fence_wait(f, false);
>>     err_ib_sched:
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> index c99541685804..f9718119834f 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> @@ -5009,16 +5009,28 @@ static void amdgpu_device_recheck_guilty_jobs(
>>             /* clear job's guilty and depend the folowing step to 
>> decide the real one */
>>           drm_sched_reset_karma(s_job);
>> -        /* for the real bad job, it will be resubmitted twice, 
>> adding a dma_fence_get
>> -         * to make sure fence is balanced */
>> -        dma_fence_get(s_job->s_fence->parent);
>>           drm_sched_resubmit_jobs_ext(&ring->sched, 1);
>>   +        if (!s_job->s_fence->parent) {
>> +            DRM_WARN("Failed to get a HW fence for job!");
>> +            continue;
>> +        }
>> +
>>           ret = dma_fence_wait_timeout(s_job->s_fence->parent, false, 
>> ring->sched.timeout);
>>           if (ret == 0) { /* timeout */
>>               DRM_ERROR("Found the real bad job! ring:%s, 
>> job_id:%llx\n",
>>                           ring->sched.name, s_job->id);
>>   +
>> +            /* Clear this failed job from fence array */
>> +            amdgpu_fence_driver_clear_job_fences(ring);
>> +
>> +            /* Since the job won't signal and we go for
>> +             * another resubmit drop this parent pointer
>> +             */
>> +            dma_fence_put(s_job->s_fence->parent);
>> +            s_job->s_fence->parent = NULL;
>> +
>>               /* set guilty */
>>               drm_sched_increase_karma(s_job);
>>   retry:
>> @@ -5047,7 +5059,6 @@ static void amdgpu_device_recheck_guilty_jobs(
>>             /* got the hw fence, signal finished fence */
>>           atomic_dec(ring->sched.score);
>> -        dma_fence_put(s_job->s_fence->parent);
>>           dma_fence_get(&s_job->s_fence->finished);
>>           dma_fence_signal(&s_job->s_fence->finished);
>>           dma_fence_put(&s_job->s_fence->finished);
>> @@ -5220,8 +5231,8 @@ int amdgpu_device_gpu_recover(struct 
>> amdgpu_device *adev,
>>        *
>>        * job->base holds a reference to parent fence
>>        */
>> -    if (job && job->base.s_fence->parent &&
>> -        dma_fence_is_signaled(job->base.s_fence->parent)) {
>> +    if (job && (job->hw_fence.ops != NULL) &&
>> +        dma_fence_is_signaled(&job->hw_fence)) {
>>           job_signaled = true;
>>           dev_info(adev->dev, "Guilty job already signaled, skipping 
>> HW reset");
>>           goto skip_hw_reset;
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
>> index d6d54ba4c185..9bd4e18212fc 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
>> @@ -164,11 +164,16 @@ int amdgpu_fence_emit(struct amdgpu_ring *ring, 
>> struct dma_fence **f, struct amd
>>       if (job && job->job_run_counter) {
>>           /* reinit seq for resubmitted jobs */
>>           fence->seqno = seq;
>> +        /* TO be inline with external fence creation and other 
>> drivers */
>> +        dma_fence_get(fence);
>>       } else {
>> -        if (job)
>> +        if (job) {
>>               dma_fence_init(fence, &amdgpu_job_fence_ops,
>>                          &ring->fence_drv.lock,
>>                          adev->fence_context + ring->idx, seq);
>> +            /* Against remove in amdgpu_job_{free, free_cb} */
>> +            dma_fence_get(fence);
>> +        }
>>           else
>>               dma_fence_init(fence, &amdgpu_fence_ops,
>>                          &ring->fence_drv.lock,
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>> index 58568fdde2d0..638e1d600258 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>> @@ -267,10 +267,6 @@ static struct dma_fence *amdgpu_job_run(struct 
>> drm_sched_job *sched_job)
>>               DRM_ERROR("Error scheduling IBs (%d)\n", r);
>>       }
>>   -    if (!job->job_run_counter)
>> -        dma_fence_get(fence);
>> -    else if (finished->error < 0)
>> -        dma_fence_put(&job->hw_fence);
>>       job->job_run_counter++;
>>       amdgpu_job_free_resources(job);
>