[PATCH 1/2] drm/amdgpu: Fix unmap queue logic

Tue Nov 5 06:22:56 UTC 2024

On 11/5/2024 11:45 AM, Ma, Le wrote:
> [AMD Official Use Only - AMD Internal Distribution Only]
> 
>> -----Original Message-----
>> From: amd-gfx <amd-gfx-bounces at lists.freedesktop.org> On Behalf Of Lijo Lazar
>> Sent: Tuesday, November 5, 2024 1:39 PM
>> To: amd-gfx at lists.freedesktop.org
>> Cc: Zhang, Hawking <Hawking.Zhang at amd.com>; Deucher, Alexander
>> <Alexander.Deucher at amd.com>; Koenig, Christian <Christian.Koenig at amd.com>
>> Subject: [PATCH 1/2] drm/amdgpu: Fix unmap queue logic
>>
>> In current logic, it calls ring_alloc followed by a ring_test. ring_test in turn will call
>> another ring_alloc. This is illegal usage as a ring_alloc is expected to be closed
>> properly with a ring_commit. Change to commit the unmap queue packet first
>> followed by a ring_test. Add a comment about the usage of ring_test.
> 
> Regarding the description " This is illegal usage as a ring_alloc is expected to be closed properly with a ring_commit ", does this only apply to unmap queue logic ?
> The current logic to do map queue is also to commit once in ring_test but ring_alloc twice.
> 

Should be applicable for this case also - ring_alloc calls begin_use and
ring_commit marks end_use. It could be working fine because those are
not implemented for these rings currently.

But using ring_test without a commit doesn't appear to be the correct
usage of API.

Will fix map calls in another patch.

Thanks,
Lijo
>>
>> Also, reorder the current pre-condition checks of job hang or kiq ring scheduler not
>> ready. Without them being met, it is not useful to attempt ring or memory allocations.
>>
>> Fixes tag refers to the original patch which introduced this issue which then got
>> carried over into newer code.
>>
>> Signed-off-by: Lijo Lazar <lijo.lazar at amd.com>
>>
>> Fixes: 6c10b5cc4eaa ("drm/amdgpu: Remove duplicate code in gfx_v8_0.c")
>> ---
>>  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c | 13 +++++-
>>  drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c    | 47 ++++++++++++++--------
>>  drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c      |  7 ++++
>>  3 files changed, 49 insertions(+), 18 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
>> index c218e8f117e4..2a40150ae10f 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
>> @@ -844,6 +844,9 @@ int amdgpu_amdkfd_unmap_hiq(struct amdgpu_device
>> *adev, u32 doorbell_off,
>>       if (!kiq->pmf || !kiq->pmf->kiq_unmap_queues)
>>               return -EINVAL;
>>
>> +     if (!kiq_ring->sched.ready || adev->job_hang)
>> +             return 0;
>> +
>>       ring_funcs = kzalloc(sizeof(*ring_funcs), GFP_KERNEL);
>>       if (!ring_funcs)
>>               return -ENOMEM;
>> @@ -868,8 +871,14 @@ int amdgpu_amdkfd_unmap_hiq(struct amdgpu_device
>> *adev, u32 doorbell_off,
>>
>>       kiq->pmf->kiq_unmap_queues(kiq_ring, ring, RESET_QUEUES, 0, 0);
>>
>> -     if (kiq_ring->sched.ready && !adev->job_hang)
>> -             r = amdgpu_ring_test_helper(kiq_ring);
>> +     /* Submit unmap queue packet */
>> +     amdgpu_ring_commit(kiq_ring);
>> +     /*
>> +      * Ring test will do a basic scratch register change check. Just run
>> +      * this to ensure that unmap queues that is submitted before got
>> +      * processed successfully before returning.
>> +      */
>> +     r = amdgpu_ring_test_helper(kiq_ring);
>>
>>       spin_unlock(&kiq->ring_lock);
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
>> index dabc01cf1fbb..6733ff5d9628 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
>> @@ -515,6 +515,17 @@ int amdgpu_gfx_disable_kcq(struct amdgpu_device *adev,
>> int xcc_id)
>>       if (!kiq->pmf || !kiq->pmf->kiq_unmap_queues)
>>               return -EINVAL;
>>
>> +     if (!kiq_ring->sched.ready || adev->job_hang)
>> +             return 0;
>> +     /**
>> +      * This is workaround: only skip kiq_ring test
>> +      * during ras recovery in suspend stage for gfx9.4.3
>> +      */
>> +     if ((amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) ||
>> +          amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4)) &&
>> +         amdgpu_ras_in_recovery(adev))
>> +             return 0;
>> +
>>       spin_lock(&kiq->ring_lock);
>>       if (amdgpu_ring_alloc(kiq_ring, kiq->pmf->unmap_queues_size *
>>                                       adev->gfx.num_compute_rings)) {
>> @@ -528,20 +539,15 @@ int amdgpu_gfx_disable_kcq(struct amdgpu_device
>> *adev, int xcc_id)
>>                                          &adev->gfx.compute_ring[j],
>>                                          RESET_QUEUES, 0, 0);
>>       }
>> -
>> -     /**
>> -      * This is workaround: only skip kiq_ring test
>> -      * during ras recovery in suspend stage for gfx9.4.3
>> +     /* Submit unmap queue packet */
>> +     amdgpu_ring_commit(kiq_ring);
>> +     /*
>> +      * Ring test will do a basic scratch register change check. Just run
>> +      * this to ensure that unmap queues that is submitted before got
>> +      * processed successfully before returning.
>>        */
>> -     if ((amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) ||
>> -         amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4)) &&
>> -         amdgpu_ras_in_recovery(adev)) {
>> -             spin_unlock(&kiq->ring_lock);
>> -             return 0;
>> -     }
>> +     r = amdgpu_ring_test_helper(kiq_ring);
>>
>> -     if (kiq_ring->sched.ready && !adev->job_hang)
>> -             r = amdgpu_ring_test_helper(kiq_ring);
>>       spin_unlock(&kiq->ring_lock);
>>
>>       return r;
>> @@ -569,8 +575,11 @@ int amdgpu_gfx_disable_kgq(struct amdgpu_device *adev,
>> int xcc_id)
>>       if (!kiq->pmf || !kiq->pmf->kiq_unmap_queues)
>>               return -EINVAL;
>>
>> -     spin_lock(&kiq->ring_lock);
>> +     if (!adev->gfx.kiq[0].ring.sched.ready || adev->job_hang)
>> +             return 0;
>> +
>>       if (amdgpu_gfx_is_master_xcc(adev, xcc_id)) {
>> +             spin_lock(&kiq->ring_lock);
>>               if (amdgpu_ring_alloc(kiq_ring, kiq->pmf->unmap_queues_size *
>>                                               adev->gfx.num_gfx_rings)) {
>>                       spin_unlock(&kiq->ring_lock);
>> @@ -583,11 +592,17 @@ int amdgpu_gfx_disable_kgq(struct amdgpu_device
>> *adev, int xcc_id)
>>                                                  &adev->gfx.gfx_ring[j],
>>                                                  PREEMPT_QUEUES, 0, 0);
>>               }
>> -     }
>> +             /* Submit unmap queue packet */
>> +             amdgpu_ring_commit(kiq_ring);
>>
>> -     if (adev->gfx.kiq[0].ring.sched.ready && !adev->job_hang)
>> +             /*
>> +              * Ring test will do a basic scratch register change check.
>> +              * Just run this to ensure that unmap queues that is submitted
>> +              * before got processed successfully before returning.
>> +              */
>>               r = amdgpu_ring_test_helper(kiq_ring);
>> -     spin_unlock(&kiq->ring_lock);
>> +             spin_unlock(&kiq->ring_lock);
>> +     }
>>
>>       return r;
>>  }
>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>> b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>> index f85e545653c7..553a6113fa67 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>> @@ -4823,6 +4823,13 @@ static int gfx_v8_0_kcq_disable(struct amdgpu_device
>> *adev)
>>               amdgpu_ring_write(kiq_ring, 0);
>>               amdgpu_ring_write(kiq_ring, 0);
>>       }
>> +     /* Submit unmap queue packet */
>> +     amdgpu_ring_commit(kiq_ring);
>> +     /*
>> +      * Ring test will do a basic scratch register change check. Just run
>> +      * this to ensure that unmap queues that is submitted before got
>> +      * processed successfully before returning.
>> +      */
>>       r = amdgpu_ring_test_helper(kiq_ring);
>>       if (r)
>>               DRM_ERROR("KCQ disable failed\n");
>> --
>> 2.25.1
>