[PATCH 1/2] drm/amdgpu: Fix unmap queue logic
Ma, Le
Le.Ma at amd.com
Tue Nov 5 06:15:31 UTC 2024
[AMD Official Use Only - AMD Internal Distribution Only]
> -----Original Message-----
> From: amd-gfx <amd-gfx-bounces at lists.freedesktop.org> On Behalf Of Lijo Lazar
> Sent: Tuesday, November 5, 2024 1:39 PM
> To: amd-gfx at lists.freedesktop.org
> Cc: Zhang, Hawking <Hawking.Zhang at amd.com>; Deucher, Alexander
> <Alexander.Deucher at amd.com>; Koenig, Christian <Christian.Koenig at amd.com>
> Subject: [PATCH 1/2] drm/amdgpu: Fix unmap queue logic
>
> In current logic, it calls ring_alloc followed by a ring_test. ring_test in turn will call
> another ring_alloc. This is illegal usage as a ring_alloc is expected to be closed
> properly with a ring_commit. Change to commit the unmap queue packet first
> followed by a ring_test. Add a comment about the usage of ring_test.
Regarding the description " This is illegal usage as a ring_alloc is expected to be closed properly with a ring_commit ", does this only apply to unmap queue logic ?
The current logic to do map queue is also to commit once in ring_test but ring_alloc twice.
>
> Also, reorder the current pre-condition checks of job hang or kiq ring scheduler not
> ready. Without them being met, it is not useful to attempt ring or memory allocations.
>
> Fixes tag refers to the original patch which introduced this issue which then got
> carried over into newer code.
>
> Signed-off-by: Lijo Lazar <lijo.lazar at amd.com>
>
> Fixes: 6c10b5cc4eaa ("drm/amdgpu: Remove duplicate code in gfx_v8_0.c")
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c | 13 +++++-
> drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c | 47 ++++++++++++++--------
> drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c | 7 ++++
> 3 files changed, 49 insertions(+), 18 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> index c218e8f117e4..2a40150ae10f 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> @@ -844,6 +844,9 @@ int amdgpu_amdkfd_unmap_hiq(struct amdgpu_device
> *adev, u32 doorbell_off,
> if (!kiq->pmf || !kiq->pmf->kiq_unmap_queues)
> return -EINVAL;
>
> + if (!kiq_ring->sched.ready || adev->job_hang)
> + return 0;
> +
> ring_funcs = kzalloc(sizeof(*ring_funcs), GFP_KERNEL);
> if (!ring_funcs)
> return -ENOMEM;
> @@ -868,8 +871,14 @@ int amdgpu_amdkfd_unmap_hiq(struct amdgpu_device
> *adev, u32 doorbell_off,
>
> kiq->pmf->kiq_unmap_queues(kiq_ring, ring, RESET_QUEUES, 0, 0);
>
> - if (kiq_ring->sched.ready && !adev->job_hang)
> - r = amdgpu_ring_test_helper(kiq_ring);
> + /* Submit unmap queue packet */
> + amdgpu_ring_commit(kiq_ring);
> + /*
> + * Ring test will do a basic scratch register change check. Just run
> + * this to ensure that unmap queues that is submitted before got
> + * processed successfully before returning.
> + */
> + r = amdgpu_ring_test_helper(kiq_ring);
>
> spin_unlock(&kiq->ring_lock);
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> index dabc01cf1fbb..6733ff5d9628 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> @@ -515,6 +515,17 @@ int amdgpu_gfx_disable_kcq(struct amdgpu_device *adev,
> int xcc_id)
> if (!kiq->pmf || !kiq->pmf->kiq_unmap_queues)
> return -EINVAL;
>
> + if (!kiq_ring->sched.ready || adev->job_hang)
> + return 0;
> + /**
> + * This is workaround: only skip kiq_ring test
> + * during ras recovery in suspend stage for gfx9.4.3
> + */
> + if ((amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) ||
> + amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4)) &&
> + amdgpu_ras_in_recovery(adev))
> + return 0;
> +
> spin_lock(&kiq->ring_lock);
> if (amdgpu_ring_alloc(kiq_ring, kiq->pmf->unmap_queues_size *
> adev->gfx.num_compute_rings)) {
> @@ -528,20 +539,15 @@ int amdgpu_gfx_disable_kcq(struct amdgpu_device
> *adev, int xcc_id)
> &adev->gfx.compute_ring[j],
> RESET_QUEUES, 0, 0);
> }
> -
> - /**
> - * This is workaround: only skip kiq_ring test
> - * during ras recovery in suspend stage for gfx9.4.3
> + /* Submit unmap queue packet */
> + amdgpu_ring_commit(kiq_ring);
> + /*
> + * Ring test will do a basic scratch register change check. Just run
> + * this to ensure that unmap queues that is submitted before got
> + * processed successfully before returning.
> */
> - if ((amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) ||
> - amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4)) &&
> - amdgpu_ras_in_recovery(adev)) {
> - spin_unlock(&kiq->ring_lock);
> - return 0;
> - }
> + r = amdgpu_ring_test_helper(kiq_ring);
>
> - if (kiq_ring->sched.ready && !adev->job_hang)
> - r = amdgpu_ring_test_helper(kiq_ring);
> spin_unlock(&kiq->ring_lock);
>
> return r;
> @@ -569,8 +575,11 @@ int amdgpu_gfx_disable_kgq(struct amdgpu_device *adev,
> int xcc_id)
> if (!kiq->pmf || !kiq->pmf->kiq_unmap_queues)
> return -EINVAL;
>
> - spin_lock(&kiq->ring_lock);
> + if (!adev->gfx.kiq[0].ring.sched.ready || adev->job_hang)
> + return 0;
> +
> if (amdgpu_gfx_is_master_xcc(adev, xcc_id)) {
> + spin_lock(&kiq->ring_lock);
> if (amdgpu_ring_alloc(kiq_ring, kiq->pmf->unmap_queues_size *
> adev->gfx.num_gfx_rings)) {
> spin_unlock(&kiq->ring_lock);
> @@ -583,11 +592,17 @@ int amdgpu_gfx_disable_kgq(struct amdgpu_device
> *adev, int xcc_id)
> &adev->gfx.gfx_ring[j],
> PREEMPT_QUEUES, 0, 0);
> }
> - }
> + /* Submit unmap queue packet */
> + amdgpu_ring_commit(kiq_ring);
>
> - if (adev->gfx.kiq[0].ring.sched.ready && !adev->job_hang)
> + /*
> + * Ring test will do a basic scratch register change check.
> + * Just run this to ensure that unmap queues that is submitted
> + * before got processed successfully before returning.
> + */
> r = amdgpu_ring_test_helper(kiq_ring);
> - spin_unlock(&kiq->ring_lock);
> + spin_unlock(&kiq->ring_lock);
> + }
>
> return r;
> }
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> index f85e545653c7..553a6113fa67 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> @@ -4823,6 +4823,13 @@ static int gfx_v8_0_kcq_disable(struct amdgpu_device
> *adev)
> amdgpu_ring_write(kiq_ring, 0);
> amdgpu_ring_write(kiq_ring, 0);
> }
> + /* Submit unmap queue packet */
> + amdgpu_ring_commit(kiq_ring);
> + /*
> + * Ring test will do a basic scratch register change check. Just run
> + * this to ensure that unmap queues that is submitted before got
> + * processed successfully before returning.
> + */
> r = amdgpu_ring_test_helper(kiq_ring);
> if (r)
> DRM_ERROR("KCQ disable failed\n");
> --
> 2.25.1
More information about the amd-gfx
mailing list