[PATCH] drm/amdkfd: fallback to pipe reset on queue reset fail for gfx9
Alex Deucher
alexdeucher at gmail.com
Fri Aug 9 15:54:37 UTC 2024
On Fri, Aug 2, 2024 at 12:38 PM Jonathan Kim <Jonathan.Kim at amd.com> wrote:
>
> If queue reset fails, tell the CP to reset the pipe.
> Since queues multiplex context per pipe and we've issues a device wide
> preemption prior to the hang, we can assume the hung pipe only has one
> queue to reset on pipe reset.
Is there a specific CP or PSP firmware version required for this? If
so, we should check for it before attempting this if it will cause a
problem.
Other than that:
Reviewed-by: Alex Deucher <alexander.deucher at amd.com>
>
> Signed-off-by: Jonathan Kim <jonathan.kim at amd.com>
> ---
> .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c | 46 +++++++++++++------
> 1 file changed, 31 insertions(+), 15 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
> index 32f28c12077b..c63528a4e894 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
> @@ -1173,12 +1173,30 @@ uint64_t kgd_gfx_v9_hqd_get_pq_addr(struct amdgpu_device *adev,
> return queue_addr;
> }
>
> +/* assume queue acquired */
> +static int kgd_gfx_v9_hqd_dequeue_wait(struct amdgpu_device *adev, uint32_t inst,
> + unsigned int utimeout)
> +{
> + unsigned long end_jiffies = (utimeout * HZ / 1000) + jiffies;
> +
> + while (true) {
> + uint32_t temp = RREG32_SOC15(GC, GET_INST(GC, inst), mmCP_HQD_ACTIVE);
> +
> + if (!(temp & CP_HQD_ACTIVE__ACTIVE_MASK))
> + return 0;
> +
> + if (time_after(jiffies, end_jiffies))
> + return -ETIME;
> +
> + usleep_range(500, 1000);
> + }
> +}
> +
> uint64_t kgd_gfx_v9_hqd_reset(struct amdgpu_device *adev,
> uint32_t pipe_id, uint32_t queue_id,
> uint32_t inst, unsigned int utimeout)
> {
> - uint32_t low, high, temp;
> - unsigned long end_jiffies;
> + uint32_t low, high, pipe_reset_data = 0;
> uint64_t queue_addr = 0;
>
> kgd_gfx_v9_acquire_queue(adev, pipe_id, queue_id, inst);
> @@ -1202,25 +1220,23 @@ uint64_t kgd_gfx_v9_hqd_reset(struct amdgpu_device *adev,
> /* assume previous dequeue request issued will take affect after reset */
> WREG32_SOC15(GC, GET_INST(GC, inst), mmSPI_COMPUTE_QUEUE_RESET, 0x1);
>
> - end_jiffies = (utimeout * HZ / 1000) + jiffies;
> - while (true) {
> - temp = RREG32_SOC15(GC, GET_INST(GC, inst), mmCP_HQD_ACTIVE);
> + if (!kgd_gfx_v9_hqd_dequeue_wait(adev, inst, utimeout))
> + goto unlock_out;
>
> - if (!(temp & CP_HQD_ACTIVE__ACTIVE_MASK))
> - break;
> + pr_debug("Attempting pipe reset on XCC %i pipe id %i\n", inst, pipe_id);
>
> - if (time_after(jiffies, end_jiffies)) {
> - queue_addr = 0;
> - break;
> - }
> + pipe_reset_data = REG_SET_FIELD(pipe_reset_data, CP_MEC_CNTL, MEC_ME1_PIPE0_RESET, 1);
> + pipe_reset_data = pipe_reset_data << pipe_id;
>
> - usleep_range(500, 1000);
> - }
> + WREG32_SOC15(GC, GET_INST(GC, inst), mmCP_MEC_CNTL, pipe_reset_data);
> + WREG32_SOC15(GC, GET_INST(GC, inst), mmCP_MEC_CNTL, 0);
>
> - pr_debug("queue reset on XCC %i pipe id %i queue id %i %s\n",
> - inst, pipe_id, queue_id, !!queue_addr ? "succeeded!" : "failed!");
> + if (kgd_gfx_v9_hqd_dequeue_wait(adev, inst, utimeout))
> + queue_addr = 0;
>
> unlock_out:
> + pr_debug("queue reset on XCC %i pipe id %i queue id %i %s\n",
> + inst, pipe_id, queue_id, !!queue_addr ? "succeeded!" : "failed!");
> amdgpu_gfx_rlc_exit_safe_mode(adev, inst);
> kgd_gfx_v9_release_queue(adev, inst);
>
> --
> 2.34.1
>
More information about the amd-gfx
mailing list