[PATCH] drm/amdkfd: fallback to pipe reset on queue reset fail for gfx9

Alex Deucher alexdeucher at gmail.com
Fri Aug 9 15:54:37 UTC 2024


On Fri, Aug 2, 2024 at 12:38 PM Jonathan Kim <Jonathan.Kim at amd.com> wrote:
>
> If queue reset fails, tell the CP to reset the pipe.
> Since queues multiplex context per pipe and we've issued a device-wide
> preemption prior to the hang, we can assume the hung pipe only has one
> queue to reset on pipe reset.

Is there a specific CP or PSP firmware version required for this?  If
so, and attempting the pipe reset on older firmware would cause a
problem, we should check the firmware version before attempting it.

Other than that:
Reviewed-by: Alex Deucher <alexander.deucher at amd.com>


>
> Signed-off-by: Jonathan Kim <jonathan.kim at amd.com>
> ---
>  .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c | 46 +++++++++++++------
>  1 file changed, 31 insertions(+), 15 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
> index 32f28c12077b..c63528a4e894 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
> @@ -1173,12 +1173,30 @@ uint64_t kgd_gfx_v9_hqd_get_pq_addr(struct amdgpu_device *adev,
>         return queue_addr;
>  }
>
> +/* assume queue acquired  */
> +static int kgd_gfx_v9_hqd_dequeue_wait(struct amdgpu_device *adev, uint32_t inst,
> +                                      unsigned int utimeout)
> +{
> +       unsigned long end_jiffies = (utimeout * HZ / 1000) + jiffies;
> +
> +       while (true) {
> +               uint32_t temp = RREG32_SOC15(GC, GET_INST(GC, inst), mmCP_HQD_ACTIVE);
> +
> +               if (!(temp & CP_HQD_ACTIVE__ACTIVE_MASK))
> +                       return 0;
> +
> +               if (time_after(jiffies, end_jiffies))
> +                       return -ETIME;
> +
> +               usleep_range(500, 1000);
> +       }
> +}
> +
>  uint64_t kgd_gfx_v9_hqd_reset(struct amdgpu_device *adev,
>                               uint32_t pipe_id, uint32_t queue_id,
>                               uint32_t inst, unsigned int utimeout)
>  {
> -       uint32_t low, high, temp;
> -       unsigned long end_jiffies;
> +       uint32_t low, high, pipe_reset_data = 0;
>         uint64_t queue_addr = 0;
>
>         kgd_gfx_v9_acquire_queue(adev, pipe_id, queue_id, inst);
> @@ -1202,25 +1220,23 @@ uint64_t kgd_gfx_v9_hqd_reset(struct amdgpu_device *adev,
>         /* assume previous dequeue request issued will take affect after reset */
>         WREG32_SOC15(GC, GET_INST(GC, inst), mmSPI_COMPUTE_QUEUE_RESET, 0x1);
>
> -       end_jiffies = (utimeout * HZ / 1000) + jiffies;
> -       while (true) {
> -               temp = RREG32_SOC15(GC, GET_INST(GC, inst), mmCP_HQD_ACTIVE);
> +       if (!kgd_gfx_v9_hqd_dequeue_wait(adev, inst, utimeout))
> +               goto unlock_out;
>
> -               if (!(temp & CP_HQD_ACTIVE__ACTIVE_MASK))
> -                       break;
> +       pr_debug("Attempting pipe reset on XCC %i pipe id %i\n", inst, pipe_id);
>
> -               if (time_after(jiffies, end_jiffies)) {
> -                       queue_addr = 0;
> -                       break;
> -               }
> +       pipe_reset_data = REG_SET_FIELD(pipe_reset_data, CP_MEC_CNTL, MEC_ME1_PIPE0_RESET, 1);
> +       pipe_reset_data = pipe_reset_data << pipe_id;
>
> -               usleep_range(500, 1000);
> -       }
> +       WREG32_SOC15(GC, GET_INST(GC, inst), mmCP_MEC_CNTL, pipe_reset_data);
> +       WREG32_SOC15(GC, GET_INST(GC, inst), mmCP_MEC_CNTL, 0);
>
> -       pr_debug("queue reset on XCC %i pipe id %i queue id %i %s\n",
> -                inst, pipe_id, queue_id, !!queue_addr ? "succeeded!" : "failed!");
> +       if (kgd_gfx_v9_hqd_dequeue_wait(adev, inst, utimeout))
> +               queue_addr = 0;
>
>  unlock_out:
> +       pr_debug("queue reset on XCC %i pipe id %i queue id %i %s\n",
> +                inst, pipe_id, queue_id, !!queue_addr ? "succeeded!" : "failed!");
>         amdgpu_gfx_rlc_exit_safe_mode(adev, inst);
>         kgd_gfx_v9_release_queue(adev, inst);
>
> --
> 2.34.1
>


More information about the amd-gfx mailing list