[PATCH v3] drm/amdgpu/gfx9.4.3: Implement compute pipe reset

Liang, Prike Prike.Liang at amd.com
Wed Aug 28 03:18:10 UTC 2024


> From: Lazar, Lijo <Lijo.Lazar at amd.com>
> Sent: Tuesday, August 27, 2024 4:02 PM
> To: Liang, Prike <Prike.Liang at amd.com>; amd-gfx at lists.freedesktop.org
> Cc: Deucher, Alexander <Alexander.Deucher at amd.com>; Ma, Le
> <Le.Ma at amd.com>
> Subject: Re: [PATCH v3] drm/amdgpu/gfx9.4.3: Implement compute pipe reset
>
>
>
> On 8/22/2024 3:08 PM, Prike Liang wrote:
> > Implement the compute pipe reset and driver will fallback to pipe
> > reset when queue reset failed.
> >
> > Signed-off-by: Prike Liang <Prike.Liang at amd.com>
> > ---
> > v3: Use the dev log and filter out the gfx9.4.4 pipe reset support.
> > v2: Convert the GC logic instance to physical instance in the
> >     register accessing process and use the dev_* print to specify
> >     the failed device.
> > ---
> >  drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h |   5 +
> >  drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c | 154 +++++++++++++++++++++---
> >  2 files changed, 139 insertions(+), 20 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> > b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> > index e28c1ebfa98f..d4d74ba2bc27 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> > @@ -143,6 +143,11 @@ struct kiq_pm4_funcs {
> >                                uint32_t queue_type, uint32_t me_id,
> >                                uint32_t pipe_id, uint32_t queue_id,
> >                                uint32_t xcc_id, uint32_t vmid);
> > +   int (*kiq_reset_hw_pipe)(struct amdgpu_ring *kiq_ring,
> > +                              uint32_t queue_type, uint32_t me,
> > +                              uint32_t pipe, uint32_t queue,
> > +                              uint32_t xcc_id);
> > +
> >     /* Packet sizes */
> >     int set_resources_size;
> >     int map_queues_size;
> > diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
> > b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
> > index 2067f26d3a9d..aa0c76eed452 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
> > @@ -166,6 +166,10 @@ static int gfx_v9_4_3_get_cu_info(struct amdgpu_device *adev,
> >                             struct amdgpu_cu_info *cu_info);
> >  static void gfx_v9_4_3_xcc_set_safe_mode(struct amdgpu_device *adev, int xcc_id);
> >  static void gfx_v9_4_3_xcc_unset_safe_mode(struct amdgpu_device *adev, int xcc_id);
> > +static int gfx_v9_4_3_kiq_reset_hw_pipe(struct amdgpu_ring *kiq_ring,
> > +                                   uint32_t queue_type, uint32_t me,
> > +                                   uint32_t pipe, uint32_t queue,
> > +                                   uint32_t xcc_id);
> >
> >  static void gfx_v9_4_3_kiq_set_resources(struct amdgpu_ring *kiq_ring,
> >                             uint64_t queue_mask)
> > @@ -323,6 +327,7 @@ static const struct kiq_pm4_funcs gfx_v9_4_3_kiq_pm4_funcs = {
> >     .kiq_query_status = gfx_v9_4_3_kiq_query_status,
> >     .kiq_invalidate_tlbs = gfx_v9_4_3_kiq_invalidate_tlbs,
> >     .kiq_reset_hw_queue = gfx_v9_4_3_kiq_reset_hw_queue,
> > +   .kiq_reset_hw_pipe = gfx_v9_4_3_kiq_reset_hw_pipe,
> >     .set_resources_size = 8,
> >     .map_queues_size = 7,
> >     .unmap_queues_size = 6,
> > @@ -3466,6 +3471,116 @@ static void gfx_v9_4_3_emit_wave_limit(struct amdgpu_ring *ring, bool enable)
> >     }
> >  }
> >
> > +static int gfx_v9_4_3_unmap_done(struct amdgpu_device *adev, uint32_t me,
> > +                           uint32_t pipe, uint32_t queue,
> > +                           uint32_t xcc_id)
> > +{
> > +   int i, r;
> > +   /* make sure dequeue is complete*/
> > +   gfx_v9_4_3_xcc_set_safe_mode(adev, xcc_id);
> > +   mutex_lock(&adev->srbm_mutex);
> > +   soc15_grbm_select(adev, me, pipe, queue, 0, GET_INST(GC, xcc_id));
> > +   for (i = 0; i < adev->usec_timeout; i++) {
> > +           if (!(RREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_HQD_ACTIVE) & 1))
> > +                   break;
> > +           udelay(1);
> > +   }
> > +   if (i >= adev->usec_timeout)
> > +           r = -ETIMEDOUT;
> > +   else
> > +           r = 0;
> > +   soc15_grbm_select(adev, 0, 0, 0, 0, GET_INST(GC, xcc_id));
> > +   mutex_unlock(&adev->srbm_mutex);
> > +   gfx_v9_4_3_xcc_unset_safe_mode(adev, xcc_id);
> > +
> > +   return r;
> > +
> > +}
> > +
> > +static bool gfx_v9_4_3_pipe_reset_support(struct amdgpu_device *adev)
> > +{
> > +   /*TODO: Need check gfx9.4.4 mec fw whether supports pipe reset as well.*/
> > +   if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) &&
> > +                   adev->gfx.mec_fw_version >= 0x0000009b)
> > +           return true;
> > +   else
> > +           dev_warn_once(adev->dev, "Please use the latest MEC version to see whether support pipe reset\n");
> > +
> > +   return false;
> > +}
> > +
> > +static int gfx_v9_4_3_kiq_reset_hw_pipe(struct amdgpu_ring *kiq_ring,
> > +                                   uint32_t queue_type, uint32_t me,
> > +                                   uint32_t pipe, uint32_t queue,
> > +                                   uint32_t xcc_id)
> > +{
> > +   struct amdgpu_device *adev = kiq_ring->adev;
> > +   uint32_t reset_pipe, clean_pipe;
> > +   int r;
> > +
> > +   if (!gfx_v9_4_3_pipe_reset_support(adev))
> > +           return -EINVAL;
> > +
> > +   gfx_v9_4_3_xcc_set_safe_mode(adev, xcc_id);
> > +   mutex_lock(&adev->srbm_mutex);
> > +   soc15_grbm_select(adev, me, pipe, queue, 0, GET_INST(GC, xcc_id));
>
> MEC_CNTL is not a per-queue register. So not sure if selecting this makes
> sense here. The mutex may be taken so that resets for other queues in the
> same XCC don't run in parallel.
>
The CP HW team has confirmed that no GRBM select is needed when programming the
pipe reset field, so the grbm select will be removed here.
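For reference, here is a rough, untested sketch of the simplified sequence,
assuming the srbm_mutex alone is enough to serialize concurrent pipe resets on
the same XCC:

    /* CP_MEC_CNTL is per-ME rather than per-queue, so no grbm select is
     * needed; srbm_mutex only keeps two pipe resets on the same XCC from
     * racing with each other.
     */
    gfx_v9_4_3_xcc_set_safe_mode(adev, xcc_id);
    mutex_lock(&adev->srbm_mutex);

    reset_pipe = RREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_MEC_CNTL);
    clean_pipe = reset_pipe;
    /* ... set the MEC_MEx_PIPEy_RESET field in reset_pipe as in the switch below ... */

    WREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_MEC_CNTL, reset_pipe);
    WREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_MEC_CNTL, clean_pipe);

    mutex_unlock(&adev->srbm_mutex);
    gfx_v9_4_3_xcc_unset_safe_mode(adev, xcc_id);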

> > +
> > +   reset_pipe = RREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_MEC_CNTL);
> > +   clean_pipe = reset_pipe;
>
> I think the saved value could be written back (skipping the SET_FIELD steps
> below) as the only change that's done here is to set the PIPEx_RESET field.
>
Yes, saving the original register value in clean_pipe and writing it back should be
fine for clearing the pipe reset field, since the mutex ensures only one pipe reset
is set at a time. That would simplify the code, though it is a bit less explicit
about how the reset field is set and then cleared.
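A minimal sketch of that simplification, assuming the PIPEx_RESET bits read back
as 0 before the reset is requested so the saved value can simply be written back:

    uint32_t saved, reset_val;

    mutex_lock(&adev->srbm_mutex);
    /* the value read here already has the reset bits clear, so writing it
     * back clears the reset field without a second set of REG_SET_FIELD calls
     */
    saved = RREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_MEC_CNTL);
    /* pick the MEC_MEx_PIPEy_RESET field from me/pipe as before */
    reset_val = REG_SET_FIELD(saved, CP_MEC_CNTL, MEC_ME1_PIPE0_RESET, 1);

    WREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_MEC_CNTL, reset_val);
    WREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_MEC_CNTL, saved);
    mutex_unlock(&adev->srbm_mutex);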

Thanks,
Prike
> Apart from those, looks fine.
>
> Thanks,
> Lijo
>
> > +
> > +   if (me == 1) {
> > +           switch (pipe) {
> > +           case 0:
> > +                   reset_pipe = REG_SET_FIELD(reset_pipe, CP_MEC_CNTL,
> > +                                              MEC_ME1_PIPE0_RESET, 1);
> > +                   clean_pipe = REG_SET_FIELD(clean_pipe, CP_MEC_CNTL,
> > +                                              MEC_ME1_PIPE0_RESET, 0);
> > +                   break;
> > +           case 1:
> > +                   reset_pipe = REG_SET_FIELD(reset_pipe, CP_MEC_CNTL,
> > +                                              MEC_ME1_PIPE1_RESET, 1);
> > +                   clean_pipe = REG_SET_FIELD(clean_pipe, CP_MEC_CNTL,
> > +                                              MEC_ME1_PIPE1_RESET, 0);
> > +                   break;
> > +           case 2:
> > +                   reset_pipe = REG_SET_FIELD(reset_pipe, CP_MEC_CNTL,
> > +                                              MEC_ME1_PIPE2_RESET, 1);
> > +                   clean_pipe = REG_SET_FIELD(clean_pipe, CP_MEC_CNTL,
> > +                                              MEC_ME1_PIPE2_RESET, 0);
> > +                   break;
> > +           case 3:
> > +                   reset_pipe = REG_SET_FIELD(reset_pipe, CP_MEC_CNTL,
> > +                                              MEC_ME1_PIPE3_RESET, 1);
> > +                   clean_pipe = REG_SET_FIELD(clean_pipe, CP_MEC_CNTL,
> > +                                              MEC_ME1_PIPE3_RESET, 0);
> > +                   break;
> > +           default:
> > +                   break;
> > +           }
> > +   } else {
> > +           if (pipe) {
> > +                   reset_pipe = REG_SET_FIELD(reset_pipe, CP_MEC_CNTL,
> > +                                              MEC_ME2_PIPE1_RESET, 1);
> > +                   clean_pipe = REG_SET_FIELD(clean_pipe, CP_MEC_CNTL,
> > +                                              MEC_ME2_PIPE1_RESET, 0);
> > +           } else {
> > +                   reset_pipe = REG_SET_FIELD(reset_pipe, CP_MEC_CNTL,
> > +                                              MEC_ME2_PIPE0_RESET, 1);
> > +                   clean_pipe = REG_SET_FIELD(clean_pipe, CP_MEC_CNTL,
> > +                                              MEC_ME2_PIPE0_RESET, 0);
> > +           }
> > +   }
> > +
> > +   WREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_MEC_CNTL, reset_pipe);
> > +   WREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_MEC_CNTL, clean_pipe);
> > +   soc15_grbm_select(adev, 0, 0, 0, 0, GET_INST(GC, xcc_id));
> > +   mutex_unlock(&adev->srbm_mutex);
> > +   gfx_v9_4_3_xcc_unset_safe_mode(adev, xcc_id);
> > +
> > +   r = gfx_v9_4_3_unmap_done(adev, me, pipe, queue, xcc_id);
> > +   return r;
> > +}
> > +
> >  static int gfx_v9_4_3_reset_kcq(struct amdgpu_ring *ring,
> >                             unsigned int vmid)
> >  {
> > @@ -3473,7 +3588,7 @@ static int gfx_v9_4_3_reset_kcq(struct amdgpu_ring *ring,
> >     struct amdgpu_kiq *kiq = &adev->gfx.kiq[ring->xcc_id];
> >     struct amdgpu_ring *kiq_ring = &kiq->ring;
> >     unsigned long flags;
> > -   int r, i;
> > +   int r;
> >
> >     if (amdgpu_sriov_vf(adev))
> >             return -EINVAL;
> > @@ -3495,26 +3610,25 @@ static int gfx_v9_4_3_reset_kcq(struct amdgpu_ring *ring,
> >     spin_unlock_irqrestore(&kiq->ring_lock, flags);
> >
> >     r = amdgpu_ring_test_ring(kiq_ring);
> > -   if (r)
> > -           return r;
> > -
> > -   /* make sure dequeue is complete*/
> > -   amdgpu_gfx_rlc_enter_safe_mode(adev, ring->xcc_id);
> > -   mutex_lock(&adev->srbm_mutex);
> > -   soc15_grbm_select(adev, ring->me, ring->pipe, ring->queue, 0, GET_INST(GC, ring->xcc_id));
> > -   for (i = 0; i < adev->usec_timeout; i++) {
> > -           if (!(RREG32_SOC15(GC, 0, regCP_HQD_ACTIVE) & 1))
> > -                   break;
> > -           udelay(1);
> > -   }
> > -   if (i >= adev->usec_timeout)
> > -           r = -ETIMEDOUT;
> > -   soc15_grbm_select(adev, 0, 0, 0, 0, GET_INST(GC, ring->xcc_id));
> > -   mutex_unlock(&adev->srbm_mutex);
> > -   amdgpu_gfx_rlc_exit_safe_mode(adev, ring->xcc_id);
> >     if (r) {
> > -           dev_err(adev->dev, "fail to wait on hqd deactive\n");
> > -           return r;
> > +           dev_err(adev->dev, "kiq ring test failed after ring: %s queue reset\n",
> > +                           ring->name);
> > +           goto pipe_reset;
> > +   }
> > +
> > +   r = gfx_v9_4_3_unmap_done(adev, ring->me, ring->pipe, ring->queue, ring->xcc_id);
> > +   if (r)
> > +           dev_err(adev->dev, "fail to wait on hqd deactive and will try pipe reset\n");
> > +
> > +pipe_reset:
> > +   if(r) {
> > +           r = gfx_v9_4_3_kiq_reset_hw_pipe(kiq_ring, ring->funcs->type,
> > +                                           ring->me, ring->pipe,
> > +                                           ring->queue, ring->xcc_id);
> > +           dev_info(adev->dev, "ring: %s pipe reset :%s\n", ring->name,
> > +                           r ? "failed" : "successfully");
> > +           if (r)
> > +                   return r;
> >     }
> >
> >     r = amdgpu_bo_reserve(ring->mqd_obj, false);


More information about the amd-gfx mailing list