[PATCH v2] drm/amdgpu/gfx9.4.3: Implement compute pipe reset
Alex Deucher
alexdeucher at gmail.com
Tue Aug 20 13:50:29 UTC 2024
On Tue, Aug 20, 2024 at 8:43 AM Lazar, Lijo <lijo.lazar at amd.com> wrote:
>
>
>
> On 8/20/2024 4:01 PM, Prike Liang wrote:
> > Implement the compute pipe reset and driver will
> > fallback to pipe reset when queue reset failed.
> >
> > Signed-off-by: Prike Liang <Prike.Liang at amd.com>
> > ---
> > v2: Convert the GC logic instance to physical instance in the
> > register accessing process and
>
> > use the dev_* print to specify the failed device.
>
> This is not fully done, marked below.
>
> > ---
> > drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h | 5 +
> > drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c | 153 ++++++++++++++++++++----
> > 2 files changed, 138 insertions(+), 20 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> > index e28c1ebfa98f..d4d74ba2bc27 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> > @@ -143,6 +143,11 @@ struct kiq_pm4_funcs {
> > uint32_t queue_type, uint32_t me_id,
> > uint32_t pipe_id, uint32_t queue_id,
> > uint32_t xcc_id, uint32_t vmid);
> > + int (*kiq_reset_hw_pipe)(struct amdgpu_ring *kiq_ring,
> > + uint32_t queue_type, uint32_t me,
> > + uint32_t pipe, uint32_t queue,
> > + uint32_t xcc_id);
> > +
> > /* Packet sizes */
> > int set_resources_size;
> > int map_queues_size;
> > diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
> > index 2067f26d3a9d..ab9d5adbbfe8 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
> > @@ -166,6 +166,10 @@ static int gfx_v9_4_3_get_cu_info(struct amdgpu_device *adev,
> > struct amdgpu_cu_info *cu_info);
> > static void gfx_v9_4_3_xcc_set_safe_mode(struct amdgpu_device *adev, int xcc_id);
> > static void gfx_v9_4_3_xcc_unset_safe_mode(struct amdgpu_device *adev, int xcc_id);
> > +static int gfx_v9_4_3_kiq_reset_hw_pipe(struct amdgpu_ring *kiq_ring,
> > + uint32_t queue_type, uint32_t me,
> > + uint32_t pipe, uint32_t queue,
> > + uint32_t xcc_id);
> >
> > static void gfx_v9_4_3_kiq_set_resources(struct amdgpu_ring *kiq_ring,
> > uint64_t queue_mask)
> > @@ -323,6 +327,7 @@ static const struct kiq_pm4_funcs gfx_v9_4_3_kiq_pm4_funcs = {
> > .kiq_query_status = gfx_v9_4_3_kiq_query_status,
> > .kiq_invalidate_tlbs = gfx_v9_4_3_kiq_invalidate_tlbs,
> > .kiq_reset_hw_queue = gfx_v9_4_3_kiq_reset_hw_queue,
> > + .kiq_reset_hw_pipe = gfx_v9_4_3_kiq_reset_hw_pipe,
> > .set_resources_size = 8,
> > .map_queues_size = 7,
> > .unmap_queues_size = 6,
> > @@ -3466,6 +3471,115 @@ static void gfx_v9_4_3_emit_wave_limit(struct amdgpu_ring *ring, bool enable)
> > }
> > }
> >
> > +static int gfx_v9_4_3_unmap_done(struct amdgpu_device *adev, uint32_t me,
> > + uint32_t pipe, uint32_t queue,
> > + uint32_t xcc_id)
> > +{
> > + int i, r;
> > + /* make sure dequeue is complete*/
> > + gfx_v9_4_3_xcc_set_safe_mode(adev, xcc_id);
> > + mutex_lock(&adev->srbm_mutex);
> > + soc15_grbm_select(adev, me, pipe, queue, 0, GET_INST(GC, xcc_id));
> > + for (i = 0; i < adev->usec_timeout; i++) {
> > + if (!(RREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_HQD_ACTIVE) & 1))
> > + break;
> > + udelay(1);
> > + }
> > + if (i >= adev->usec_timeout)
> > + r = -ETIMEDOUT;
> > + else
> > + r = 0;
> > + soc15_grbm_select(adev, 0, 0, 0, 0, GET_INST(GC, xcc_id));
> > + mutex_unlock(&adev->srbm_mutex);
> > + gfx_v9_4_3_xcc_unset_safe_mode(adev, xcc_id);
> > +
> > + return r;
> > +
> > +}
> > +
> > +static bool gfx_v9_4_3_pipe_reset_support(struct amdgpu_device *adev)
> > +{
> > +
> > + if (unlikely(adev->gfx.mec_fw_version < 0x0000009b)) {
> > + DRM_WARN_ONCE("MEC firmware version too old, please use FW no older than 155!\n");
> > + return false;
> > + }
>
> This path will be taken GCv9.4.3 and GCv9.4.4. GCv9.4.4 has a different
> FW version. If FW is not yet ready for 9.4.4, better check that and
> return false for that.
>
> > +
> > + return true;
> > +}
> > +
> > +static int gfx_v9_4_3_kiq_reset_hw_pipe(struct amdgpu_ring *kiq_ring,
> > + uint32_t queue_type, uint32_t me,
> > + uint32_t pipe, uint32_t queue,
> > + uint32_t xcc_id)
> > +{
> > + struct amdgpu_device *adev = kiq_ring->adev;
> > + uint32_t reset_pipe, clean_pipe;
> > + int r;
> > +
> > + if (!gfx_v9_4_3_pipe_reset_support(adev))
> > + return -EINVAL;
> > +
> > + gfx_v9_4_3_xcc_set_safe_mode(adev, xcc_id);
> > + mutex_lock(&adev->srbm_mutex);
> > + soc15_grbm_select(adev, me, pipe, queue, 0, GET_INST(GC, xcc_id));
> > +
> > + reset_pipe = RREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_MEC_CNTL);
> > + clean_pipe = reset_pipe;
> > +
> > + if (me == 1) {
> > + switch (pipe) {
> > + case 0:
> > + reset_pipe = REG_SET_FIELD(reset_pipe, CP_MEC_CNTL,
> > + MEC_ME1_PIPE0_RESET, 1);
> > + clean_pipe = REG_SET_FIELD(clean_pipe, CP_MEC_CNTL,
> > + MEC_ME1_PIPE0_RESET, 0);
> > + break;
> > + case 1:
> > + reset_pipe = REG_SET_FIELD(reset_pipe, CP_MEC_CNTL,
> > + MEC_ME1_PIPE1_RESET, 1);
> > + clean_pipe = REG_SET_FIELD(clean_pipe, CP_MEC_CNTL,
> > + MEC_ME1_PIPE1_RESET, 0);
> > + break;
> > + case 2:
> > + reset_pipe = REG_SET_FIELD(reset_pipe, CP_MEC_CNTL,
> > + MEC_ME1_PIPE2_RESET, 1);
> > + clean_pipe = REG_SET_FIELD(clean_pipe, CP_MEC_CNTL,
> > + MEC_ME1_PIPE2_RESET, 0);
> > + break;
> > + case 3:
> > + reset_pipe = REG_SET_FIELD(reset_pipe, CP_MEC_CNTL,
> > + MEC_ME1_PIPE3_RESET, 1);
> > + clean_pipe = REG_SET_FIELD(clean_pipe, CP_MEC_CNTL,
> > + MEC_ME1_PIPE3_RESET, 0);
> > + break;
> > + default:
> > + break;
> > + }
> > + } else {
> > + if (pipe) {
> > + reset_pipe = REG_SET_FIELD(reset_pipe, CP_MEC_CNTL,
> > + MEC_ME2_PIPE1_RESET, 1);
> > + clean_pipe = REG_SET_FIELD(clean_pipe, CP_MEC_CNTL,
> > + MEC_ME2_PIPE1_RESET, 0);
> > + } else {
> > + reset_pipe = REG_SET_FIELD(reset_pipe, CP_MEC_CNTL,
> > + MEC_ME2_PIPE0_RESET, 1);
> > + clean_pipe = REG_SET_FIELD(clean_pipe, CP_MEC_CNTL,
> > + MEC_ME2_PIPE0_RESET, 0);
> > + }
> > + }
> > +
> > + WREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_MEC_CNTL, reset_pipe);
> > + WREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_MEC_CNTL, clean_pipe);
> > + soc15_grbm_select(adev, 0, 0, 0, 0, GET_INST(GC, xcc_id));
> > + mutex_unlock(&adev->srbm_mutex);
> > + gfx_v9_4_3_xcc_unset_safe_mode(adev, xcc_id);
> > +
> > + r = gfx_v9_4_3_unmap_done(adev, me, pipe, queue, xcc_id);
> > + return r;
> > +}
> > +
> > static int gfx_v9_4_3_reset_kcq(struct amdgpu_ring *ring,
> > unsigned int vmid)
> > {
> > @@ -3473,7 +3587,7 @@ static int gfx_v9_4_3_reset_kcq(struct amdgpu_ring *ring,
> > struct amdgpu_kiq *kiq = &adev->gfx.kiq[ring->xcc_id];
> > struct amdgpu_ring *kiq_ring = &kiq->ring;
> > unsigned long flags;
> > - int r, i;
> > + int r;
> >
> > if (amdgpu_sriov_vf(adev))
> > return -EINVAL;
> > @@ -3495,26 +3609,25 @@ static int gfx_v9_4_3_reset_kcq(struct amdgpu_ring *ring,
> > spin_unlock_irqrestore(&kiq->ring_lock, flags);
> >
> > r = amdgpu_ring_test_ring(kiq_ring);
> > - if (r)
> > - return r;
> > -
> > - /* make sure dequeue is complete*/
> > - amdgpu_gfx_rlc_enter_safe_mode(adev, ring->xcc_id);
> > - mutex_lock(&adev->srbm_mutex);
> > - soc15_grbm_select(adev, ring->me, ring->pipe, ring->queue, 0, GET_INST(GC, ring->xcc_id));
> > - for (i = 0; i < adev->usec_timeout; i++) {
> > - if (!(RREG32_SOC15(GC, 0, regCP_HQD_ACTIVE) & 1))
> > - break;
> > - udelay(1);
> > - }
> > - if (i >= adev->usec_timeout)
> > - r = -ETIMEDOUT;
> > - soc15_grbm_select(adev, 0, 0, 0, 0, GET_INST(GC, ring->xcc_id));
> > - mutex_unlock(&adev->srbm_mutex);
> > - amdgpu_gfx_rlc_exit_safe_mode(adev, ring->xcc_id);
> > if (r) {
> > - dev_err(adev->dev, "fail to wait on hqd deactive\n");
> > - return r;
> > + DRM_ERROR("kiq ring test failed after ring: %s queue reset\n",
> > + ring->name);
>
> dev_err here
>
> > + goto pipe_reset;
> > + }
> > +
> > + r = gfx_v9_4_3_unmap_done(adev, ring->me, ring->pipe, ring->queue, ring->xcc_id);
> > + if (r)
> > + dev_err(adev->dev,"fail to wait on hqd deactive and will try pipe reset\n");
> > +
> > +pipe_reset:
> > + if(r) {
> > + r = gfx_v9_4_3_kiq_reset_hw_pipe(kiq_ring, ring->funcs->type,
> > + ring->me, ring->pipe,
> > + ring->queue, ring->xcc_id);
>
> A side question about this - what's the expectation about the other
> queue(s) on this pipe at this point? Are they expected to be also hung
> at this point?
According to the firmware team, the pipe will only have one queue
executing on it at a given time, so resetting the pipe will only reset
that queue.
Alex
>
> In short, checking if this should be done from a higher level code or
> from here itself. Ex:
>
> Queue reset -> Failed -> Try dequeue of all other active queues -> Do a
> pipe reset
>
> > + DRM_INFO("ring: %s pipe reset :%s\n", ring->name,
> > + r ? "failed" : "successfully");
> dev_info here
>
> Thanks,
> Lijo
>
> > + if (r)
> > + return r;
> > }
> >
> > r = amdgpu_bo_reserve(ring->mqd_obj, false);
More information about the amd-gfx
mailing list