[PATCH] drm/amdgpu/gfx9.4.3: Implement compute pipe reset
Ma, Le
Le.Ma at amd.com
Tue Aug 20 09:37:34 UTC 2024
[AMD Official Use Only - AMD Internal Distribution Only]
> -----Original Message-----
> From: amd-gfx <amd-gfx-bounces at lists.freedesktop.org> On Behalf Of Prike
> Liang
> Sent: Tuesday, August 20, 2024 4:58 PM
> To: amd-gfx at lists.freedesktop.org
> Cc: Deucher, Alexander <Alexander.Deucher at amd.com>; Liang, Prike
> <Prike.Liang at amd.com>
> Subject: [PATCH] drm/amdgpu/gfx9.4.3: Implement compute pipe reset
>
> Implement the compute pipe reset and driver will fallback to pipe reset when
> queue reset failed.
>
> Signed-off-by: Prike Liang <Prike.Liang at amd.com>
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h | 5 +
> drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c | 153 ++++++++++++++++++++----
> 2 files changed, 138 insertions(+), 20 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> index e28c1ebfa98f..d4d74ba2bc27 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> @@ -143,6 +143,11 @@ struct kiq_pm4_funcs {
> uint32_t queue_type, uint32_t me_id,
> uint32_t pipe_id, uint32_t queue_id,
> uint32_t xcc_id, uint32_t vmid);
> + int (*kiq_reset_hw_pipe)(struct amdgpu_ring *kiq_ring,
> + uint32_t queue_type, uint32_t me,
> + uint32_t pipe, uint32_t queue,
> + uint32_t xcc_id);
> +
> /* Packet sizes */
> int set_resources_size;
> int map_queues_size;
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
> b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
> index 2067f26d3a9d..09caa5a1842b 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
> @@ -166,6 +166,10 @@ static int gfx_v9_4_3_get_cu_info(struct
> amdgpu_device *adev,
> struct amdgpu_cu_info *cu_info);
> static void gfx_v9_4_3_xcc_set_safe_mode(struct amdgpu_device *adev, int
> xcc_id); static void gfx_v9_4_3_xcc_unset_safe_mode(struct amdgpu_device
> *adev, int xcc_id);
> +static int gfx_v9_4_3_kiq_reset_hw_pipe(struct amdgpu_ring *kiq_ring,
> + uint32_t queue_type, uint32_t me,
> + uint32_t pipe, uint32_t queue,
> + uint32_t xcc_id);
>
> static void gfx_v9_4_3_kiq_set_resources(struct amdgpu_ring *kiq_ring,
> uint64_t queue_mask)
> @@ -323,6 +327,7 @@ static const struct kiq_pm4_funcs
> gfx_v9_4_3_kiq_pm4_funcs = {
> .kiq_query_status = gfx_v9_4_3_kiq_query_status,
> .kiq_invalidate_tlbs = gfx_v9_4_3_kiq_invalidate_tlbs,
> .kiq_reset_hw_queue = gfx_v9_4_3_kiq_reset_hw_queue,
> + .kiq_reset_hw_pipe = gfx_v9_4_3_kiq_reset_hw_pipe,
> .set_resources_size = 8,
> .map_queues_size = 7,
> .unmap_queues_size = 6,
> @@ -3466,6 +3471,115 @@ static void gfx_v9_4_3_emit_wave_limit(struct
> amdgpu_ring *ring, bool enable)
> }
> }
>
> +static int gfx_v9_4_3_unmap_done(struct amdgpu_device *adev, uint32_t me,
> + uint32_t pipe, uint32_t queue,
> + uint32_t xcc_id)
> +{
> + int i, r;
> + /* make sure dequeue is complete*/
> + gfx_v9_4_3_xcc_set_safe_mode(adev, xcc_id);
> + mutex_lock(&adev->srbm_mutex);
> + soc15_grbm_select(adev, me, pipe, queue, 0, GET_INST(GC, xcc_id));
> + for (i = 0; i < adev->usec_timeout; i++) {
> + if (!(RREG32_SOC15(GC, 0, regCP_HQD_ACTIVE) & 1))
Should GET_INST be used to locate the target xcc_id here?
> + break;
> + udelay(1);
> + }
> + if (i >= adev->usec_timeout)
> + r = -ETIMEDOUT;
> + else
> + r = 0;
> + soc15_grbm_select(adev, 0, 0, 0, 0, GET_INST(GC, xcc_id));
> + mutex_unlock(&adev->srbm_mutex);
> + gfx_v9_4_3_xcc_unset_safe_mode(adev, xcc_id);
> +
> + return r;
> +
> +}
> +
> +static bool gfx_v9_4_3_pipe_reset_support(struct amdgpu_device *adev) {
> +
> + if (unlikely(adev->gfx.mec_fw_version < 0x0000009b)) {
> + DRM_WARN_ONCE("MEC firmware version too old, please use
> FW no older than 155!\n");
> + return false;
> + }
> +
> + return true;
> +}
> +
> +static int gfx_v9_4_3_kiq_reset_hw_pipe(struct amdgpu_ring *kiq_ring,
> + uint32_t queue_type, uint32_t me,
> + uint32_t pipe, uint32_t queue,
> + uint32_t xcc_id)
> +{
> + struct amdgpu_device *adev = kiq_ring->adev;
> + uint32_t reset_pipe, clean_pipe;
> + int r;
> +
> + if (!gfx_v9_4_3_pipe_reset_support(adev))
> + return -EINVAL;
> +
> + gfx_v9_4_3_xcc_set_safe_mode(adev, xcc_id);
> + mutex_lock(&adev->srbm_mutex);
> + soc15_grbm_select(adev, me, pipe, queue, 0, GET_INST(GC, xcc_id));
> +
> + reset_pipe = RREG32_SOC15(GC, 0, regCP_MEC_CNTL);
Here the xcc_id is also missed to address.
> + clean_pipe = reset_pipe;
> +
> + if (me == 1) {
> + switch (pipe) {
> + case 0:
> + reset_pipe = REG_SET_FIELD(reset_pipe,
> CP_MEC_CNTL,
> + MEC_ME1_PIPE0_RESET, 1);
> + clean_pipe = REG_SET_FIELD(clean_pipe,
> CP_MEC_CNTL,
> + MEC_ME1_PIPE0_RESET, 0);
> + break;
> + case 1:
> + reset_pipe = REG_SET_FIELD(reset_pipe,
> CP_MEC_CNTL,
> + MEC_ME1_PIPE1_RESET, 1);
> + clean_pipe = REG_SET_FIELD(clean_pipe,
> CP_MEC_CNTL,
> + MEC_ME1_PIPE1_RESET, 0);
> + break;
> + case 2:
> + reset_pipe = REG_SET_FIELD(reset_pipe,
> CP_MEC_CNTL,
> + MEC_ME1_PIPE2_RESET, 1);
> + clean_pipe = REG_SET_FIELD(clean_pipe,
> CP_MEC_CNTL,
> + MEC_ME1_PIPE2_RESET, 0);
> + break;
> + case 3:
> + reset_pipe = REG_SET_FIELD(reset_pipe,
> CP_MEC_CNTL,
> + MEC_ME1_PIPE3_RESET, 1);
> + clean_pipe = REG_SET_FIELD(clean_pipe,
> CP_MEC_CNTL,
> + MEC_ME1_PIPE3_RESET, 0);
> + break;
> + default:
> + break;
> + }
> + } else {
> + if (pipe) {
> + reset_pipe = REG_SET_FIELD(reset_pipe,
> CP_MEC_CNTL,
> + MEC_ME2_PIPE1_RESET, 1);
> + clean_pipe = REG_SET_FIELD(clean_pipe,
> CP_MEC_CNTL,
> + MEC_ME2_PIPE1_RESET, 0);
> + } else {
> + reset_pipe = REG_SET_FIELD(reset_pipe,
> CP_MEC_CNTL,
> + MEC_ME2_PIPE0_RESET, 1);
> + clean_pipe = REG_SET_FIELD(clean_pipe,
> CP_MEC_CNTL,
> + MEC_ME2_PIPE0_RESET, 0);
> + }
> + }
> +
> + WREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_MEC_CNTL,
> reset_pipe);
> + WREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_MEC_CNTL,
> clean_pipe);
> + soc15_grbm_select(adev, 0, 0, 0, 0, GET_INST(GC, xcc_id));
> + mutex_unlock(&adev->srbm_mutex);
> + gfx_v9_4_3_xcc_unset_safe_mode(adev, xcc_id);
> +
> + r = gfx_v9_4_3_unmap_done(adev, me, pipe, queue, xcc_id);
> + return r;
> +}
> +
> static int gfx_v9_4_3_reset_kcq(struct amdgpu_ring *ring,
> unsigned int vmid)
> {
> @@ -3473,7 +3587,7 @@ static int gfx_v9_4_3_reset_kcq(struct amdgpu_ring
> *ring,
> struct amdgpu_kiq *kiq = &adev->gfx.kiq[ring->xcc_id];
> struct amdgpu_ring *kiq_ring = &kiq->ring;
> unsigned long flags;
> - int r, i;
> + int r;
>
> if (amdgpu_sriov_vf(adev))
> return -EINVAL;
> @@ -3495,26 +3609,25 @@ static int gfx_v9_4_3_reset_kcq(struct
> amdgpu_ring *ring,
> spin_unlock_irqrestore(&kiq->ring_lock, flags);
>
> r = amdgpu_ring_test_ring(kiq_ring);
> - if (r)
> - return r;
> -
> - /* make sure dequeue is complete*/
> - amdgpu_gfx_rlc_enter_safe_mode(adev, ring->xcc_id);
> - mutex_lock(&adev->srbm_mutex);
> - soc15_grbm_select(adev, ring->me, ring->pipe, ring->queue, 0,
> GET_INST(GC, ring->xcc_id));
> - for (i = 0; i < adev->usec_timeout; i++) {
> - if (!(RREG32_SOC15(GC, 0, regCP_HQD_ACTIVE) & 1))
> - break;
> - udelay(1);
> - }
> - if (i >= adev->usec_timeout)
> - r = -ETIMEDOUT;
> - soc15_grbm_select(adev, 0, 0, 0, 0, GET_INST(GC, ring->xcc_id));
> - mutex_unlock(&adev->srbm_mutex);
> - amdgpu_gfx_rlc_exit_safe_mode(adev, ring->xcc_id);
> if (r) {
> - dev_err(adev->dev, "fail to wait on hqd deactive\n");
> - return r;
> + DRM_ERROR("kiq ring test failed after ring: %s queue reset\n",
> + ring->name);
Normally the dev_xxx print is used to trace the socket number where error happens.
Thanks.
> + goto pipe_reset;
> + }
> +
> + r = gfx_v9_4_3_unmap_done(adev, ring->me, ring->pipe, ring->queue,
> ring->xcc_id);
> + if (r)
> + DRM_ERROR("fail to wait on hqd deactive and will try pipe
> reset\n");
> +
> +pipe_reset:
> + if(r) {
> + r = gfx_v9_4_3_kiq_reset_hw_pipe(kiq_ring, ring->funcs->type,
> + ring->me, ring->pipe,
> + ring->queue, ring->xcc_id);
> + DRM_INFO("ring: %s pipe reset :%s\n", ring->name,
> + r ? "failed" : "successfully");
> + if (r)
> + return r;
> }
>
> r = amdgpu_bo_reserve(ring->mqd_obj, false);
> --
> 2.34.1
More information about the amd-gfx
mailing list