[PATCH 3/3] drm/amdgpu: partially revert "reduce reset time"
Alex Deucher
alexdeucher at gmail.com
Mon Dec 16 17:50:21 UTC 2024
On Fri, Dec 13, 2024 at 4:03 AM Christian König
<ckoenig.leichtzumerken at gmail.com> wrote:
>
> This partially reverts commit 194eb174cbe4fe2b3376ac30acca2dc8c8beca00.
>
> The reverted commit introduced a new state variable into adev without even
> remotely worrying about CPU barriers.
>
> Since we already have the amdgpu_in_reset() function exactly for this
> use case, partially revert that commit.
>
> Signed-off-by: Christian König <christian.koenig at amd.com>
Series is:
Reviewed-by: Alex Deucher <alexander.deucher at amd.com>
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu.h | 1 -
> drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c | 2 +-
> drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c | 4 ++--
> drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 3 ---
> drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 2 +-
> 5 files changed, 4 insertions(+), 8 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index 7051b697530b..5e55a44f9eef 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -1192,7 +1192,6 @@ struct amdgpu_device {
>
> struct work_struct reset_work;
>
> - bool job_hang;
> bool dc_enabled;
> /* Mask of active clusters */
> uint32_t aid_mask;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> index 503051352922..dca5a4ef2734 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> @@ -836,7 +836,7 @@ int amdgpu_amdkfd_unmap_hiq(struct amdgpu_device *adev, u32 doorbell_off,
> if (!kiq->pmf || !kiq->pmf->kiq_unmap_queues)
> return -EINVAL;
>
> - if (!kiq_ring->sched.ready || adev->job_hang)
> + if (!kiq_ring->sched.ready || amdgpu_in_reset(adev))
> return 0;
>
> ring_funcs = kzalloc(sizeof(*ring_funcs), GFP_KERNEL);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> index a4dde54512b1..2e6829e1554b 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> @@ -515,7 +515,7 @@ int amdgpu_gfx_disable_kcq(struct amdgpu_device *adev, int xcc_id)
> if (!kiq->pmf || !kiq->pmf->kiq_unmap_queues)
> return -EINVAL;
>
> - if (!kiq_ring->sched.ready || adev->job_hang || amdgpu_in_reset(adev))
> + if (!kiq_ring->sched.ready || amdgpu_in_reset(adev))
> return 0;
>
> spin_lock(&kiq->ring_lock);
> @@ -567,7 +567,7 @@ int amdgpu_gfx_disable_kgq(struct amdgpu_device *adev, int xcc_id)
> if (!kiq->pmf || !kiq->pmf->kiq_unmap_queues)
> return -EINVAL;
>
> - if (!adev->gfx.kiq[0].ring.sched.ready || adev->job_hang)
> + if (!adev->gfx.kiq[0].ring.sched.ready || amdgpu_in_reset(adev))
> return 0;
>
> if (amdgpu_gfx_is_master_xcc(adev, xcc_id)) {
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> index 7fdf7b047317..6cc44eb2586f 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> @@ -102,8 +102,6 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
> return DRM_GPU_SCHED_STAT_ENODEV;
> }
>
> - adev->job_hang = true;
> -
> /*
> * Do the coredump immediately after a job timeout to get a very
> * close dump/snapshot/representation of GPU's current error status
> @@ -181,7 +179,6 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
> }
>
> exit:
> - adev->job_hang = false;
> drm_dev_exit(idx);
> return DRM_GPU_SCHED_STAT_NOMINAL;
> }
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> index c4e15418e187..714a6caeb679 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> @@ -5957,7 +5957,7 @@ static int gfx_v10_0_cp_gfx_enable(struct amdgpu_device *adev, bool enable)
> else
> WREG32_SOC15(GC, 0, mmCP_ME_CNTL, tmp);
>
> - if (adev->job_hang && !enable)
> + if (amdgpu_in_reset(adev) && !enable)
> return 0;
>
> for (i = 0; i < adev->usec_timeout; i++) {
> --
> 2.34.1
>
More information about the amd-gfx
mailing list