[PATCH v2] drm/amdgpu: extend halt_if_hws_hang to MES
Felix Kuehling
felix.kuehling at amd.com
Tue Nov 1 19:16:58 UTC 2022
On 2022-11-01 10:46, Graham Sider wrote:
> On MES timeout, halt by looping in schedule() instead of returning
> -ETIMEDOUT if halt_if_hws_hang is set to a non-zero value.
>
> Signed-off-by: Graham Sider <Graham.Sider at amd.com>
Reviewed-by: Felix Kuehling <Felix.Kuehling at amd.com>
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu.h | 2 ++
> drivers/gpu/drm/amd/amdgpu/mes_v10_1.c | 4 ++++
> drivers/gpu/drm/amd/amdgpu/mes_v11_0.c | 4 ++++
> 3 files changed, 10 insertions(+)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index 0e6ddf05c23c..9999c18e7d8e 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -219,10 +219,12 @@ extern int amdgpu_use_xgmi_p2p;
> extern int sched_policy;
> extern bool debug_evictions;
> extern bool no_system_mem_limit;
> +extern int halt_if_hws_hang;
> #else
> static const int __maybe_unused sched_policy = KFD_SCHED_POLICY_HWS;
> static const bool __maybe_unused debug_evictions; /* = false */
> static const bool __maybe_unused no_system_mem_limit;
> +static const int __maybe_unused halt_if_hws_hang;
> #endif
> #ifdef CONFIG_HSA_AMD_P2P
> extern bool pcie_p2p;
> diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v10_1.c b/drivers/gpu/drm/amd/amdgpu/mes_v10_1.c
> index 1abdf8b7ab50..614394118a53 100644
> --- a/drivers/gpu/drm/amd/amdgpu/mes_v10_1.c
> +++ b/drivers/gpu/drm/amd/amdgpu/mes_v10_1.c
> @@ -121,6 +121,10 @@ static int mes_v10_1_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
> if (r < 1) {
> DRM_ERROR("MES failed to response msg=%d\n",
> x_pkt->header.opcode);
> +
> + while (halt_if_hws_hang)
> + schedule();
> +
> return -ETIMEDOUT;
> }
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
> index 27a330f51c7d..7bfe862aa83e 100644
> --- a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
> @@ -129,6 +129,10 @@ static int mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
> if (r < 1) {
> DRM_ERROR("MES failed to response msg=%d\n",
> x_pkt->header.opcode);
> +
> + while (halt_if_hws_hang)
> + schedule();
> +
> return -ETIMEDOUT;
> }
>
More information about the amd-gfx mailing list