[PATCH] drm/amd/amdgpu: limit single process inside MES

Liu, Shaoyun Shaoyun.Liu at amd.com
Thu Oct 24 14:36:26 UTC 2024


[AMD Official Use Only - AMD Internal Distribution Only]

If the old FW doesn't support the isolation feature, it won't check that bit and the setting will be ignored, so it won't cause a problem.

Regards
Shaoyun.liu

-----Original Message-----
From: Alex Deucher <alexdeucher at gmail.com>
Sent: Thursday, October 24, 2024 9:21 AM
To: Liu, Shaoyun <Shaoyun.Liu at amd.com>
Cc: amd-gfx at lists.freedesktop.org
Subject: Re: [PATCH] drm/amd/amdgpu: limit single process inside MES

On Wed, Oct 23, 2024 at 8:48 PM Shaoyun Liu <shaoyun.liu at amd.com> wrote:
>
> This is for MES to limit only one process for the user queues
>
> Signed-off-by: Shaoyun Liu <shaoyun.liu at amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c |  2 ++
> drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c | 24 ++++++++++++++++++++++++
> drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h | 19 +++++++++++++++++++
> drivers/gpu/drm/amd/amdgpu/mes_v11_0.c  | 15 +++++++++++++++
> drivers/gpu/drm/amd/amdgpu/mes_v12_0.c  | 11 +++++++++++
>  5 files changed, 71 insertions(+)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> index e96984c53e72..72e38d621a29 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> @@ -1576,9 +1576,11 @@ static ssize_t amdgpu_gfx_set_enforce_isolation(struct device *dev,
>                 if (adev->enforce_isolation[i] && !partition_values[i]) {
>                         /* Going from enabled to disabled */
>                         amdgpu_vmid_free_reserved(adev,
> AMDGPU_GFXHUB(i));
> +                       amdgpu_mes_set_enforce_isolation(adev, i,
> + false);
>                 } else if (!adev->enforce_isolation[i] && partition_values[i]) {
>                         /* Going from disabled to enabled */
>                         amdgpu_vmid_alloc_reserved(adev,
> AMDGPU_GFXHUB(i));
> +                       amdgpu_mes_set_enforce_isolation(adev, i,
> + true);
>                 }
>                 adev->enforce_isolation[i] = partition_values[i];
>         }
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
> index bf584e9bcce4..dfc7d320fcbc 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
> @@ -1674,6 +1674,30 @@ bool amdgpu_mes_suspend_resume_all_supported(struct amdgpu_device *adev)
>         return is_supported;
>  }
>
> +/* Fix me -- node_id is used to identify the correct MES instances in
> +the future */ int amdgpu_mes_set_enforce_isolation(struct
> +amdgpu_device *adev, uint32_t node_id, bool enable) {
> +       struct mes_misc_op_input op_input = {0};
> +       int r;
> +
> +       op_input.op = MES_MISC_OP_CHANGE_CONFIG;
> +       op_input.change_config.option.limit_single_process = enable ?
> + 1 : 0;
> +
> +       if (!adev->mes.funcs->misc_op) {
> +               dev_err(adev->dev,"mes change config is not supported!\n");
> +               r = -EINVAL;
> +               goto error;
> +       }
> +
> +       r = adev->mes.funcs->misc_op(&adev->mes, &op_input);
> +       if (r)
> +               dev_err(adev->dev, "failed to change_config.\n");
> +
> +error:
> +       return r;
> +
> +}
> +
>  #if defined(CONFIG_DEBUG_FS)
>
>  static int amdgpu_debugfs_mes_event_log_show(struct seq_file *m, void
> *unused) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
> index 79f13d7e5e16..91bff6443c05 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
> @@ -311,6 +311,7 @@ enum mes_misc_opcode {
>         MES_MISC_OP_WRM_REG_WAIT,
>         MES_MISC_OP_WRM_REG_WR_WAIT,
>         MES_MISC_OP_SET_SHADER_DEBUGGER,
> +       MES_MISC_OP_CHANGE_CONFIG,
>  };
>
>  struct mes_misc_op_input {
> @@ -349,6 +350,21 @@ struct mes_misc_op_input {
>                         uint32_t tcp_watch_cntl[4];
>                         uint32_t trap_en;
>                 } set_shader_debugger;
> +
> +               struct {
> +                       union {
> +                               struct {
> +                                       uint32_t limit_single_process : 1;
> +                                       uint32_t enable_hws_logging_buffer : 1;
> +                                       uint32_t reserved : 30;
> +                               };
> +                               uint32_t all;
> +                       } option;
> +                       struct {
> +                               uint32_t tdr_level;
> +                               uint32_t tdr_delay;
> +                       } tdr_config;
> +               } change_config;
>         };
>  };
>
> @@ -519,4 +535,7 @@ static inline void amdgpu_mes_unlock(struct
> amdgpu_mes *mes)  }
>
>  bool amdgpu_mes_suspend_resume_all_supported(struct amdgpu_device
> *adev);
> +
> +int amdgpu_mes_set_enforce_isolation(struct amdgpu_device *adev,
> +uint32_t node_id, bool enable);
> +
>  #endif /* __AMDGPU_MES_H__ */
> diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
> b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
> index 57db0c006c8f..c621ba805433 100644
> --- a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
> @@ -644,6 +644,18 @@ static int mes_v11_0_misc_op(struct amdgpu_mes *mes,
>                                 sizeof(misc_pkt.set_shader_debugger.tcp_watch_cntl));
>                 misc_pkt.set_shader_debugger.trap_en = input->set_shader_debugger.trap_en;
>                 break;
> +       case MES_MISC_OP_CHANGE_CONFIG:
> +               if ((adev->mes.sched_version & AMDGPU_MES_VERSION_MASK) < 0x63) {
> +                       dev_err(adev->dev, "MES FW versoin must be larger than 0x63 to support limit single process feature.\n");
> +                       return -EINVAL;
> +               }
> +               misc_pkt.opcode = MESAPI_MISC__CHANGE_CONFIG;
> +               misc_pkt.change_config.opcode =
> +                               MESAPI_MISC__CHANGE_CONFIG_OPTION_LIMIT_SINGLE_PROCESS;
> +               misc_pkt.change_config.option.bits.limit_single_process =
> +                               input->change_config.option.limit_single_process;
> +               break;
> +
>         default:
>                 DRM_ERROR("unsupported misc op (%d) \n", input->op);
>                 return -EINVAL;
> @@ -719,6 +731,9 @@ static int mes_v11_0_set_hw_resources(struct amdgpu_mes *mes)
>                                         mes->event_log_gpu_addr;
>         }
>
> +       if (enforce_isolation)
> +               mes_set_hw_res_pkt.limit_single_process =1;

I assume that setting this on old firmware will not cause a problem?
If it does, we need a firmware check here as well.  If not, the patch is:
Reviewed-by: Alex Deucher <alexander.deucher at amd.com>

> +
>         return mes_v11_0_submit_pkt_and_poll_completion(mes,
>                         &mes_set_hw_res_pkt, sizeof(mes_set_hw_res_pkt),
>                         offsetof(union MESAPI_SET_HW_RESOURCES,
> api_status)); diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c
> b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c
> index 9d0e342a2f81..26d1b82721ce 100644
> --- a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c
> @@ -531,6 +531,14 @@ static int mes_v12_0_misc_op(struct amdgpu_mes *mes,
>                                 sizeof(misc_pkt.set_shader_debugger.tcp_watch_cntl));
>                 misc_pkt.set_shader_debugger.trap_en = input->set_shader_debugger.trap_en;
>                 break;
> +       case MES_MISC_OP_CHANGE_CONFIG:
> +               misc_pkt.opcode = MESAPI_MISC__CHANGE_CONFIG;
> +               misc_pkt.change_config.opcode =
> +                               MESAPI_MISC__CHANGE_CONFIG_OPTION_LIMIT_SINGLE_PROCESS;
> +               misc_pkt.change_config.option.bits.limit_single_process =
> +                               input->change_config.option.limit_single_process;
> +               break;
> +
>         default:
>                 DRM_ERROR("unsupported misc op (%d) \n", input->op);
>                 return -EINVAL;
> @@ -633,6 +641,9 @@ static int mes_v12_0_set_hw_resources(struct amdgpu_mes *mes, int pipe)
>                 mes_set_hw_res_pkt.event_intr_history_gpu_mc_ptr = mes->event_log_gpu_addr + pipe * AMDGPU_MES_LOG_BUFFER_SIZE;
>         }
>
> +       if (enforce_isolation)
> +               mes_set_hw_res_pkt.limit_single_process =1;
> +
>         return mes_v12_0_submit_pkt_and_poll_completion(mes, pipe,
>                         &mes_set_hw_res_pkt, sizeof(mes_set_hw_res_pkt),
>                         offsetof(union MESAPI_SET_HW_RESOURCES,
> api_status));
> --
> 2.34.1
>


More information about the amd-gfx mailing list