[PATCH] drm/amd/amdgpu: limit single process inside MES
Alex Deucher
alexdeucher at gmail.com
Wed Oct 23 19:32:04 UTC 2024
On Wed, Oct 23, 2024 at 2:08 PM Shaoyun Liu <shaoyun.liu at amd.com> wrote:
>
> This is for MES to limit only one process for the user queues
>
> Signed-off-by: Shaoyun Liu <shaoyun.liu at amd.com>
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c | 2 ++
> drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c | 24 ++++++++++++++++++++++++
> drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h | 19 +++++++++++++++++++
> drivers/gpu/drm/amd/amdgpu/mes_v11_0.c | 11 +++++++++++
> drivers/gpu/drm/amd/amdgpu/mes_v12_0.c | 11 +++++++++++
> 5 files changed, 67 insertions(+)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> index e96984c53e72..72e38d621a29 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> @@ -1576,9 +1576,11 @@ static ssize_t amdgpu_gfx_set_enforce_isolation(struct device *dev,
> if (adev->enforce_isolation[i] && !partition_values[i]) {
> /* Going from enabled to disabled */
> amdgpu_vmid_free_reserved(adev, AMDGPU_GFXHUB(i));
> + amdgpu_mes_set_enforce_isolation(adev, i, false);
> } else if (!adev->enforce_isolation[i] && partition_values[i]) {
> /* Going from disabled to enabled */
> amdgpu_vmid_alloc_reserved(adev, AMDGPU_GFXHUB(i));
> + amdgpu_mes_set_enforce_isolation(adev, i, true);
> }
> adev->enforce_isolation[i] = partition_values[i];
> }
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
> index bf584e9bcce4..29b6a2baae4d 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
> @@ -1674,6 +1674,30 @@ bool amdgpu_mes_suspend_resume_all_supported(struct amdgpu_device *adev)
> return is_supported;
> }
>
> +/* Fix me -- node_id is used to identify the correct MES instances in the future */
> +int amdgpu_mes_set_enforce_isolation(struct amdgpu_device *adev, uint32_t node_id, bool enable)
> +{
> + struct mes_misc_op_input op_input = {0};
> + int r;
> +
> + op_input.op = MES_MISC_OP_CHANGE_CONFIG;
> + op_input.change_config.option.limit_single_process = enable ? 1 : 0;
> +
> + if (!adev->mes.funcs->misc_op) {
> + DRM_ERROR("mes change config is not supported!\n");
Please use dev_err() so it's clear which GPU the error is coming from
in a multi-GPU system.
> + r = -EINVAL;
> + goto error;
> + }
> +
> + r = adev->mes.funcs->misc_op(&adev->mes, &op_input);
> + if (r)
> + DRM_ERROR("failed to change_config.\n");
Same here: please use dev_err() instead of DRM_ERROR().
> +
> +error:
> + return r;
> +
> +}
> +
> #if defined(CONFIG_DEBUG_FS)
>
> static int amdgpu_debugfs_mes_event_log_show(struct seq_file *m, void *unused)
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
> index 79f13d7e5e16..91bff6443c05 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
> @@ -311,6 +311,7 @@ enum mes_misc_opcode {
> MES_MISC_OP_WRM_REG_WAIT,
> MES_MISC_OP_WRM_REG_WR_WAIT,
> MES_MISC_OP_SET_SHADER_DEBUGGER,
> + MES_MISC_OP_CHANGE_CONFIG,
> };
>
> struct mes_misc_op_input {
> @@ -349,6 +350,21 @@ struct mes_misc_op_input {
> uint32_t tcp_watch_cntl[4];
> uint32_t trap_en;
> } set_shader_debugger;
> +
> + struct {
> + union {
> + struct {
> + uint32_t limit_single_process : 1;
> + uint32_t enable_hws_logging_buffer : 1;
> + uint32_t reserved : 30;
> + };
> + uint32_t all;
> + } option;
> + struct {
> + uint32_t tdr_level;
> + uint32_t tdr_delay;
> + } tdr_config;
> + } change_config;
> };
> };
>
> @@ -519,4 +535,7 @@ static inline void amdgpu_mes_unlock(struct amdgpu_mes *mes)
> }
>
> bool amdgpu_mes_suspend_resume_all_supported(struct amdgpu_device *adev);
> +
> +int amdgpu_mes_set_enforce_isolation(struct amdgpu_device *adev, uint32_t node_id, bool enable);
> +
> #endif /* __AMDGPU_MES_H__ */
> diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
> index 57db0c006c8f..1d6de7bced48 100644
> --- a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
> @@ -644,6 +644,14 @@ static int mes_v11_0_misc_op(struct amdgpu_mes *mes,
> sizeof(misc_pkt.set_shader_debugger.tcp_watch_cntl));
> misc_pkt.set_shader_debugger.trap_en = input->set_shader_debugger.trap_en;
> break;
> + case MES_MISC_OP_CHANGE_CONFIG:
> + misc_pkt.opcode = MESAPI_MISC__CHANGE_CONFIG;
> + misc_pkt.change_config.opcode =
> + MESAPI_MISC__CHANGE_CONFIG_OPTION_LIMIT_SINGLE_PROCESS;
> + misc_pkt.change_config.option.bits.limit_single_process =
> + input->change_config.option.limit_single_process;
We should add a firmware version check here and return an error if
the fw version is too old to support this packet.
> + break;
> +
> default:
> DRM_ERROR("unsupported misc op (%d) \n", input->op);
> return -EINVAL;
> @@ -719,6 +727,9 @@ static int mes_v11_0_set_hw_resources(struct amdgpu_mes *mes)
> mes->event_log_gpu_addr;
> }
>
> + if(enforce_isolation)
Missing space between "if" and "(".
> + mes_set_hw_res_pkt.limit_single_process =1;
> +
> return mes_v11_0_submit_pkt_and_poll_completion(mes,
> &mes_set_hw_res_pkt, sizeof(mes_set_hw_res_pkt),
> offsetof(union MESAPI_SET_HW_RESOURCES, api_status));
> diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c
> index 9d0e342a2f81..85eff9b777c2 100644
> --- a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c
> @@ -531,6 +531,14 @@ static int mes_v12_0_misc_op(struct amdgpu_mes *mes,
> sizeof(misc_pkt.set_shader_debugger.tcp_watch_cntl));
> misc_pkt.set_shader_debugger.trap_en = input->set_shader_debugger.trap_en;
> break;
> + case MES_MISC_OP_CHANGE_CONFIG:
> + misc_pkt.opcode = MESAPI_MISC__CHANGE_CONFIG;
> + misc_pkt.change_config.opcode =
> + MESAPI_MISC__CHANGE_CONFIG_OPTION_LIMIT_SINGLE_PROCESS;
> + misc_pkt.change_config.option.bits.limit_single_process =
> + input->change_config.option.limit_single_process;
> + break;
> +
> default:
> DRM_ERROR("unsupported misc op (%d) \n", input->op);
> return -EINVAL;
> @@ -633,6 +641,9 @@ static int mes_v12_0_set_hw_resources(struct amdgpu_mes *mes, int pipe)
> mes_set_hw_res_pkt.event_intr_history_gpu_mc_ptr = mes->event_log_gpu_addr + pipe * AMDGPU_MES_LOG_BUFFER_SIZE;
> }
>
> + if(enforce_isolation)
Missing space between "if" and "(".
Other than that, looks good to me.
Alex
> + mes_set_hw_res_pkt.limit_single_process =1;
> +
> return mes_v12_0_submit_pkt_and_poll_completion(mes, pipe,
> &mes_set_hw_res_pkt, sizeof(mes_set_hw_res_pkt),
> offsetof(union MESAPI_SET_HW_RESOURCES, api_status));
> --
> 2.34.1
>
More information about the amd-gfx
mailing list