[PATCH] drm/amdgpu/mes: keep enforce isolation up to date

SRINIVASAN SHANMUGAM srinivasan.shanmugam at amd.com
Sat Feb 15 07:52:27 UTC 2025


On 2/14/2025 11:05 PM, Alex Deucher wrote:
> Re-send the MES message on resume to make sure the
> MES state is up to date.
>
> Fixes: 8521e3c5f058 ("drm/amd/amdgpu: limit single process inside MES")
> Signed-off-by: Alex Deucher<alexander.deucher at amd.com>
> Cc: Shaoyun Liu<shaoyun.liu at amd.com>
> Cc: Srinivasan Shanmugam<srinivasan.shanmugam at amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c | 13 ++++---------
>   drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c | 20 +++++++++++++++++++-
>   drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h |  2 +-
>   drivers/gpu/drm/amd/amdgpu/mes_v11_0.c  |  4 ++++
>   drivers/gpu/drm/amd/amdgpu/mes_v12_0.c  |  4 ++++
>   5 files changed, 32 insertions(+), 11 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> index b9bd6654f3172..a194bf3347cbc 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> @@ -1665,24 +1665,19 @@ static ssize_t amdgpu_gfx_set_enforce_isolation(struct device *dev,
>   	}
>   
>   	mutex_lock(&adev->enforce_isolation_mutex);
> -
>   	for (i = 0; i < num_partitions; i++) {
> -		if (adev->enforce_isolation[i] && !partition_values[i]) {
> +		if (adev->enforce_isolation[i] && !partition_values[i])
>   			/* Going from enabled to disabled */
>   			amdgpu_vmid_free_reserved(adev, AMDGPU_GFXHUB(i));
> -			if (adev->enable_mes && adev->gfx.enable_cleaner_shader)
> -				amdgpu_mes_set_enforce_isolation(adev, i, false);
> -		} else if (!adev->enforce_isolation[i] && partition_values[i]) {
> +		else if (!adev->enforce_isolation[i] && partition_values[i])
>   			/* Going from disabled to enabled */
>   			amdgpu_vmid_alloc_reserved(adev, AMDGPU_GFXHUB(i));
> -			if (adev->enable_mes && adev->gfx.enable_cleaner_shader)
> -				amdgpu_mes_set_enforce_isolation(adev, i, true);
> -		}
>   		adev->enforce_isolation[i] = partition_values[i];
>   	}
> -
>   	mutex_unlock(&adev->enforce_isolation_mutex);
>   
> +	amdgpu_mes_update_enforce_isolation(adev);
> +
>   	return count;
>   }
>   
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
> index cee38bb6cfaf2..ca076306adba4 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
> @@ -1508,7 +1508,8 @@ bool amdgpu_mes_suspend_resume_all_supported(struct amdgpu_device *adev)
>   }
>   
>   /* Fix me -- node_id is used to identify the correct MES instances in the future */
> -int amdgpu_mes_set_enforce_isolation(struct amdgpu_device *adev, uint32_t node_id, bool enable)
> +static int amdgpu_mes_set_enforce_isolation(struct amdgpu_device *adev,
> +					    uint32_t node_id, bool enable)
>   {
>   	struct mes_misc_op_input op_input = {0};
>   	int r;
> @@ -1530,6 +1531,23 @@ int amdgpu_mes_set_enforce_isolation(struct amdgpu_device *adev, uint32_t node_i
>   	return r;
>   }
>   
> +int amdgpu_mes_update_enforce_isolation(struct amdgpu_device *adev)
> +{
> +	int i, r = 0;
> +
> +	if (adev->enable_mes && adev->gfx.enable_cleaner_shader) {
> +		mutex_lock(&adev->enforce_isolation_mutex);
> +		for (i = 0; i < (adev->xcp_mgr ? adev->xcp_mgr->num_xcps : 1); i++) {
> +			if (adev->enforce_isolation[i])
> +				r |= amdgpu_mes_set_enforce_isolation(adev, i, true);
> +			else
> +				r |= amdgpu_mes_set_enforce_isolation(adev, i, false);
> +		}
> +		mutex_unlock(&adev->enforce_isolation_mutex);
> +	}
> +	return r;
> +}
> +
>   #if defined(CONFIG_DEBUG_FS)
>   
>   static int amdgpu_debugfs_mes_event_log_show(struct seq_file *m, void *unused)
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
> index 6a792ffc81e33..3a65c3788956d 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
> @@ -532,6 +532,6 @@ static inline void amdgpu_mes_unlock(struct amdgpu_mes *mes)
>   
>   bool amdgpu_mes_suspend_resume_all_supported(struct amdgpu_device *adev);
>   
> -int amdgpu_mes_set_enforce_isolation(struct amdgpu_device *adev, uint32_t node_id, bool enable);
> +int amdgpu_mes_update_enforce_isolation(struct amdgpu_device *adev);
>   
>   #endif /* __AMDGPU_MES_H__ */
> diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
> index 530371e6a7aee..fc7b17463cb4d 100644
> --- a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
> @@ -1660,6 +1660,10 @@ static int mes_v11_0_hw_init(struct amdgpu_ip_block *ip_block)
>   		goto failure;
>   	}
>   
> +	r = amdgpu_mes_update_enforce_isolation(adev);
> +	if (r)
> +		goto failure;
> +

Hi Alex,

Should this also be moved to mes_v11_0_hw_init? Please let me know your 
thoughts.

>   out:
>   	/*
>   	 * Disable KIQ ring usage from the driver once MES is enabled.
> diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c
> index 6db88584dd529..ec91c78468f30 100644
> --- a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c
> @@ -1773,6 +1773,10 @@ static int mes_v12_0_hw_init(struct amdgpu_ip_block *ip_block)
>   		goto failure;
>   	}
>   
> +	r = amdgpu_mes_update_enforce_isolation(adev);
> +	if (r)
> +		goto failure;
> +

And similarly here as well?

Thanks!

Srini

>   out:
>   	/*
>   	 * Disable KIQ ring usage from the driver once MES is enabled.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <https://lists.freedesktop.org/archives/amd-gfx/attachments/20250215/d3ab7e25/attachment.htm>


More information about the amd-gfx mailing list