[PATCH] drm/amdgpu/mes: keep enforce isolation up to date

Fri Feb 21 13:35:01 UTC 2025

On 2/17/2025 8:05 PM, Alex Deucher wrote:
> On Mon, Feb 17, 2025 at 9:18 AM SRINIVASAN SHANMUGAM
> <srinivasan.shanmugam at amd.com> wrote:
>>
>> On 2/17/2025 7:44 PM, Alex Deucher wrote:
>>> On Sat, Feb 15, 2025 at 3:02 AM SRINIVASAN SHANMUGAM
>>> <srinivasan.shanmugam at amd.com> wrote:
>>>> On 2/14/2025 11:05 PM, Alex Deucher wrote:
>>>>
>>>> Re-send the mes message on resume to make sure the
>>>> mes state is up to date.
>>>>
>>>> Fixes: 8521e3c5f058 ("drm/amd/amdgpu: limit single process inside MES")
>>>> Signed-off-by: Alex Deucher <alexander.deucher at amd.com>
>>>> Cc: Shaoyun Liu <shaoyun.liu at amd.com>
>>>> Cc: Srinivasan Shanmugam <srinivasan.shanmugam at amd.com>
>>>> ---
>>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c | 13 ++++---------
>>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c | 20 +++++++++++++++++++-
>>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h |  2 +-
>>>>    drivers/gpu/drm/amd/amdgpu/mes_v11_0.c  |  4 ++++
>>>>    drivers/gpu/drm/amd/amdgpu/mes_v12_0.c  |  4 ++++
>>>>    5 files changed, 32 insertions(+), 11 deletions(-)
>>>>
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
>>>> index b9bd6654f3172..a194bf3347cbc 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
>>>> @@ -1665,24 +1665,19 @@ static ssize_t amdgpu_gfx_set_enforce_isolation(struct device *dev,
>>>>     }
>>>>
>>>>     mutex_lock(&adev->enforce_isolation_mutex);
>>>> -
>>>>     for (i = 0; i < num_partitions; i++) {
>>>> - if (adev->enforce_isolation[i] && !partition_values[i]) {
>>>> + if (adev->enforce_isolation[i] && !partition_values[i])
>>>>     /* Going from enabled to disabled */
>>>>     amdgpu_vmid_free_reserved(adev, AMDGPU_GFXHUB(i));
>>>> - if (adev->enable_mes && adev->gfx.enable_cleaner_shader)
>>>> - amdgpu_mes_set_enforce_isolation(adev, i, false);
>>>> - } else if (!adev->enforce_isolation[i] && partition_values[i]) {
>>>> + else if (!adev->enforce_isolation[i] && partition_values[i])
>>>>     /* Going from disabled to enabled */
>>>>     amdgpu_vmid_alloc_reserved(adev, AMDGPU_GFXHUB(i));
>>>> - if (adev->enable_mes && adev->gfx.enable_cleaner_shader)
>>>> - amdgpu_mes_set_enforce_isolation(adev, i, true);
>>>> - }
>>>>     adev->enforce_isolation[i] = partition_values[i];
>>>>     }
>>>> -
>>>>     mutex_unlock(&adev->enforce_isolation_mutex);
>>>>
>>>> + amdgpu_mes_update_enforce_isolation(adev);
>>>> +
>>>>     return count;
>>>>    }
>>>>
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
>>>> index cee38bb6cfaf2..ca076306adba4 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
>>>> @@ -1508,7 +1508,8 @@ bool amdgpu_mes_suspend_resume_all_supported(struct amdgpu_device *adev)
>>>>    }
>>>>
>>>>    /* Fix me -- node_id is used to identify the correct MES instances in the future */
>>>> -int amdgpu_mes_set_enforce_isolation(struct amdgpu_device *adev, uint32_t node_id, bool enable)
>>>> +static int amdgpu_mes_set_enforce_isolation(struct amdgpu_device *adev,
>>>> +    uint32_t node_id, bool enable)
>>>>    {
>>>>     struct mes_misc_op_input op_input = {0};
>>>>     int r;
>>>> @@ -1530,6 +1531,23 @@ int amdgpu_mes_set_enforce_isolation(struct amdgpu_device *adev, uint32_t node_i
>>>>     return r;
>>>>    }
>>>>
>>>> +int amdgpu_mes_update_enforce_isolation(struct amdgpu_device *adev)
>>>> +{
>>>> + int i, r = 0;
>>>> +
>>>> + if (adev->enable_mes && adev->gfx.enable_cleaner_shader) {
>>>> + mutex_lock(&adev->enforce_isolation_mutex);
>>>> + for (i = 0; i < (adev->xcp_mgr ? adev->xcp_mgr->num_xcps : 1); i++) {
>>>> + if (adev->enforce_isolation[i])
>>>> + r |= amdgpu_mes_set_enforce_isolation(adev, i, true);
>>>> + else
>>>> + r |= amdgpu_mes_set_enforce_isolation(adev, i, false);
>>>> + }
>>>> + mutex_unlock(&adev->enforce_isolation_mutex);
>>>> + }
>>>> + return r;
>>>> +}
>>>> +
>>>>    #if defined(CONFIG_DEBUG_FS)
>>>>
>>>>    static int amdgpu_debugfs_mes_event_log_show(struct seq_file *m, void *unused)
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
>>>> index 6a792ffc81e33..3a65c3788956d 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
>>>> @@ -532,6 +532,6 @@ static inline void amdgpu_mes_unlock(struct amdgpu_mes *mes)
>>>>
>>>>    bool amdgpu_mes_suspend_resume_all_supported(struct amdgpu_device *adev);
>>>>
>>>> -int amdgpu_mes_set_enforce_isolation(struct amdgpu_device *adev, uint32_t node_id, bool enable);
>>>> +int amdgpu_mes_update_enforce_isolation(struct amdgpu_device *adev);
>>>>
>>>>    #endif /* __AMDGPU_MES_H__ */
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
>>>> index 530371e6a7aee..fc7b17463cb4d 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
>>>> @@ -1660,6 +1660,10 @@ static int mes_v11_0_hw_init(struct amdgpu_ip_block *ip_block)
>>>>     goto failure;
>>>>     }
>>>>
>>>> + r = amdgpu_mes_update_enforce_isolation(adev);
>>>> + if (r)
>>>> + goto failure;
>>>> +
>>>>
>>>> Hi Alex,
>>>>
>>>> Should this also be moved to mes_v11_0_hw_init? Please let me know your thoughts.
>>> I'm not sure I follow.  This is in hw_init.
>>>
>>> Alex
>> Sorry, my mistake -- I meant mes_v11_0_sw_init, please?
> There's no need to call it in sw_init, plus the hw is not set up in
> sw_init so you can't call it there anyway.  The whole point of this is
> to update the firmware with the current sw state after a suspend or
> reset.

Based on this understanding, at resume time only the message needs to be
re-sent to MES, to make sure the enforce_isolation workaround is
re-enabled after suspend.

Acked-by: Srinivasan Shanmugam <srinivasan.shanmugam at amd.com>

> Alex
>
>> Thanks!
>>
>> Srini
>>
>>>>    out:
>>>>     /*
>>>>     * Disable KIQ ring usage from the driver once MES is enabled.
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c
>>>> index 6db88584dd529..ec91c78468f30 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c
>>>> @@ -1773,6 +1773,10 @@ static int mes_v12_0_hw_init(struct amdgpu_ip_block *ip_block)
>>>>     goto failure;
>>>>     }
>>>>
>>>> + r = amdgpu_mes_update_enforce_isolation(adev);
>>>> + if (r)
>>>> + goto failure;
>>>> +
>>>>
>>>> And similarly here as well?
>>>>
>>>> Thanks!
>>>>
>>>> Srini
>>>>
>>>>    out:
>>>>     /*
>>>>     * Disable KIQ ring usage from the driver once MES is enabled.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <https://lists.freedesktop.org/archives/amd-gfx/attachments/20250221/ef0faa60/attachment-0001.htm>