[PATCH 9/9] drm/amdgpu/userq: integrate with enforce isolation
Khatri, Sunil
sukhatri at amd.com
Fri Apr 11 16:30:13 UTC 2025
Are we replacing the KFD user queue names with KGD user queue names
here?
Also, it looks like KFD user queues and KGD user queues are both
treated on par here?
Looks good in general, if the above understanding is correct. Someone
with a better understanding of isolation should also review.
Acked-by: Sunil Khatri <sunil.khatri at amd.com>
On 4/10/2025 11:41 PM, Alex Deucher wrote:
> Enforce isolation serializes access to the GFX IP. User
> queues are isolated in the MES scheduler, but we still
> need to serialize between kernel queues and user queues.
> For enforce isolation, group KGD user queues with KFD user
> queues.
>
> Signed-off-by: Alex Deucher <alexander.deucher at amd.com>
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 2 +-
> drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c | 34 ++++++++++++----------
> drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h | 6 ++--
> 3 files changed, 22 insertions(+), 20 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 4e1c97a919cec..3c6679fce7c20 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -4344,7 +4344,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
> amdgpu_sync_create(&adev->isolation[i].active);
> amdgpu_sync_create(&adev->isolation[i].prev);
> }
> - mutex_init(&adev->gfx.kfd_sch_mutex);
> + mutex_init(&adev->gfx.userq_sch_mutex);
> mutex_init(&adev->gfx.workload_profile_mutex);
> mutex_init(&adev->vcn.workload_profile_mutex);
> mutex_init(&adev->userq_mutex);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> index a42ac1060fa92..e08323f601535 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> @@ -1928,39 +1928,41 @@ void amdgpu_gfx_cleaner_shader_init(struct amdgpu_device *adev,
> static void amdgpu_gfx_kfd_sch_ctrl(struct amdgpu_device *adev, u32 idx,
> bool enable)
> {
> - mutex_lock(&adev->gfx.kfd_sch_mutex);
> + mutex_lock(&adev->gfx.userq_sch_mutex);
>
> if (enable) {
> /* If the count is already 0, it means there's an imbalance bug somewhere.
> * Note that the bug may be in a different caller than the one which triggers the
> * WARN_ON_ONCE.
> */
> - if (WARN_ON_ONCE(adev->gfx.kfd_sch_req_count[idx] == 0)) {
> + if (WARN_ON_ONCE(adev->gfx.userq_sch_req_count[idx] == 0)) {
> dev_err(adev->dev, "Attempted to enable KFD scheduler when reference count is already zero\n");
> goto unlock;
> }
>
> - adev->gfx.kfd_sch_req_count[idx]--;
> + adev->gfx.userq_sch_req_count[idx]--;
>
> - if (adev->gfx.kfd_sch_req_count[idx] == 0 &&
> - adev->gfx.kfd_sch_inactive[idx]) {
> + if (adev->gfx.userq_sch_req_count[idx] == 0 &&
> + adev->gfx.userq_sch_inactive[idx]) {
> schedule_delayed_work(&adev->gfx.enforce_isolation[idx].work,
> msecs_to_jiffies(adev->gfx.enforce_isolation_time[idx]));
> }
> } else {
> - if (adev->gfx.kfd_sch_req_count[idx] == 0) {
> + if (adev->gfx.userq_sch_req_count[idx] == 0) {
> cancel_delayed_work_sync(&adev->gfx.enforce_isolation[idx].work);
> - if (!adev->gfx.kfd_sch_inactive[idx]) {
> - amdgpu_amdkfd_stop_sched(adev, idx);
> - adev->gfx.kfd_sch_inactive[idx] = true;
> + if (!adev->gfx.userq_sch_inactive[idx]) {
> + amdgpu_userq_stop_sched(adev, idx);
> + if (adev->kfd.init_complete)
> + amdgpu_amdkfd_stop_sched(adev, idx);
> + adev->gfx.userq_sch_inactive[idx] = true;
> }
> }
>
> - adev->gfx.kfd_sch_req_count[idx]++;
> + adev->gfx.userq_sch_req_count[idx]++;
> }
>
> unlock:
> - mutex_unlock(&adev->gfx.kfd_sch_mutex);
> + mutex_unlock(&adev->gfx.userq_sch_mutex);
> }
>
> /**
> @@ -2005,12 +2007,12 @@ void amdgpu_gfx_enforce_isolation_handler(struct work_struct *work)
> msecs_to_jiffies(1));
> } else {
> /* Tell KFD to resume the runqueue */
> - if (adev->kfd.init_complete) {
> - WARN_ON_ONCE(!adev->gfx.kfd_sch_inactive[idx]);
> - WARN_ON_ONCE(adev->gfx.kfd_sch_req_count[idx]);
> + WARN_ON_ONCE(!adev->gfx.userq_sch_inactive[idx]);
> + WARN_ON_ONCE(adev->gfx.userq_sch_req_count[idx]);
> + amdgpu_userq_start_sched(adev, idx);
> + if (adev->kfd.init_complete)
> amdgpu_amdkfd_start_sched(adev, idx);
> - adev->gfx.kfd_sch_inactive[idx] = false;
> - }
> + adev->gfx.userq_sch_inactive[idx] = false;
> }
> mutex_unlock(&adev->enforce_isolation_mutex);
> }
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> index caaddab31023f..70b64bb1847c9 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> @@ -475,9 +475,9 @@ struct amdgpu_gfx {
> bool enable_cleaner_shader;
> struct amdgpu_isolation_work enforce_isolation[MAX_XCP];
> /* Mutex for synchronizing KFD scheduler operations */
> - struct mutex kfd_sch_mutex;
> - u64 kfd_sch_req_count[MAX_XCP];
> - bool kfd_sch_inactive[MAX_XCP];
> + struct mutex userq_sch_mutex;
> + u64 userq_sch_req_count[MAX_XCP];
> + bool userq_sch_inactive[MAX_XCP];
> unsigned long enforce_isolation_jiffies[MAX_XCP];
> unsigned long enforce_isolation_time[MAX_XCP];
>
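
One more note for other reviewers, since the refcount direction reads
backwards at first glance: userq_sch_req_count[idx] counts outstanding
requests to keep user queue scheduling *paused* on XCP idx, so
enable == false takes a reference and enable == true drops it. A
hypothetical caller (invented for illustration; the real callers are
inside amdgpu_gfx.c since amdgpu_gfx_kfd_sch_ctrl() is static) would
look roughly like:

	static void kernel_queue_use_gfx(struct amdgpu_device *adev, u32 idx)
	{
		/* take a "keep user queues paused" reference; the first
		 * one stops both KGD and KFD user queue scheduling */
		amdgpu_gfx_kfd_sch_ctrl(adev, idx, false);

		/* ... kernel queue work that needs the GFX IP ... */

		/* drop the reference; once the count reaches zero, the
		 * delayed enforce_isolation work restarts both flavors
		 * after the isolation interval */
		amdgpu_gfx_kfd_sch_ctrl(adev, idx, true);
	}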