[PATCH 9/9] drm/amdgpu/userq: integrate with enforce isolation

Khatri, Sunil sukhatri at amd.com
Fri Apr 11 16:30:13 UTC 2025


Are we replacing the KFD user queue names with generic KGD user queue 
names here? Also, it looks like KFD user queues and KGD user queues 
are now treated on par?

Looks good in general if the above understanding is correct. Someone 
with a better understanding of isolation should review as well.
Acked-by: Sunil Khatri <sunil.khatri at amd.com>

On 4/10/2025 11:41 PM, Alex Deucher wrote:
> Enforce isolation serializes access to the GFX IP.  User
> queues are isolated in the MES scheduler, but we still
> need to serialize between kernel queues and user queues.
> For enforce isolation, group KGD user queues with KFD user
> queues.
>
> Signed-off-by: Alex Deucher <alexander.deucher at amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  2 +-
>   drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c    | 34 ++++++++++++----------
>   drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h    |  6 ++--
>   3 files changed, 22 insertions(+), 20 deletions(-)
>
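
For other reviewers: as I understand it, the refcount below is driven
from the kernel-queue side, roughly like this (simplified sketch with
hypothetical call sites; amdgpu_gfx_kfd_sch_ctrl() itself is the real
helper touched by this patch):

	/* before a kernel queue uses an isolated partition: take a
	 * hold, which stops user-queue scheduling on the first hold */
	amdgpu_gfx_kfd_sch_ctrl(adev, xcp_idx, false);

	/* ... kernel queue work runs with isolation enforced ... */

	/* afterwards: drop the hold; dropping the last hold arms the
	 * delayed worker that restarts the schedulers */
	amdgpu_gfx_kfd_sch_ctrl(adev, xcp_idx, true);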
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 4e1c97a919cec..3c6679fce7c20 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -4344,7 +4344,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
>   		amdgpu_sync_create(&adev->isolation[i].active);
>   		amdgpu_sync_create(&adev->isolation[i].prev);
>   	}
> -	mutex_init(&adev->gfx.kfd_sch_mutex);
> +	mutex_init(&adev->gfx.userq_sch_mutex);
>   	mutex_init(&adev->gfx.workload_profile_mutex);
>   	mutex_init(&adev->vcn.workload_profile_mutex);
>   	mutex_init(&adev->userq_mutex);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> index a42ac1060fa92..e08323f601535 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> @@ -1928,39 +1928,41 @@ void amdgpu_gfx_cleaner_shader_init(struct amdgpu_device *adev,
>   static void amdgpu_gfx_kfd_sch_ctrl(struct amdgpu_device *adev, u32 idx,
>   				    bool enable)
>   {
> -	mutex_lock(&adev->gfx.kfd_sch_mutex);
> +	mutex_lock(&adev->gfx.userq_sch_mutex);
>   
>   	if (enable) {
>   		/* If the count is already 0, it means there's an imbalance bug somewhere.
>   		 * Note that the bug may be in a different caller than the one which triggers the
>   		 * WARN_ON_ONCE.
>   		 */
> -		if (WARN_ON_ONCE(adev->gfx.kfd_sch_req_count[idx] == 0)) {
> +		if (WARN_ON_ONCE(adev->gfx.userq_sch_req_count[idx] == 0)) {
>   			dev_err(adev->dev, "Attempted to enable KFD scheduler when reference count is already zero\n");
>   			goto unlock;
>   		}
>   
> -		adev->gfx.kfd_sch_req_count[idx]--;
> +		adev->gfx.userq_sch_req_count[idx]--;
>   
> -		if (adev->gfx.kfd_sch_req_count[idx] == 0 &&
> -		    adev->gfx.kfd_sch_inactive[idx]) {
> +		if (adev->gfx.userq_sch_req_count[idx] == 0 &&
> +		    adev->gfx.userq_sch_inactive[idx]) {
>   			schedule_delayed_work(&adev->gfx.enforce_isolation[idx].work,
>   					      msecs_to_jiffies(adev->gfx.enforce_isolation_time[idx]));
>   		}
>   	} else {
> -		if (adev->gfx.kfd_sch_req_count[idx] == 0) {
> +		if (adev->gfx.userq_sch_req_count[idx] == 0) {
>   			cancel_delayed_work_sync(&adev->gfx.enforce_isolation[idx].work);
> -			if (!adev->gfx.kfd_sch_inactive[idx]) {
> -				amdgpu_amdkfd_stop_sched(adev, idx);
> -				adev->gfx.kfd_sch_inactive[idx] = true;
> +			if (!adev->gfx.userq_sch_inactive[idx]) {
> +				amdgpu_userq_stop_sched(adev, idx);
> +				if (adev->kfd.init_complete)
> +					amdgpu_amdkfd_stop_sched(adev, idx);
> +				adev->gfx.userq_sch_inactive[idx] = true;
>   			}
>   		}
>   
> -		adev->gfx.kfd_sch_req_count[idx]++;
> +		adev->gfx.userq_sch_req_count[idx]++;
>   	}
>   
>   unlock:
> -	mutex_unlock(&adev->gfx.kfd_sch_mutex);
> +	mutex_unlock(&adev->gfx.userq_sch_mutex);
>   }
>   
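
The refcount logic above is easy to model in isolation; a minimal
user-space sketch of the same state machine (illustration only, all
names made up):

	#include <assert.h>
	#include <stdbool.h>
	#include <stdio.h>

	struct sch_state {
		unsigned long req_count; /* outstanding kernel-queue holds */
		bool inactive;           /* user-queue schedulers stopped */
	};

	static void sch_ctrl(struct sch_state *s, bool enable)
	{
		if (enable) {
			assert(s->req_count > 0); /* the WARN_ON_ONCE case */
			if (--s->req_count == 0 && s->inactive)
				printf("arm delayed resume\n");
		} else {
			if (s->req_count == 0) {
				printf("cancel pending resume\n");
				if (!s->inactive) {
					printf("stop userq + KFD scheds\n");
					s->inactive = true;
				}
			}
			s->req_count++;
		}
	}

	int main(void)
	{
		struct sch_state s = { 0, false };

		sch_ctrl(&s, false); /* first hold: stops schedulers */
		sch_ctrl(&s, false); /* nested hold: no extra stop */
		sch_ctrl(&s, true);  /* one hold still outstanding */
		sch_ctrl(&s, true);  /* last hold dropped: arm resume */
		return 0;
	}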
>   /**
> @@ -2005,12 +2007,12 @@ void amdgpu_gfx_enforce_isolation_handler(struct work_struct *work)
>   				      msecs_to_jiffies(1));
>   	} else {
>   		/* Tell KFD to resume the runqueue */
> -		if (adev->kfd.init_complete) {
> -			WARN_ON_ONCE(!adev->gfx.kfd_sch_inactive[idx]);
> -			WARN_ON_ONCE(adev->gfx.kfd_sch_req_count[idx]);
> +		WARN_ON_ONCE(!adev->gfx.userq_sch_inactive[idx]);
> +		WARN_ON_ONCE(adev->gfx.userq_sch_req_count[idx]);
> +		amdgpu_userq_start_sched(adev, idx);
> +		if (adev->kfd.init_complete)
>   			amdgpu_amdkfd_start_sched(adev, idx);
> -			adev->gfx.kfd_sch_inactive[idx] = false;
> -		}
> +		adev->gfx.userq_sch_inactive[idx] = false;
>   	}
>   	mutex_unlock(&adev->enforce_isolation_mutex);
>   }
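
Continuing the little model above, the resume side (what the delayed
worker does when it finally fires; simplified, the real handler also
re-checks the isolation deadline and may re-arm itself):

	static void resume_worker(struct sch_state *s)
	{
		assert(s->inactive);       /* WARN_ON_ONCE(!...inactive) */
		assert(s->req_count == 0); /* WARN_ON_ONCE(...req_count) */
		printf("start userq schedulers\n");
		printf("start KFD runqueue (if KFD initialized)\n");
		s->inactive = false;
	}

One behavioral change worth noting: previously the whole resume block
was skipped when adev->kfd.init_complete was false; now user queue
scheduling is restarted and the inactive flag cleared unconditionally,
with only the amdgpu_amdkfd_start_sched() call gated on KFD init.
That mirrors the stop path above, so it looks intentional.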
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> index caaddab31023f..70b64bb1847c9 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> @@ -475,9 +475,9 @@ struct amdgpu_gfx {
>   	bool				enable_cleaner_shader;
>   	struct amdgpu_isolation_work	enforce_isolation[MAX_XCP];
>   	/* Mutex for synchronizing KFD scheduler operations */
> -	struct mutex                    kfd_sch_mutex;
> -	u64				kfd_sch_req_count[MAX_XCP];
> -	bool				kfd_sch_inactive[MAX_XCP];
> +	struct mutex                    userq_sch_mutex;
> +	u64				userq_sch_req_count[MAX_XCP];
> +	bool				userq_sch_inactive[MAX_XCP];
>   	unsigned long			enforce_isolation_jiffies[MAX_XCP];
>   	unsigned long			enforce_isolation_time[MAX_XCP];
>   
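
One nit in the header: the comment above the mutex still says
"Mutex for synchronizing KFD scheduler operations", but with this
patch it covers KGD user queues too. Maybe update it while at it,
e.g.:

	/* Mutex for synchronizing user queue (KFD and KGD)
	 * scheduler operations */
	struct mutex                    userq_sch_mutex;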

