[PATCH 3/3] drm/amdgpu: adjust enforce_isolation handling

SRINIVASAN SHANMUGAM srinivasan.shanmugam at amd.com
Wed Apr 9 14:35:55 UTC 2025


On 4/8/2025 9:30 PM, Alex Deucher wrote:
> Switch from a bool to an enum and allow more options
> for enforce isolation.  There are now 3 modes of operation:
> - Disabled (0)
> - Enabled (serialization and cleaner shader) (1)
> - Enabled in legacy mode (no serialization or cleaner shader) (2)
> This provides better flexibility for more use cases.
>
> Signed-off-by: Alex Deucher <alexander.deucher at amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu.h           | 11 +++++-
>   drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c        | 16 +++++++-
>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c    | 22 ++++++++++-
>   drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c       | 12 +++---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c       | 39 ++++++++++++++-----
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c       |  2 +-
>   drivers/gpu/drm/amd/amdgpu/amdgpu_job.h       |  1 +
>   drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c       |  2 +-
>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c        |  3 +-
>   drivers/gpu/drm/amd/amdgpu/mes_v11_0.c        |  2 +-
>   drivers/gpu/drm/amd/amdgpu/mes_v12_0.c        |  2 +-
>   .../drm/amd/amdkfd/kfd_packet_manager_v9.c    | 11 +++---
>   12 files changed, 93 insertions(+), 30 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index 804d377037095..468c6ce09e3ef 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -230,7 +230,7 @@ extern int amdgpu_force_asic_type;
>   extern int amdgpu_smartshift_bias;
>   extern int amdgpu_use_xgmi_p2p;
>   extern int amdgpu_mtype_local;
> -extern bool enforce_isolation;
> +extern int amdgpu_enforce_isolation;
>   #ifdef CONFIG_HSA_AMD
>   extern int sched_policy;
>   extern bool debug_evictions;
> @@ -873,6 +873,13 @@ struct amdgpu_init_level {
>   struct amdgpu_reset_domain;
>   struct amdgpu_fru_info;
>   
> +enum amdgpu_enforce_isolation_mode {
> +	AMDGPU_ENFORCE_ISOLATION_DISABLE = 0,
> +	AMDGPU_ENFORCE_ISOLATION_ENABLE = 1,
> +	AMDGPU_ENFORCE_ISOLATION_ENABLE_LEGACY = 2,
> +};
> +
> +
>   /*
>    * Non-zero (true) if the GPU has VRAM. Zero (false) otherwise.
>    */
> @@ -1224,7 +1231,7 @@ struct amdgpu_device {
>   
>   	/* Protection for the following isolation structure */
>   	struct mutex                    enforce_isolation_mutex;
> -	bool				enforce_isolation[MAX_XCP];
> +	enum amdgpu_enforce_isolation_mode	enforce_isolation[MAX_XCP];
>   	struct amdgpu_isolation {
>   		void			*owner;
>   		struct dma_fence	*spearhead;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> index 4a5cd7a111fc4..34a688fc5bf35 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> @@ -296,7 +296,21 @@ static int amdgpu_cs_pass1(struct amdgpu_cs_parser *p,
>   				       num_ibs[i], &p->jobs[i]);
>   		if (ret)
>   			goto free_all_kdata;
> -		p->jobs[i]->enforce_isolation = p->adev->enforce_isolation[fpriv->xcp_id];
> +		switch (p->adev->enforce_isolation[fpriv->xcp_id]) {
> +		case AMDGPU_ENFORCE_ISOLATION_DISABLE:
> +		default:
> +			p->jobs[i]->enforce_isolation = false;
> +			p->jobs[i]->run_cleaner_shader = false;
> +			break;
> +		case AMDGPU_ENFORCE_ISOLATION_ENABLE:
> +			p->jobs[i]->enforce_isolation = true;
> +			p->jobs[i]->run_cleaner_shader = true;
> +			break;
> +		case AMDGPU_ENFORCE_ISOLATION_ENABLE_LEGACY:
> +			p->jobs[i]->enforce_isolation = true;
> +			p->jobs[i]->run_cleaner_shader = false;


Hi Alex,

Even for legacy enforce_isolation, just to double check, we expect 
cleaner shader to run, cz by default when we trigger modprobe amdgpu 
enforce_isolation = 1, for this usecase, we expect cleaner shader to be 
triggered that means legacy , am I correct pls?

Best regards,
Srini


> +			break;
> +		}
>   	}
>   	p->gang_leader = p->jobs[p->gang_leader_idx];
>   
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 8b43f350447a9..700304bbe3982 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -2115,8 +2115,26 @@ static int amdgpu_device_check_arguments(struct amdgpu_device *adev)
>   
>   	adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type);
>   
> -	for (i = 0; i < MAX_XCP; i++)
> -		adev->enforce_isolation[i] = !!enforce_isolation;
> +	for (i = 0; i < MAX_XCP; i++) {
> +		switch (amdgpu_enforce_isolation) {
> +		case -1:
> +		case 0:
> +		default:
> +			/* disable */
> +			adev->enforce_isolation[i] = AMDGPU_ENFORCE_ISOLATION_DISABLE;
> +			break;
> +		case 1:
> +			/* enable */
> +			adev->enforce_isolation[i] =
> +				AMDGPU_ENFORCE_ISOLATION_ENABLE;
> +			break;
> +		case 2:
> +			/* enable legacy mode */
> +			adev->enforce_isolation[i] =
> +				AMDGPU_ENFORCE_ISOLATION_ENABLE_LEGACY;
> +			break;
> +		}
> +	}
>   
>   	return 0;
>   }
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> index d7b27b7a0d519..5132003d85a29 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> @@ -179,7 +179,7 @@ uint amdgpu_pg_mask = 0xffffffff;
>   uint amdgpu_sdma_phase_quantum = 32;
>   char *amdgpu_disable_cu;
>   char *amdgpu_virtual_display;
> -bool enforce_isolation;
> +int amdgpu_enforce_isolation = -1;
>   int amdgpu_modeset = -1;
>   
>   /* Specifies the default granularity for SVM, used in buffer
> @@ -1038,11 +1038,13 @@ module_param_named(user_partt_mode, amdgpu_user_partt_mode, uint, 0444);
>   
>   
>   /**
> - * DOC: enforce_isolation (bool)
> - * enforce process isolation between graphics and compute via using the same reserved vmid.
> + * DOC: enforce_isolation (int)
> + * enforce process isolation between graphics and compute.
> + * (-1 = auto, 0 = disable, 1 = enable, 2 = enable legacy mode)
>    */
> -module_param(enforce_isolation, bool, 0444);
> -MODULE_PARM_DESC(enforce_isolation, "enforce process isolation between graphics and compute . enforce_isolation = on");
> +module_param_named(enforce_isolation, amdgpu_enforce_isolation, int, 0444);
> +MODULE_PARM_DESC(enforce_isolation,
> +"enforce process isolation between graphics and compute. (-1 = auto, 0 = disable, 1 = enable, 2 = enable legacy mode)");
>   
>   /**
>    * DOC: modeset (int)
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> index ff53401aae5a4..9b4272fbc470c 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> @@ -1460,6 +1460,8 @@ static int amdgpu_gfx_run_cleaner_shader_job(struct amdgpu_ring *ring)
>   		goto err;
>   
>   	job->enforce_isolation = true;
> +	/* always run the cleaner shader */
> +	job->run_cleaner_shader = true;
>   
>   	ib = &job->ibs[0];
>   	for (i = 0; i <= ring->funcs->align_mask; ++i)
> @@ -1591,7 +1593,7 @@ static ssize_t amdgpu_gfx_set_run_cleaner_shader(struct device *dev,
>    * Provides the sysfs read interface to get the current settings of the 'enforce_isolation'
>    * feature for each GPU partition. Reading from the 'enforce_isolation'
>    * sysfs file returns the isolation settings for all partitions, where '0'
> - * indicates disabled and '1' indicates enabled.
> + * indicates disabled, '1' indicates enabled, and '2' indicates enabled in legacy mode.
>    *
>    * Return: The number of bytes read from the sysfs file.
>    */
> @@ -1626,9 +1628,10 @@ static ssize_t amdgpu_gfx_get_enforce_isolation(struct device *dev,
>    * @count: The size of the input data
>    *
>    * This function allows control over the 'enforce_isolation' feature, which
> - * serializes access to the graphics engine. Writing '1' or '0' to the
> - * 'enforce_isolation' sysfs file enables or disables process isolation for
> - * each partition. The input should specify the setting for all partitions.
> + * serializes access to the graphics engine. Writing '1', '2', or '0' to the
> + * 'enforce_isolation' sysfs file enables (full or legacy) or disables process
> + * isolation for each partition. The input should specify the setting for all
> + * partitions.
>    *
>    * Return: The number of bytes written to the sysfs file.
>    */
> @@ -1665,13 +1668,29 @@ static ssize_t amdgpu_gfx_set_enforce_isolation(struct device *dev,
>   		return -EINVAL;
>   
>   	for (i = 0; i < num_partitions; i++) {
> -		if (partition_values[i] != 0 && partition_values[i] != 1)
> +		if (partition_values[i] != 0 &&
> +		    partition_values[i] != 1 &&
> +		    partition_values[i] != 2)
>   			return -EINVAL;
>   	}
>   
>   	mutex_lock(&adev->enforce_isolation_mutex);
> -	for (i = 0; i < num_partitions; i++)
> -		adev->enforce_isolation[i] = partition_values[i];
> +	for (i = 0; i < num_partitions; i++) {
> +		switch (partition_values[i]) {
> +		case 0:
> +		default:
> +			adev->enforce_isolation[i] = AMDGPU_ENFORCE_ISOLATION_DISABLE;
> +			break;
> +		case 1:
> +			adev->enforce_isolation[i] =
> +				AMDGPU_ENFORCE_ISOLATION_ENABLE;
> +			break;
> +		case 2:
> +			adev->enforce_isolation[i] =
> +				AMDGPU_ENFORCE_ISOLATION_ENABLE_LEGACY;
> +			break;
> +		}
> +	}
>   	mutex_unlock(&adev->enforce_isolation_mutex);
>   
>   	amdgpu_mes_update_enforce_isolation(adev);
> @@ -2026,7 +2045,7 @@ amdgpu_gfx_enforce_isolation_wait_for_kfd(struct amdgpu_device *adev,
>   	bool wait = false;
>   
>   	mutex_lock(&adev->enforce_isolation_mutex);
> -	if (adev->enforce_isolation[idx]) {
> +	if (adev->enforce_isolation[idx] == AMDGPU_ENFORCE_ISOLATION_ENABLE) {
>   		/* set the initial values if nothing is set */
>   		if (!adev->gfx.enforce_isolation_jiffies[idx]) {
>   			adev->gfx.enforce_isolation_jiffies[idx] = jiffies;
> @@ -2093,7 +2112,7 @@ void amdgpu_gfx_enforce_isolation_ring_begin_use(struct amdgpu_ring *ring)
>   	amdgpu_gfx_enforce_isolation_wait_for_kfd(adev, idx);
>   
>   	mutex_lock(&adev->enforce_isolation_mutex);
> -	if (adev->enforce_isolation[idx]) {
> +	if (adev->enforce_isolation[idx] == AMDGPU_ENFORCE_ISOLATION_ENABLE) {
>   		if (adev->kfd.init_complete)
>   			sched_work = true;
>   	}
> @@ -2130,7 +2149,7 @@ void amdgpu_gfx_enforce_isolation_ring_end_use(struct amdgpu_ring *ring)
>   		return;
>   
>   	mutex_lock(&adev->enforce_isolation_mutex);
> -	if (adev->enforce_isolation[idx]) {
> +	if (adev->enforce_isolation[idx] == AMDGPU_ENFORCE_ISOLATION_ENABLE) {
>   		if (adev->kfd.init_complete)
>   			sched_work = true;
>   	}
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c
> index 4c4e087230ac5..359c19de9a5b9 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c
> @@ -588,7 +588,7 @@ void amdgpu_vmid_mgr_init(struct amdgpu_device *adev)
>   	}
>   	/* alloc a default reserved vmid to enforce isolation */
>   	for (i = 0; i < (adev->xcp_mgr ? adev->xcp_mgr->num_xcps : 1); i++) {
> -		if (adev->enforce_isolation[i])
> +		if (adev->enforce_isolation[i] != AMDGPU_ENFORCE_ISOLATION_DISABLE)
>   			amdgpu_vmid_alloc_reserved(adev, AMDGPU_GFXHUB(i));
>   	}
>   }
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
> index ce6b9ba967fff..f2c049129661f 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
> @@ -78,6 +78,7 @@ struct amdgpu_job {
>   
>   	/* enforce isolation */
>   	bool			enforce_isolation;
> +	bool			run_cleaner_shader;
>   
>   	uint32_t		num_ibs;
>   	struct amdgpu_ib	ibs[];
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
> index 36f2e87161264..38ea64d87a0ac 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
> @@ -768,7 +768,7 @@ int amdgpu_mes_update_enforce_isolation(struct amdgpu_device *adev)
>   	if (adev->enable_mes && adev->gfx.enable_cleaner_shader) {
>   		mutex_lock(&adev->enforce_isolation_mutex);
>   		for (i = 0; i < (adev->xcp_mgr ? adev->xcp_mgr->num_xcps : 1); i++) {
> -			if (adev->enforce_isolation[i])
> +			if (adev->enforce_isolation[i] == AMDGPU_ENFORCE_ISOLATION_ENABLE)
>   				r |= amdgpu_mes_set_enforce_isolation(adev, i, true);
>   			else
>   				r |= amdgpu_mes_set_enforce_isolation(adev, i, false);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> index b5ddfcbbc9fc9..dadf6715b98be 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> @@ -676,7 +676,8 @@ int amdgpu_vm_flush(struct amdgpu_ring *ring, struct amdgpu_job *job,
>   	pasid_mapping_needed &= adev->gmc.gmc_funcs->emit_pasid_mapping &&
>   		ring->funcs->emit_wreg;
>   
> -	cleaner_shader_needed = adev->gfx.enable_cleaner_shader &&
> +	cleaner_shader_needed = job->run_cleaner_shader &&
> +		adev->gfx.enable_cleaner_shader &&
>   		ring->funcs->emit_cleaner_shader && job->base.s_fence &&
>   		&job->base.s_fence->scheduled == isolation->spearhead;
>   
> diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
> index 344d32268c3cd..f7aa45775eadb 100644
> --- a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
> @@ -724,7 +724,7 @@ static int mes_v11_0_set_hw_resources(struct amdgpu_mes *mes)
>   					mes->event_log_gpu_addr;
>   	}
>   
> -	if (adev->enforce_isolation[0])
> +	if (adev->enforce_isolation[0] == AMDGPU_ENFORCE_ISOLATION_ENABLE)
>   		mes_set_hw_res_pkt.limit_single_process = 1;
>   
>   	return mes_v11_0_submit_pkt_and_poll_completion(mes,
> diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c
> index be43e19b7b7fa..b0e042a4cea19 100644
> --- a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c
> @@ -762,7 +762,7 @@ static int mes_v12_0_set_hw_resources(struct amdgpu_mes *mes, int pipe)
>   				pipe * (AMDGPU_MES_LOG_BUFFER_SIZE + AMDGPU_MES_MSCRATCH_SIZE);
>   	}
>   
> -	if (adev->enforce_isolation[0])
> +	if (adev->enforce_isolation[0] == AMDGPU_ENFORCE_ISOLATION_ENABLE)
>   		mes_set_hw_res_pkt.limit_single_process = 1;
>   
>   	return mes_v12_0_submit_pkt_and_poll_completion(mes, pipe,
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager_v9.c
> index 2893fd5e5d003..fa28c57692b86 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager_v9.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager_v9.c
> @@ -43,7 +43,7 @@ static int pm_map_process_v9(struct packet_manager *pm,
>   	memset(buffer, 0, sizeof(struct pm4_mes_map_process));
>   	packet->header.u32All = pm_build_pm4_header(IT_MAP_PROCESS,
>   					sizeof(struct pm4_mes_map_process));
> -	if (adev->enforce_isolation[kfd->node_id])
> +	if (adev->enforce_isolation[kfd->node_id] == AMDGPU_ENFORCE_ISOLATION_ENABLE)
>   		packet->bitfields2.exec_cleaner_shader = 1;
>   	packet->bitfields2.diq_enable = (qpd->is_debug) ? 1 : 0;
>   	packet->bitfields2.process_quantum = 10;
> @@ -102,7 +102,8 @@ static int pm_map_process_aldebaran(struct packet_manager *pm,
>   	memset(buffer, 0, sizeof(struct pm4_mes_map_process_aldebaran));
>   	packet->header.u32All = pm_build_pm4_header(IT_MAP_PROCESS,
>   			sizeof(struct pm4_mes_map_process_aldebaran));
> -	if (adev->enforce_isolation[knode->node_id])
> +	if (adev->enforce_isolation[knode->node_id] ==
> +	    AMDGPU_ENFORCE_ISOLATION_ENABLE)
>   		packet->bitfields2.exec_cleaner_shader = 1;
>   	packet->bitfields2.diq_enable = (qpd->is_debug) ? 1 : 0;
>   	packet->bitfields2.process_quantum = 10;
> @@ -165,9 +166,9 @@ static int pm_runlist_v9(struct packet_manager *pm, uint32_t *buffer,
>   	 * hws_max_conc_proc has been done in
>   	 * kgd2kfd_device_init().
>   	 */
> -	concurrent_proc_cnt = adev->enforce_isolation[kfd->node_id] ?
> -			1 : min(pm->dqm->processes_count,
> -			kfd->max_proc_per_quantum);
> +	concurrent_proc_cnt = (adev->enforce_isolation[kfd->node_id] ==
> +			       AMDGPU_ENFORCE_ISOLATION_ENABLE) ?
> +		1 : min(pm->dqm->processes_count, kfd->max_proc_per_quantum);
>   
>   	packet = (struct pm4_mes_runlist *)buffer;
>   


More information about the amd-gfx mailing list