[PATCH 3/3] drm/amdgpu: adjust enforce_isolation handling
SRINIVASAN SHANMUGAM
srinivasan.shanmugam at amd.com
Wed Apr 9 14:35:55 UTC 2025
On 4/8/2025 9:30 PM, Alex Deucher wrote:
> Switch from a bool to an enum and allow more options
> for enforce isolation. There are now 3 modes of operation:
> - Disabled (0)
> - Enabled (serialization and cleaner shader) (1)
> - Enabled in legacy mode (no serialization or cleaner shader) (2)
> This provides better flexibility for more use cases.
>
> Signed-off-by: Alex Deucher <alexander.deucher at amd.com>
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu.h | 11 +++++-
> drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c | 16 +++++++-
> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 22 ++++++++++-
> drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 12 +++---
> drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c | 39 ++++++++++++++-----
> drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c | 2 +-
> drivers/gpu/drm/amd/amdgpu/amdgpu_job.h | 1 +
> drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c | 2 +-
> drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 3 +-
> drivers/gpu/drm/amd/amdgpu/mes_v11_0.c | 2 +-
> drivers/gpu/drm/amd/amdgpu/mes_v12_0.c | 2 +-
> .../drm/amd/amdkfd/kfd_packet_manager_v9.c | 11 +++---
> 12 files changed, 93 insertions(+), 30 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index 804d377037095..468c6ce09e3ef 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -230,7 +230,7 @@ extern int amdgpu_force_asic_type;
> extern int amdgpu_smartshift_bias;
> extern int amdgpu_use_xgmi_p2p;
> extern int amdgpu_mtype_local;
> -extern bool enforce_isolation;
> +extern int amdgpu_enforce_isolation;
> #ifdef CONFIG_HSA_AMD
> extern int sched_policy;
> extern bool debug_evictions;
> @@ -873,6 +873,13 @@ struct amdgpu_init_level {
> struct amdgpu_reset_domain;
> struct amdgpu_fru_info;
>
> +enum amdgpu_enforce_isolation_mode {
> + AMDGPU_ENFORCE_ISOLATION_DISABLE = 0,
> + AMDGPU_ENFORCE_ISOLATION_ENABLE = 1,
> + AMDGPU_ENFORCE_ISOLATION_ENABLE_LEGACY = 2,
> +};
> +
> +
> /*
> * Non-zero (true) if the GPU has VRAM. Zero (false) otherwise.
> */
> @@ -1224,7 +1231,7 @@ struct amdgpu_device {
>
> /* Protection for the following isolation structure */
> struct mutex enforce_isolation_mutex;
> - bool enforce_isolation[MAX_XCP];
> + enum amdgpu_enforce_isolation_mode enforce_isolation[MAX_XCP];
> struct amdgpu_isolation {
> void *owner;
> struct dma_fence *spearhead;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> index 4a5cd7a111fc4..34a688fc5bf35 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> @@ -296,7 +296,21 @@ static int amdgpu_cs_pass1(struct amdgpu_cs_parser *p,
> num_ibs[i], &p->jobs[i]);
> if (ret)
> goto free_all_kdata;
> - p->jobs[i]->enforce_isolation = p->adev->enforce_isolation[fpriv->xcp_id];
> + switch (p->adev->enforce_isolation[fpriv->xcp_id]) {
> + case AMDGPU_ENFORCE_ISOLATION_DISABLE:
> + default:
> + p->jobs[i]->enforce_isolation = false;
> + p->jobs[i]->run_cleaner_shader = false;
> + break;
> + case AMDGPU_ENFORCE_ISOLATION_ENABLE:
> + p->jobs[i]->enforce_isolation = true;
> + p->jobs[i]->run_cleaner_shader = true;
> + break;
> + case AMDGPU_ENFORCE_ISOLATION_ENABLE_LEGACY:
> + p->jobs[i]->enforce_isolation = true;
> + p->jobs[i]->run_cleaner_shader = false;
Hi Alex,
Even for legacy enforce_isolation, just to double check, we expect
cleaner shader to run, cz by default when we trigger modprobe amdgpu
enforce_isolation = 1, for this usecase, we expect cleaner shader to be
triggered that means legacy , am I correct pls?
Best regards,
Srini
> + break;
> + }
> }
> p->gang_leader = p->jobs[p->gang_leader_idx];
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 8b43f350447a9..700304bbe3982 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -2115,8 +2115,26 @@ static int amdgpu_device_check_arguments(struct amdgpu_device *adev)
>
> adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type);
>
> - for (i = 0; i < MAX_XCP; i++)
> - adev->enforce_isolation[i] = !!enforce_isolation;
> + for (i = 0; i < MAX_XCP; i++) {
> + switch (amdgpu_enforce_isolation) {
> + case -1:
> + case 0:
> + default:
> + /* disable */
> + adev->enforce_isolation[i] = AMDGPU_ENFORCE_ISOLATION_DISABLE;
> + break;
> + case 1:
> + /* enable */
> + adev->enforce_isolation[i] =
> + AMDGPU_ENFORCE_ISOLATION_ENABLE;
> + break;
> + case 2:
> + /* enable legacy mode */
> + adev->enforce_isolation[i] =
> + AMDGPU_ENFORCE_ISOLATION_ENABLE_LEGACY;
> + break;
> + }
> + }
>
> return 0;
> }
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> index d7b27b7a0d519..5132003d85a29 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> @@ -179,7 +179,7 @@ uint amdgpu_pg_mask = 0xffffffff;
> uint amdgpu_sdma_phase_quantum = 32;
> char *amdgpu_disable_cu;
> char *amdgpu_virtual_display;
> -bool enforce_isolation;
> +int amdgpu_enforce_isolation = -1;
> int amdgpu_modeset = -1;
>
> /* Specifies the default granularity for SVM, used in buffer
> @@ -1038,11 +1038,13 @@ module_param_named(user_partt_mode, amdgpu_user_partt_mode, uint, 0444);
>
>
> /**
> - * DOC: enforce_isolation (bool)
> - * enforce process isolation between graphics and compute via using the same reserved vmid.
> + * DOC: enforce_isolation (int)
> + * enforce process isolation between graphics and compute.
> + * (-1 = auto, 0 = disable, 1 = enable, 2 = enable legacy mode)
> */
> -module_param(enforce_isolation, bool, 0444);
> -MODULE_PARM_DESC(enforce_isolation, "enforce process isolation between graphics and compute . enforce_isolation = on");
> +module_param_named(enforce_isolation, amdgpu_enforce_isolation, int, 0444);
> +MODULE_PARM_DESC(enforce_isolation,
> +"enforce process isolation between graphics and compute. (-1 = auto, 0 = disable, 1 = enable, 2 = enable legacy mode)");
>
> /**
> * DOC: modeset (int)
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> index ff53401aae5a4..9b4272fbc470c 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> @@ -1460,6 +1460,8 @@ static int amdgpu_gfx_run_cleaner_shader_job(struct amdgpu_ring *ring)
> goto err;
>
> job->enforce_isolation = true;
> + /* always run the cleaner shader */
> + job->run_cleaner_shader = true;
>
> ib = &job->ibs[0];
> for (i = 0; i <= ring->funcs->align_mask; ++i)
> @@ -1591,7 +1593,7 @@ static ssize_t amdgpu_gfx_set_run_cleaner_shader(struct device *dev,
> * Provides the sysfs read interface to get the current settings of the 'enforce_isolation'
> * feature for each GPU partition. Reading from the 'enforce_isolation'
> * sysfs file returns the isolation settings for all partitions, where '0'
> - * indicates disabled and '1' indicates enabled.
> + * indicates disabled, '1' indicates enabled, and '2' indicates enabled in legacy mode.
> *
> * Return: The number of bytes read from the sysfs file.
> */
> @@ -1626,9 +1628,10 @@ static ssize_t amdgpu_gfx_get_enforce_isolation(struct device *dev,
> * @count: The size of the input data
> *
> * This function allows control over the 'enforce_isolation' feature, which
> - * serializes access to the graphics engine. Writing '1' or '0' to the
> - * 'enforce_isolation' sysfs file enables or disables process isolation for
> - * each partition. The input should specify the setting for all partitions.
> + * serializes access to the graphics engine. Writing '1', '2', or '0' to the
> + * 'enforce_isolation' sysfs file enables (full or legacy) or disables process
> + * isolation for each partition. The input should specify the setting for all
> + * partitions.
> *
> * Return: The number of bytes written to the sysfs file.
> */
> @@ -1665,13 +1668,29 @@ static ssize_t amdgpu_gfx_set_enforce_isolation(struct device *dev,
> return -EINVAL;
>
> for (i = 0; i < num_partitions; i++) {
> - if (partition_values[i] != 0 && partition_values[i] != 1)
> + if (partition_values[i] != 0 &&
> + partition_values[i] != 1 &&
> + partition_values[i] != 2)
> return -EINVAL;
> }
>
> mutex_lock(&adev->enforce_isolation_mutex);
> - for (i = 0; i < num_partitions; i++)
> - adev->enforce_isolation[i] = partition_values[i];
> + for (i = 0; i < num_partitions; i++) {
> + switch (partition_values[i]) {
> + case 0:
> + default:
> + adev->enforce_isolation[i] = AMDGPU_ENFORCE_ISOLATION_DISABLE;
> + break;
> + case 1:
> + adev->enforce_isolation[i] =
> + AMDGPU_ENFORCE_ISOLATION_ENABLE;
> + break;
> + case 2:
> + adev->enforce_isolation[i] =
> + AMDGPU_ENFORCE_ISOLATION_ENABLE_LEGACY;
> + break;
> + }
> + }
> mutex_unlock(&adev->enforce_isolation_mutex);
>
> amdgpu_mes_update_enforce_isolation(adev);
> @@ -2026,7 +2045,7 @@ amdgpu_gfx_enforce_isolation_wait_for_kfd(struct amdgpu_device *adev,
> bool wait = false;
>
> mutex_lock(&adev->enforce_isolation_mutex);
> - if (adev->enforce_isolation[idx]) {
> + if (adev->enforce_isolation[idx] == AMDGPU_ENFORCE_ISOLATION_ENABLE) {
> /* set the initial values if nothing is set */
> if (!adev->gfx.enforce_isolation_jiffies[idx]) {
> adev->gfx.enforce_isolation_jiffies[idx] = jiffies;
> @@ -2093,7 +2112,7 @@ void amdgpu_gfx_enforce_isolation_ring_begin_use(struct amdgpu_ring *ring)
> amdgpu_gfx_enforce_isolation_wait_for_kfd(adev, idx);
>
> mutex_lock(&adev->enforce_isolation_mutex);
> - if (adev->enforce_isolation[idx]) {
> + if (adev->enforce_isolation[idx] == AMDGPU_ENFORCE_ISOLATION_ENABLE) {
> if (adev->kfd.init_complete)
> sched_work = true;
> }
> @@ -2130,7 +2149,7 @@ void amdgpu_gfx_enforce_isolation_ring_end_use(struct amdgpu_ring *ring)
> return;
>
> mutex_lock(&adev->enforce_isolation_mutex);
> - if (adev->enforce_isolation[idx]) {
> + if (adev->enforce_isolation[idx] == AMDGPU_ENFORCE_ISOLATION_ENABLE) {
> if (adev->kfd.init_complete)
> sched_work = true;
> }
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c
> index 4c4e087230ac5..359c19de9a5b9 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c
> @@ -588,7 +588,7 @@ void amdgpu_vmid_mgr_init(struct amdgpu_device *adev)
> }
> /* alloc a default reserved vmid to enforce isolation */
> for (i = 0; i < (adev->xcp_mgr ? adev->xcp_mgr->num_xcps : 1); i++) {
> - if (adev->enforce_isolation[i])
> + if (adev->enforce_isolation[i] != AMDGPU_ENFORCE_ISOLATION_DISABLE)
> amdgpu_vmid_alloc_reserved(adev, AMDGPU_GFXHUB(i));
> }
> }
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
> index ce6b9ba967fff..f2c049129661f 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
> @@ -78,6 +78,7 @@ struct amdgpu_job {
>
> /* enforce isolation */
> bool enforce_isolation;
> + bool run_cleaner_shader;
>
> uint32_t num_ibs;
> struct amdgpu_ib ibs[];
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
> index 36f2e87161264..38ea64d87a0ac 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
> @@ -768,7 +768,7 @@ int amdgpu_mes_update_enforce_isolation(struct amdgpu_device *adev)
> if (adev->enable_mes && adev->gfx.enable_cleaner_shader) {
> mutex_lock(&adev->enforce_isolation_mutex);
> for (i = 0; i < (adev->xcp_mgr ? adev->xcp_mgr->num_xcps : 1); i++) {
> - if (adev->enforce_isolation[i])
> + if (adev->enforce_isolation[i] == AMDGPU_ENFORCE_ISOLATION_ENABLE)
> r |= amdgpu_mes_set_enforce_isolation(adev, i, true);
> else
> r |= amdgpu_mes_set_enforce_isolation(adev, i, false);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> index b5ddfcbbc9fc9..dadf6715b98be 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> @@ -676,7 +676,8 @@ int amdgpu_vm_flush(struct amdgpu_ring *ring, struct amdgpu_job *job,
> pasid_mapping_needed &= adev->gmc.gmc_funcs->emit_pasid_mapping &&
> ring->funcs->emit_wreg;
>
> - cleaner_shader_needed = adev->gfx.enable_cleaner_shader &&
> + cleaner_shader_needed = job->run_cleaner_shader &&
> + adev->gfx.enable_cleaner_shader &&
> ring->funcs->emit_cleaner_shader && job->base.s_fence &&
> &job->base.s_fence->scheduled == isolation->spearhead;
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
> index 344d32268c3cd..f7aa45775eadb 100644
> --- a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
> @@ -724,7 +724,7 @@ static int mes_v11_0_set_hw_resources(struct amdgpu_mes *mes)
> mes->event_log_gpu_addr;
> }
>
> - if (adev->enforce_isolation[0])
> + if (adev->enforce_isolation[0] == AMDGPU_ENFORCE_ISOLATION_ENABLE)
> mes_set_hw_res_pkt.limit_single_process = 1;
>
> return mes_v11_0_submit_pkt_and_poll_completion(mes,
> diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c
> index be43e19b7b7fa..b0e042a4cea19 100644
> --- a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c
> @@ -762,7 +762,7 @@ static int mes_v12_0_set_hw_resources(struct amdgpu_mes *mes, int pipe)
> pipe * (AMDGPU_MES_LOG_BUFFER_SIZE + AMDGPU_MES_MSCRATCH_SIZE);
> }
>
> - if (adev->enforce_isolation[0])
> + if (adev->enforce_isolation[0] == AMDGPU_ENFORCE_ISOLATION_ENABLE)
> mes_set_hw_res_pkt.limit_single_process = 1;
>
> return mes_v12_0_submit_pkt_and_poll_completion(mes, pipe,
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager_v9.c
> index 2893fd5e5d003..fa28c57692b86 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager_v9.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager_v9.c
> @@ -43,7 +43,7 @@ static int pm_map_process_v9(struct packet_manager *pm,
> memset(buffer, 0, sizeof(struct pm4_mes_map_process));
> packet->header.u32All = pm_build_pm4_header(IT_MAP_PROCESS,
> sizeof(struct pm4_mes_map_process));
> - if (adev->enforce_isolation[kfd->node_id])
> + if (adev->enforce_isolation[kfd->node_id] == AMDGPU_ENFORCE_ISOLATION_ENABLE)
> packet->bitfields2.exec_cleaner_shader = 1;
> packet->bitfields2.diq_enable = (qpd->is_debug) ? 1 : 0;
> packet->bitfields2.process_quantum = 10;
> @@ -102,7 +102,8 @@ static int pm_map_process_aldebaran(struct packet_manager *pm,
> memset(buffer, 0, sizeof(struct pm4_mes_map_process_aldebaran));
> packet->header.u32All = pm_build_pm4_header(IT_MAP_PROCESS,
> sizeof(struct pm4_mes_map_process_aldebaran));
> - if (adev->enforce_isolation[knode->node_id])
> + if (adev->enforce_isolation[knode->node_id] ==
> + AMDGPU_ENFORCE_ISOLATION_ENABLE)
> packet->bitfields2.exec_cleaner_shader = 1;
> packet->bitfields2.diq_enable = (qpd->is_debug) ? 1 : 0;
> packet->bitfields2.process_quantum = 10;
> @@ -165,9 +166,9 @@ static int pm_runlist_v9(struct packet_manager *pm, uint32_t *buffer,
> * hws_max_conc_proc has been done in
> * kgd2kfd_device_init().
> */
> - concurrent_proc_cnt = adev->enforce_isolation[kfd->node_id] ?
> - 1 : min(pm->dqm->processes_count,
> - kfd->max_proc_per_quantum);
> + concurrent_proc_cnt = (adev->enforce_isolation[kfd->node_id] ==
> + AMDGPU_ENFORCE_ISOLATION_ENABLE) ?
> + 1 : min(pm->dqm->processes_count, kfd->max_proc_per_quantum);
>
> packet = (struct pm4_mes_runlist *)buffer;
>
More information about the amd-gfx
mailing list