[PATCH 3/3] drm/amdgpu: adjust enforce_isolation handling

Alex Deucher alexdeucher at gmail.com
Wed Apr 9 14:42:47 UTC 2025


On Wed, Apr 9, 2025 at 10:36 AM SRINIVASAN SHANMUGAM
<srinivasan.shanmugam at amd.com> wrote:
>
>
> On 4/8/2025 9:30 PM, Alex Deucher wrote:
> > Switch from a bool to an enum and allow more options
> > for enforce isolation.  There are now 3 modes of operation:
> > - Disabled (0)
> > - Enabled (serialization and cleaner shader) (1)
> > - Enabled in legacy mode (no serialization or cleaner shader) (2)
> > This provides better flexibility for more use cases.
> >
> > Signed-off-by: Alex Deucher <alexander.deucher at amd.com>
> > ---
> >   drivers/gpu/drm/amd/amdgpu/amdgpu.h           | 11 +++++-
> >   drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c        | 16 +++++++-
> >   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c    | 22 ++++++++++-
> >   drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c       | 12 +++---
> >   drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c       | 39 ++++++++++++++-----
> >   drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c       |  2 +-
> >   drivers/gpu/drm/amd/amdgpu/amdgpu_job.h       |  1 +
> >   drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c       |  2 +-
> >   drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c        |  3 +-
> >   drivers/gpu/drm/amd/amdgpu/mes_v11_0.c        |  2 +-
> >   drivers/gpu/drm/amd/amdgpu/mes_v12_0.c        |  2 +-
> >   .../drm/amd/amdkfd/kfd_packet_manager_v9.c    | 11 +++---
> >   12 files changed, 93 insertions(+), 30 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> > index 804d377037095..468c6ce09e3ef 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> > @@ -230,7 +230,7 @@ extern int amdgpu_force_asic_type;
> >   extern int amdgpu_smartshift_bias;
> >   extern int amdgpu_use_xgmi_p2p;
> >   extern int amdgpu_mtype_local;
> > -extern bool enforce_isolation;
> > +extern int amdgpu_enforce_isolation;
> >   #ifdef CONFIG_HSA_AMD
> >   extern int sched_policy;
> >   extern bool debug_evictions;
> > @@ -873,6 +873,13 @@ struct amdgpu_init_level {
> >   struct amdgpu_reset_domain;
> >   struct amdgpu_fru_info;
> >
> > +enum amdgpu_enforce_isolation_mode {
> > +     AMDGPU_ENFORCE_ISOLATION_DISABLE = 0,
> > +     AMDGPU_ENFORCE_ISOLATION_ENABLE = 1,
> > +     AMDGPU_ENFORCE_ISOLATION_ENABLE_LEGACY = 2,
> > +};
> > +
> > +
> >   /*
> >    * Non-zero (true) if the GPU has VRAM. Zero (false) otherwise.
> >    */
> > @@ -1224,7 +1231,7 @@ struct amdgpu_device {
> >
> >       /* Protection for the following isolation structure */
> >       struct mutex                    enforce_isolation_mutex;
> > -     bool                            enforce_isolation[MAX_XCP];
> > +     enum amdgpu_enforce_isolation_mode      enforce_isolation[MAX_XCP];
> >       struct amdgpu_isolation {
> >               void                    *owner;
> >               struct dma_fence        *spearhead;
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> > index 4a5cd7a111fc4..34a688fc5bf35 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> > @@ -296,7 +296,21 @@ static int amdgpu_cs_pass1(struct amdgpu_cs_parser *p,
> >                                      num_ibs[i], &p->jobs[i]);
> >               if (ret)
> >                       goto free_all_kdata;
> > -             p->jobs[i]->enforce_isolation = p->adev->enforce_isolation[fpriv->xcp_id];
> > +             switch (p->adev->enforce_isolation[fpriv->xcp_id]) {
> > +             case AMDGPU_ENFORCE_ISOLATION_DISABLE:
> > +             default:
> > +                     p->jobs[i]->enforce_isolation = false;
> > +                     p->jobs[i]->run_cleaner_shader = false;
> > +                     break;
> > +             case AMDGPU_ENFORCE_ISOLATION_ENABLE:
> > +                     p->jobs[i]->enforce_isolation = true;
> > +                     p->jobs[i]->run_cleaner_shader = true;
> > +                     break;
> > +             case AMDGPU_ENFORCE_ISOLATION_ENABLE_LEGACY:
> > +                     p->jobs[i]->enforce_isolation = true;
> > +                     p->jobs[i]->run_cleaner_shader = false;
>
>
> Hi Alex,
>
> Even for legacy enforce_isolation, just to double check, we expect
> cleaner shader to run, cz by default when we trigger modprobe amdgpu
> enforce_isolation = 1, for this usecase, we expect cleaner shader to be
> triggered that means legacy , am I correct pls?

The idea is that this option brings back the enforce isolation
behavior prior to adding the cleaner shader functionality if you
select 2.  So no cleaner shader and no serialization with KFD; just
serialization within KFD.  So:
0 - enforce isolation disabled
1 - enable serialization between KFD and KGD and within KFD and KGD,
cleaner shader between processes
2 - enable serialization within KGD (the original enforce isolation behavior)

Alex

>
> Best regards,
> Srini
>
>
> > +                     break;
> > +             }
> >       }
> >       p->gang_leader = p->jobs[p->gang_leader_idx];
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> > index 8b43f350447a9..700304bbe3982 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> > @@ -2115,8 +2115,26 @@ static int amdgpu_device_check_arguments(struct amdgpu_device *adev)
> >
> >       adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type);
> >
> > -     for (i = 0; i < MAX_XCP; i++)
> > -             adev->enforce_isolation[i] = !!enforce_isolation;
> > +     for (i = 0; i < MAX_XCP; i++) {
> > +             switch (amdgpu_enforce_isolation) {
> > +             case -1:
> > +             case 0:
> > +             default:
> > +                     /* disable */
> > +                     adev->enforce_isolation[i] = AMDGPU_ENFORCE_ISOLATION_DISABLE;
> > +                     break;
> > +             case 1:
> > +                     /* enable */
> > +                     adev->enforce_isolation[i] =
> > +                             AMDGPU_ENFORCE_ISOLATION_ENABLE;
> > +                     break;
> > +             case 2:
> > +                     /* enable legacy mode */
> > +                     adev->enforce_isolation[i] =
> > +                             AMDGPU_ENFORCE_ISOLATION_ENABLE_LEGACY;
> > +                     break;
> > +             }
> > +     }
> >
> >       return 0;
> >   }
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> > index d7b27b7a0d519..5132003d85a29 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> > @@ -179,7 +179,7 @@ uint amdgpu_pg_mask = 0xffffffff;
> >   uint amdgpu_sdma_phase_quantum = 32;
> >   char *amdgpu_disable_cu;
> >   char *amdgpu_virtual_display;
> > -bool enforce_isolation;
> > +int amdgpu_enforce_isolation = -1;
> >   int amdgpu_modeset = -1;
> >
> >   /* Specifies the default granularity for SVM, used in buffer
> > @@ -1038,11 +1038,13 @@ module_param_named(user_partt_mode, amdgpu_user_partt_mode, uint, 0444);
> >
> >
> >   /**
> > - * DOC: enforce_isolation (bool)
> > - * enforce process isolation between graphics and compute via using the same reserved vmid.
> > + * DOC: enforce_isolation (int)
> > + * enforce process isolation between graphics and compute.
> > + * (-1 = auto, 0 = disable, 1 = enable, 2 = enable legacy mode)
> >    */
> > -module_param(enforce_isolation, bool, 0444);
> > -MODULE_PARM_DESC(enforce_isolation, "enforce process isolation between graphics and compute . enforce_isolation = on");
> > +module_param_named(enforce_isolation, amdgpu_enforce_isolation, int, 0444);
> > +MODULE_PARM_DESC(enforce_isolation,
> > +"enforce process isolation between graphics and compute. (-1 = auto, 0 = disable, 1 = enable, 2 = enable legacy mode)");
> >
> >   /**
> >    * DOC: modeset (int)
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> > index ff53401aae5a4..9b4272fbc470c 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> > @@ -1460,6 +1460,8 @@ static int amdgpu_gfx_run_cleaner_shader_job(struct amdgpu_ring *ring)
> >               goto err;
> >
> >       job->enforce_isolation = true;
> > +     /* always run the cleaner shader */
> > +     job->run_cleaner_shader = true;
> >
> >       ib = &job->ibs[0];
> >       for (i = 0; i <= ring->funcs->align_mask; ++i)
> > @@ -1591,7 +1593,7 @@ static ssize_t amdgpu_gfx_set_run_cleaner_shader(struct device *dev,
> >    * Provides the sysfs read interface to get the current settings of the 'enforce_isolation'
> >    * feature for each GPU partition. Reading from the 'enforce_isolation'
> >    * sysfs file returns the isolation settings for all partitions, where '0'
> > - * indicates disabled and '1' indicates enabled.
> > + * indicates disabled, '1' indicates enabled, and '2' indicates enabled in legacy mode.
> >    *
> >    * Return: The number of bytes read from the sysfs file.
> >    */
> > @@ -1626,9 +1628,10 @@ static ssize_t amdgpu_gfx_get_enforce_isolation(struct device *dev,
> >    * @count: The size of the input data
> >    *
> >    * This function allows control over the 'enforce_isolation' feature, which
> > - * serializes access to the graphics engine. Writing '1' or '0' to the
> > - * 'enforce_isolation' sysfs file enables or disables process isolation for
> > - * each partition. The input should specify the setting for all partitions.
> > + * serializes access to the graphics engine. Writing '1', '2', or '0' to the
> > + * 'enforce_isolation' sysfs file enables (full or legacy) or disables process
> > + * isolation for each partition. The input should specify the setting for all
> > + * partitions.
> >    *
> >    * Return: The number of bytes written to the sysfs file.
> >    */
> > @@ -1665,13 +1668,29 @@ static ssize_t amdgpu_gfx_set_enforce_isolation(struct device *dev,
> >               return -EINVAL;
> >
> >       for (i = 0; i < num_partitions; i++) {
> > -             if (partition_values[i] != 0 && partition_values[i] != 1)
> > +             if (partition_values[i] != 0 &&
> > +                 partition_values[i] != 1 &&
> > +                 partition_values[i] != 2)
> >                       return -EINVAL;
> >       }
> >
> >       mutex_lock(&adev->enforce_isolation_mutex);
> > -     for (i = 0; i < num_partitions; i++)
> > -             adev->enforce_isolation[i] = partition_values[i];
> > +     for (i = 0; i < num_partitions; i++) {
> > +             switch (partition_values[i]) {
> > +             case 0:
> > +             default:
> > +                     adev->enforce_isolation[i] = AMDGPU_ENFORCE_ISOLATION_DISABLE;
> > +                     break;
> > +             case 1:
> > +                     adev->enforce_isolation[i] =
> > +                             AMDGPU_ENFORCE_ISOLATION_ENABLE;
> > +                     break;
> > +             case 2:
> > +                     adev->enforce_isolation[i] =
> > +                             AMDGPU_ENFORCE_ISOLATION_ENABLE_LEGACY;
> > +                     break;
> > +             }
> > +     }
> >       mutex_unlock(&adev->enforce_isolation_mutex);
> >
> >       amdgpu_mes_update_enforce_isolation(adev);
> > @@ -2026,7 +2045,7 @@ amdgpu_gfx_enforce_isolation_wait_for_kfd(struct amdgpu_device *adev,
> >       bool wait = false;
> >
> >       mutex_lock(&adev->enforce_isolation_mutex);
> > -     if (adev->enforce_isolation[idx]) {
> > +     if (adev->enforce_isolation[idx] == AMDGPU_ENFORCE_ISOLATION_ENABLE) {
> >               /* set the initial values if nothing is set */
> >               if (!adev->gfx.enforce_isolation_jiffies[idx]) {
> >                       adev->gfx.enforce_isolation_jiffies[idx] = jiffies;
> > @@ -2093,7 +2112,7 @@ void amdgpu_gfx_enforce_isolation_ring_begin_use(struct amdgpu_ring *ring)
> >       amdgpu_gfx_enforce_isolation_wait_for_kfd(adev, idx);
> >
> >       mutex_lock(&adev->enforce_isolation_mutex);
> > -     if (adev->enforce_isolation[idx]) {
> > +     if (adev->enforce_isolation[idx] == AMDGPU_ENFORCE_ISOLATION_ENABLE) {
> >               if (adev->kfd.init_complete)
> >                       sched_work = true;
> >       }
> > @@ -2130,7 +2149,7 @@ void amdgpu_gfx_enforce_isolation_ring_end_use(struct amdgpu_ring *ring)
> >               return;
> >
> >       mutex_lock(&adev->enforce_isolation_mutex);
> > -     if (adev->enforce_isolation[idx]) {
> > +     if (adev->enforce_isolation[idx] == AMDGPU_ENFORCE_ISOLATION_ENABLE) {
> >               if (adev->kfd.init_complete)
> >                       sched_work = true;
> >       }
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c
> > index 4c4e087230ac5..359c19de9a5b9 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c
> > @@ -588,7 +588,7 @@ void amdgpu_vmid_mgr_init(struct amdgpu_device *adev)
> >       }
> >       /* alloc a default reserved vmid to enforce isolation */
> >       for (i = 0; i < (adev->xcp_mgr ? adev->xcp_mgr->num_xcps : 1); i++) {
> > -             if (adev->enforce_isolation[i])
> > +             if (adev->enforce_isolation[i] != AMDGPU_ENFORCE_ISOLATION_DISABLE)
> >                       amdgpu_vmid_alloc_reserved(adev, AMDGPU_GFXHUB(i));
> >       }
> >   }
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
> > index ce6b9ba967fff..f2c049129661f 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
> > @@ -78,6 +78,7 @@ struct amdgpu_job {
> >
> >       /* enforce isolation */
> >       bool                    enforce_isolation;
> > +     bool                    run_cleaner_shader;
> >
> >       uint32_t                num_ibs;
> >       struct amdgpu_ib        ibs[];
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
> > index 36f2e87161264..38ea64d87a0ac 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
> > @@ -768,7 +768,7 @@ int amdgpu_mes_update_enforce_isolation(struct amdgpu_device *adev)
> >       if (adev->enable_mes && adev->gfx.enable_cleaner_shader) {
> >               mutex_lock(&adev->enforce_isolation_mutex);
> >               for (i = 0; i < (adev->xcp_mgr ? adev->xcp_mgr->num_xcps : 1); i++) {
> > -                     if (adev->enforce_isolation[i])
> > +                     if (adev->enforce_isolation[i] == AMDGPU_ENFORCE_ISOLATION_ENABLE)
> >                               r |= amdgpu_mes_set_enforce_isolation(adev, i, true);
> >                       else
> >                               r |= amdgpu_mes_set_enforce_isolation(adev, i, false);
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> > index b5ddfcbbc9fc9..dadf6715b98be 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> > @@ -676,7 +676,8 @@ int amdgpu_vm_flush(struct amdgpu_ring *ring, struct amdgpu_job *job,
> >       pasid_mapping_needed &= adev->gmc.gmc_funcs->emit_pasid_mapping &&
> >               ring->funcs->emit_wreg;
> >
> > -     cleaner_shader_needed = adev->gfx.enable_cleaner_shader &&
> > +     cleaner_shader_needed = job->run_cleaner_shader &&
> > +             adev->gfx.enable_cleaner_shader &&
> >               ring->funcs->emit_cleaner_shader && job->base.s_fence &&
> >               &job->base.s_fence->scheduled == isolation->spearhead;
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
> > index 344d32268c3cd..f7aa45775eadb 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
> > @@ -724,7 +724,7 @@ static int mes_v11_0_set_hw_resources(struct amdgpu_mes *mes)
> >                                       mes->event_log_gpu_addr;
> >       }
> >
> > -     if (adev->enforce_isolation[0])
> > +     if (adev->enforce_isolation[0] == AMDGPU_ENFORCE_ISOLATION_ENABLE)
> >               mes_set_hw_res_pkt.limit_single_process = 1;
> >
> >       return mes_v11_0_submit_pkt_and_poll_completion(mes,
> > diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c
> > index be43e19b7b7fa..b0e042a4cea19 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c
> > @@ -762,7 +762,7 @@ static int mes_v12_0_set_hw_resources(struct amdgpu_mes *mes, int pipe)
> >                               pipe * (AMDGPU_MES_LOG_BUFFER_SIZE + AMDGPU_MES_MSCRATCH_SIZE);
> >       }
> >
> > -     if (adev->enforce_isolation[0])
> > +     if (adev->enforce_isolation[0] == AMDGPU_ENFORCE_ISOLATION_ENABLE)
> >               mes_set_hw_res_pkt.limit_single_process = 1;
> >
> >       return mes_v12_0_submit_pkt_and_poll_completion(mes, pipe,
> > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager_v9.c
> > index 2893fd5e5d003..fa28c57692b86 100644
> > --- a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager_v9.c
> > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager_v9.c
> > @@ -43,7 +43,7 @@ static int pm_map_process_v9(struct packet_manager *pm,
> >       memset(buffer, 0, sizeof(struct pm4_mes_map_process));
> >       packet->header.u32All = pm_build_pm4_header(IT_MAP_PROCESS,
> >                                       sizeof(struct pm4_mes_map_process));
> > -     if (adev->enforce_isolation[kfd->node_id])
> > +     if (adev->enforce_isolation[kfd->node_id] == AMDGPU_ENFORCE_ISOLATION_ENABLE)
> >               packet->bitfields2.exec_cleaner_shader = 1;
> >       packet->bitfields2.diq_enable = (qpd->is_debug) ? 1 : 0;
> >       packet->bitfields2.process_quantum = 10;
> > @@ -102,7 +102,8 @@ static int pm_map_process_aldebaran(struct packet_manager *pm,
> >       memset(buffer, 0, sizeof(struct pm4_mes_map_process_aldebaran));
> >       packet->header.u32All = pm_build_pm4_header(IT_MAP_PROCESS,
> >                       sizeof(struct pm4_mes_map_process_aldebaran));
> > -     if (adev->enforce_isolation[knode->node_id])
> > +     if (adev->enforce_isolation[knode->node_id] ==
> > +         AMDGPU_ENFORCE_ISOLATION_ENABLE)
> >               packet->bitfields2.exec_cleaner_shader = 1;
> >       packet->bitfields2.diq_enable = (qpd->is_debug) ? 1 : 0;
> >       packet->bitfields2.process_quantum = 10;
> > @@ -165,9 +166,9 @@ static int pm_runlist_v9(struct packet_manager *pm, uint32_t *buffer,
> >        * hws_max_conc_proc has been done in
> >        * kgd2kfd_device_init().
> >        */
> > -     concurrent_proc_cnt = adev->enforce_isolation[kfd->node_id] ?
> > -                     1 : min(pm->dqm->processes_count,
> > -                     kfd->max_proc_per_quantum);
> > +     concurrent_proc_cnt = (adev->enforce_isolation[kfd->node_id] ==
> > +                            AMDGPU_ENFORCE_ISOLATION_ENABLE) ?
> > +             1 : min(pm->dqm->processes_count, kfd->max_proc_per_quantum);
> >
> >       packet = (struct pm4_mes_runlist *)buffer;
> >


More information about the amd-gfx mailing list