[PATCH v2] drm/xe/migrate: make MI_TLB_INVALIDATE conditional

Thu Aug 14 14:03:12 UTC 2025

On 08/08/2025 18:02, Summers, Stuart wrote:
> On Fri, 2025-08-08 at 12:04 +0100, Matthew Auld wrote:
>> When clearing VRAM we should be able to skip invalidating the TLBs if
>> we
>> are only using the identity map to access VRAM (which is the common
>> case), since no modifications are made to PTEs on the fly. Also since
>> we
>> use huge 1G entries within the identity map, there should be a pretty
>> decent chance that the next packet(s) (if also clears) can avoid a
>> tree
>> walk if we don't shoot down the TLBs, like if we have to process a
>> long
>> stream of clears.
>>
>> For normal moves/copies, the src or dst is almost always system
>> memory, meaning we can't rely solely on the identity map and will
>> also need to emit PTEs, and so will always require a TLB flush.
> 
> Hey Matt,
> 
> Definitely looks like a good, interesting change. I'm running a few
> local tests. Can we hold on this merge for a few hours? I'll report
> back as soon as I have that.

Should I be worried here? I didn't see a follow-up :)

> 
> Thanks,
> Stuart
> 
>>
>> v2:
>>    - Update commit to explain the situation for normal copies (Matt B)
>>    - Rebase on latest changes
>>
>> Signed-off-by: Matthew Auld <matthew.auld at intel.com>
>> Cc: Himal Prasad Ghimiray <himal.prasad.ghimiray at intel.com>
>> Cc: Thomas Hellström <thomas.hellstrom at linux.intel.com>
>> Cc: Matthew Brost <matthew.brost at intel.com>
>> Reviewed-by: Matthew Brost <matthew.brost at intel.com>
>> ---
>>   drivers/gpu/drm/xe/xe_migrate.c  | 18 +++++++++++-------
>>   drivers/gpu/drm/xe/xe_ring_ops.c | 10 +++++-----
>>   2 files changed, 16 insertions(+), 12 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/xe/xe_migrate.c
>> b/drivers/gpu/drm/xe/xe_migrate.c
>> index 4effe10d815b..356097fd6d2a 100644
>> --- a/drivers/gpu/drm/xe/xe_migrate.c
>> +++ b/drivers/gpu/drm/xe/xe_migrate.c
>> @@ -906,7 +906,7 @@ struct dma_fence *xe_migrate_copy(struct
>> xe_migrate *m,
>>                          goto err;
>>                  }
>>   
>> -               xe_sched_job_add_migrate_flush(job, flush_flags);
>> +               xe_sched_job_add_migrate_flush(job, flush_flags |
>> MI_INVALIDATE_TLB);
>>                  if (!fence) {
>>                          err = xe_sched_job_add_deps(job, src_bo-
>>> ttm.base.resv,
>>                                                     
>> DMA_RESV_USAGE_BOOKKEEP);
>> @@ -1287,11 +1287,13 @@ struct dma_fence *xe_migrate_clear(struct
>> xe_migrate *m,
>>   
>>                  size -= clear_L0;
>>                  /* Preemption is enabled again by the ring ops. */
>> -               if (clear_vram && xe_migrate_allow_identity(clear_L0,
>> &src_it))
>> +               if (clear_vram && xe_migrate_allow_identity(clear_L0,
>> &src_it)) {
>>                          xe_res_next(&src_it, clear_L0);
>> -               else
>> -                       emit_pte(m, bb, clear_L0_pt, clear_vram,
>> clear_only_system_ccs,
>> -                                &src_it, clear_L0, dst);
>> +               } else {
>> +                       emit_pte(m, bb, clear_L0_pt, clear_vram,
>> +                                clear_only_system_ccs, &src_it,
>> clear_L0, dst);
>> +                       flush_flags |= MI_INVALIDATE_TLB;
>> +               }
>>   
>>                  bb->cs[bb->len++] = MI_BATCH_BUFFER_END;
>>                  update_idx = bb->len;
>> @@ -1302,7 +1304,7 @@ struct dma_fence *xe_migrate_clear(struct
>> xe_migrate *m,
>>                  if (xe_migrate_needs_ccs_emit(xe)) {
>>                          emit_copy_ccs(gt, bb, clear_L0_ofs, true,
>>                                        m->cleared_mem_ofs, false,
>> clear_L0);
>> -                       flush_flags = MI_FLUSH_DW_CCS;
>> +                       flush_flags |= MI_FLUSH_DW_CCS;
>>                  }
>>   
>>                  job = xe_bb_create_migration_job(m->q, bb,
>> @@ -1637,6 +1639,8 @@ __xe_migrate_update_pgtables(struct xe_migrate
>> *m,
>>                  goto err_sa;
>>          }
>>   
>> +       xe_sched_job_add_migrate_flush(job, MI_INVALIDATE_TLB);
>> +
>>          if (ops->pre_commit) {
>>                  pt_update->job = job;
>>                  err = ops->pre_commit(pt_update);
>> @@ -1862,7 +1866,7 @@ static struct dma_fence *xe_migrate_vram(struct
>> xe_migrate *m,
>>                  goto err;
>>          }
>>   
>> -       xe_sched_job_add_migrate_flush(job, 0);
>> +       xe_sched_job_add_migrate_flush(job, MI_INVALIDATE_TLB);
>>   
>>          mutex_lock(&m->job_mutex);
>>          xe_sched_job_arm(job);
>> diff --git a/drivers/gpu/drm/xe/xe_ring_ops.c
>> b/drivers/gpu/drm/xe/xe_ring_ops.c
>> index e8f22ec5f9af..80dffc3ba036 100644
>> --- a/drivers/gpu/drm/xe/xe_ring_ops.c
>> +++ b/drivers/gpu/drm/xe/xe_ring_ops.c
>> @@ -110,10 +110,10 @@ static int emit_bb_start(u64 batch_addr, u32
>> ppgtt_flag, u32 *dw, int i)
>>          return i;
>>   }
>>   
>> -static int emit_flush_invalidate(u32 addr, u32 val, u32 *dw, int i)
>> +static int emit_flush_invalidate(u32 addr, u32 val, u32 flush_flags,
>> u32 *dw, int i)
>>   {
>> -       dw[i++] = MI_FLUSH_DW | MI_INVALIDATE_TLB |
>> MI_FLUSH_DW_OP_STOREDW |
>> -                 MI_FLUSH_IMM_DW;
>> +       dw[i++] = MI_FLUSH_DW | MI_FLUSH_DW_OP_STOREDW |
>> +                 MI_FLUSH_IMM_DW | (flush_flags & MI_INVALIDATE_TLB)
>> ?: 0;
>>   
>>          dw[i++] = addr | MI_FLUSH_DW_USE_GTT;
>>          dw[i++] = 0;
>> @@ -410,13 +410,13 @@ static void emit_migration_job_gen12(struct
>> xe_sched_job *job,
>>          i = emit_bb_start(job->ptrs[0].batch_addr, BIT(8), dw, i);
>>   
>>          dw[i++] = preparser_disable(true);
>> -       i = emit_flush_invalidate(saddr, seqno, dw, i);
>> +       i = emit_flush_invalidate(saddr, seqno, job-
>>> migrate_flush_flags, dw, i);
>>          dw[i++] = preparser_disable(false);
>>   
>>          i = emit_bb_start(job->ptrs[1].batch_addr, BIT(8), dw, i);
>>   
>>          i = emit_flush_imm_ggtt(xe_lrc_seqno_ggtt_addr(lrc), seqno,
>> -                               MI_INVALIDATE_TLB | job-
>>> migrate_flush_flags,
>> +                               job->migrate_flush_flags,
>>                                  dw, i);
>>   
>>          i = emit_user_interrupt(dw, i);
>