[PATCH 03/15] drm/xe: CPU binds for jobs
Matthew Brost
matthew.brost at intel.com
Thu Jun 5 16:13:38 UTC 2025
On Thu, Jun 05, 2025 at 05:44:07PM +0200, Thomas Hellström wrote:
> Hi, Matt,
>
> An early comment:
>
> Previous concerns have also included:
>
> 1) If clearing and binding happen on the same exec_queue, GPU binding
> is actually likely to be faster, right, since it can be queued without
> waiting for additional dependencies? Do we have any timings from start-
> of-clear to support or debunk this argument?
>
The cases where the clearing / moving is on the same queue and the bind is
pipelined:
- non-SVM pagefaults
- rebinds in the exec IOCTL or preempt resume work

The case where we use different queues:
- user binds

The cases where we use the same queue but would likely still have a GuC /
HW context switch:
- SVM pagefaults, as we need to wait on the copy job in the migration, so
  the bind is not pipelined
- SVM prefetch, same as above

The common case is clearly user binds. SVM pagefaults + prefetch seem
likely to be more common than non-SVM pagefaults or exec IOCTL rebinds. Let
me see if I can measure the difference between CPU and GPU binds for the
cases where the GPU might be faster and get back to you.
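A rough sketch of what that measurement could look like (hypothetical
instrumentation, not part of this series): wrapping the CPU path added in
this patch with ktime gives the run_job-to-PTEs-written latency, and the
GPU path would need the equivalent from batch submission to the bind fence
signaling. trace_printk() and the ktime helpers are standard kernel APIs;
the rest mirrors run_pt_job() from the patch below.

/* Hypothetical instrumentation around run_pt_job() from this patch */
static void run_pt_job(struct xe_sched_job *job)
{
        ktime_t start = ktime_get();

        __xe_migrate_update_pgtables_cpu(job->pt_update[0].vm,
                                         job->pt_update[0].tile,
                                         job->pt_update[0].ops,
                                         job->pt_update[0].pt_job_ops->ops,
                                         job->pt_update[0].pt_job_ops->current_op);

        /* Latency from dependencies resolved to PTEs written */
        trace_printk("CPU bind: %lld us\n",
                     ktime_us_delta(ktime_get(), start));
}

Any GPU-bind number would need to include the GuC submission and
context-switch overhead for the comparison to be apples to apples.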
> 2) Are page-tables in unmappable VRAM something we'd want to support at
> some point?
Do we? This would be an entire rewrite of our binding code, as we always
use the CPU to populate PTEs that are not part of the current page table
structure. Likewise, zapping PTEs is always done via the CPU too. This
would be a significantly larger change than anything proposed here and IMO
is really out of scope, as this change is minor compared to supporting
unmappable VRAM PTEs.
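For context, the CPU paths in question boil down to direct stores through a
CPU mapping of the PT BO, roughly like the sketch below (simplified from
the clear callback in this patch; names and arguments are illustrative, not
the exact driver code). With page tables in unmappable VRAM neither branch
would be possible, and every such write, zapping included, would have to be
turned back into MI_STORE_DATA_IMM batches like the write_pgtable() helper
this patch removes.

/*
 * Simplified sketch of a CPU-side PTE clear, modeled on
 * xe_migrate_clear_pgtable_callback(). Assumes the PT BO has a CPU
 * mapping, which unmappable VRAM would not provide.
 */
static void cpu_clear_ptes(struct xe_device *xe, struct iosys_map *map,
                           u64 *ptes, u32 qword_ofs, u32 num_qwords, u64 empty)
{
        u32 i;

        if (map && map->is_iomem)       /* PT BO in CPU-visible VRAM */
                for (i = 0; i < num_qwords; ++i)
                        xe_map_wr(xe, map, (qword_ofs + i) * sizeof(u64),
                                  u64, empty);
        else                            /* PT BO in system memory */
                memset64(ptes + qword_ofs, empty, num_qwords);
}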
Matt
>
> Thanks,
> Thomas
>
>
> On Thu, 2025-06-05 at 08:32 -0700, Matthew Brost wrote:
> > No reason to use the GPU for binds. In run_job, use the CPU to
> > perform
> > binds once the bind job's dependencies are resolved.
> >
> > Benefits of CPU-based binds:
> > - Lower latency once dependencies are resolved, as there is no
> > interaction with the GuC or a hardware context switch both of which
> > are relatively slow.
> > - Large arrays of binds do not risk running out of migration PTEs,
> > avoiding -ENOBUFS being returned to userspace.
> > - Kernel binds are decoupled from the migration exec queue (which
> > issues
> > copies and clears), so they cannot get stuck behind unrelated
> > jobs—this can be a problem with parallel GPU faults.
> > - Enables ULLS on the migration exec queue, as this queue has
> > exclusive
> > access to the paging copy engine.
> >
> > The basic idea of the implementation is to store the VM page table
> > update operations (struct xe_vm_pgtable_update_op *pt_op) and
> > additional
> > arguments for the migrate layer’s CPU PTE update function in a job.
> > The
> > submission backend can then call into the migrate layer using the CPU
> > to
> > write the PTEs and free the stored resources for the PTE update.
> >
> > PT job submission is implemented in the GuC backend for simplicity. A
> > follow-up could introduce a specific backend for PT jobs.
> >
> > All code related to GPU-based binding has been removed.
> >
> > Signed-off-by: Matthew Brost <matthew.brost at intel.com>
> > ---
> > drivers/gpu/drm/xe/xe_bo.c | 7 +-
> > drivers/gpu/drm/xe/xe_bo.h | 9 +-
> > drivers/gpu/drm/xe/xe_bo_types.h | 2 -
> > drivers/gpu/drm/xe/xe_drm_client.c | 3 +-
> > drivers/gpu/drm/xe/xe_guc_submit.c | 36 +++-
> > drivers/gpu/drm/xe/xe_migrate.c | 251 +++-------------------
> > --
> > drivers/gpu/drm/xe/xe_migrate.h | 6 +
> > drivers/gpu/drm/xe/xe_pt.c | 188 ++++++++++++++----
> > drivers/gpu/drm/xe/xe_pt.h | 5 +-
> > drivers/gpu/drm/xe/xe_pt_types.h | 29 ++-
> > drivers/gpu/drm/xe/xe_sched_job.c | 78 +++++---
> > drivers/gpu/drm/xe/xe_sched_job_types.h | 31 ++-
> > drivers/gpu/drm/xe/xe_vm.c | 46 ++---
> > 13 files changed, 341 insertions(+), 350 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/xe/xe_bo.c b/drivers/gpu/drm/xe/xe_bo.c
> > index 61d208c85281..7aa598b584d2 100644
> > --- a/drivers/gpu/drm/xe/xe_bo.c
> > +++ b/drivers/gpu/drm/xe/xe_bo.c
> > @@ -3033,8 +3033,13 @@ void xe_bo_put_commit(struct llist_head
> > *deferred)
> > if (!freed)
> > return;
> >
> > - llist_for_each_entry_safe(bo, next, freed, freed)
> > + llist_for_each_entry_safe(bo, next, freed, freed) {
> > + struct xe_vm *vm = bo->vm;
> > +
> > drm_gem_object_free(&bo->ttm.base.refcount);
> > + if (bo->flags & XE_BO_FLAG_PUT_VM_ASYNC)
> > + xe_vm_put(vm);
> > + }
> > }
> >
> > static void xe_bo_dev_work_func(struct work_struct *work)
> > diff --git a/drivers/gpu/drm/xe/xe_bo.h b/drivers/gpu/drm/xe/xe_bo.h
> > index 02ada1fb8a23..967b1fe92560 100644
> > --- a/drivers/gpu/drm/xe/xe_bo.h
> > +++ b/drivers/gpu/drm/xe/xe_bo.h
> > @@ -46,6 +46,7 @@
> > #define XE_BO_FLAG_GGTT2 BIT(22)
> > #define XE_BO_FLAG_GGTT3 BIT(23)
> > #define XE_BO_FLAG_CPU_ADDR_MIRROR BIT(24)
> > +#define XE_BO_FLAG_PUT_VM_ASYNC BIT(25)
> >
> > /* this one is trigger internally only */
> > #define XE_BO_FLAG_INTERNAL_TEST BIT(30)
> > @@ -319,6 +320,7 @@ void __xe_bo_release_dummy(struct kref *kref);
> > * @bo: The bo to put.
> > * @deferred: List to which to add the buffer object if we cannot
> > put, or
> > * NULL if the function is to put unconditionally.
> > + * @added: BO was added to deferred list
> > *
> > * Since the final freeing of an object includes both sleeping and
> > (!)
> > * memory allocation in the dma_resv individualization, it's not ok
> > @@ -338,7 +340,8 @@ void __xe_bo_release_dummy(struct kref *kref);
> > * false otherwise.
> > */
> > static inline bool
> > -xe_bo_put_deferred(struct xe_bo *bo, struct llist_head *deferred)
> > +xe_bo_put_deferred(struct xe_bo *bo, struct llist_head *deferred,
> > + bool *added)
> > {
> > if (!deferred) {
> > xe_bo_put(bo);
> > @@ -348,6 +351,7 @@ xe_bo_put_deferred(struct xe_bo *bo, struct
> > llist_head *deferred)
> > if (!kref_put(&bo->ttm.base.refcount,
> > __xe_bo_release_dummy))
> > return false;
> >
> > + *added = true;
> > return llist_add(&bo->freed, deferred);
> > }
> >
> > @@ -363,8 +367,9 @@ static inline void
> > xe_bo_put_async(struct xe_bo *bo)
> > {
> > struct xe_bo_dev *bo_device = &xe_bo_device(bo)->bo_device;
> > + bool added = false;
> >
> > - if (xe_bo_put_deferred(bo, &bo_device->async_list))
> > + if (xe_bo_put_deferred(bo, &bo_device->async_list, &added))
> > schedule_work(&bo_device->async_free);
> > }
> >
> > diff --git a/drivers/gpu/drm/xe/xe_bo_types.h
> > b/drivers/gpu/drm/xe/xe_bo_types.h
> > index eb5e83c5f233..ecf42a04640a 100644
> > --- a/drivers/gpu/drm/xe/xe_bo_types.h
> > +++ b/drivers/gpu/drm/xe/xe_bo_types.h
> > @@ -70,8 +70,6 @@ struct xe_bo {
> >
> > /** @freed: List node for delayed put. */
> > struct llist_node freed;
> > - /** @update_index: Update index if PT BO */
> > - int update_index;
> > /** @created: Whether the bo has passed initial creation */
> > bool created;
> >
> > diff --git a/drivers/gpu/drm/xe/xe_drm_client.c
> > b/drivers/gpu/drm/xe/xe_drm_client.c
> > index 31f688e953d7..6f5a91ef7491 100644
> > --- a/drivers/gpu/drm/xe/xe_drm_client.c
> > +++ b/drivers/gpu/drm/xe/xe_drm_client.c
> > @@ -200,6 +200,7 @@ static void show_meminfo(struct drm_printer *p,
> > struct drm_file *file)
> > LLIST_HEAD(deferred);
> > unsigned int id;
> > u32 mem_type;
> > + bool added = false;
> >
> > client = xef->client;
> >
> > @@ -246,7 +247,7 @@ static void show_meminfo(struct drm_printer *p,
> > struct drm_file *file)
> > xe_assert(xef->xe, !list_empty(&bo-
> > >client_link));
> > }
> >
> > - xe_bo_put_deferred(bo, &deferred);
> > + xe_bo_put_deferred(bo, &deferred, &added);
> > }
> > spin_unlock(&client->bos_lock);
> >
> > diff --git a/drivers/gpu/drm/xe/xe_guc_submit.c
> > b/drivers/gpu/drm/xe/xe_guc_submit.c
> > index 2b61d017eeca..551cd21a6465 100644
> > --- a/drivers/gpu/drm/xe/xe_guc_submit.c
> > +++ b/drivers/gpu/drm/xe/xe_guc_submit.c
> > @@ -19,6 +19,7 @@
> > #include "abi/guc_klvs_abi.h"
> > #include "regs/xe_lrc_layout.h"
> > #include "xe_assert.h"
> > +#include "xe_bo.h"
> > #include "xe_devcoredump.h"
> > #include "xe_device.h"
> > #include "xe_exec_queue.h"
> > @@ -38,8 +39,10 @@
> > #include "xe_lrc.h"
> > #include "xe_macros.h"
> > #include "xe_map.h"
> > +#include "xe_migrate.h"
> > #include "xe_mocs.h"
> > #include "xe_pm.h"
> > +#include "xe_pt.h"
> > #include "xe_ring_ops_types.h"
> > #include "xe_sched_job.h"
> > #include "xe_trace.h"
> > @@ -745,6 +748,20 @@ static void submit_exec_queue(struct
> > xe_exec_queue *q)
> > }
> > }
> >
> > +static bool is_pt_job(struct xe_sched_job *job)
> > +{
> > + return job->is_pt_job;
> > +}
> > +
> > +static void run_pt_job(struct xe_sched_job *job)
> > +{
> > + __xe_migrate_update_pgtables_cpu(job->pt_update[0].vm,
> > + job->pt_update[0].tile,
> > + job->pt_update[0].ops,
> > + job-
> > >pt_update[0].pt_job_ops->ops,
> > + job-
> > >pt_update[0].pt_job_ops->current_op);
> > +}
> > +
> > static struct dma_fence *
> > guc_exec_queue_run_job(struct drm_sched_job *drm_job)
> > {
> > @@ -760,14 +777,21 @@ guc_exec_queue_run_job(struct drm_sched_job
> > *drm_job)
> > trace_xe_sched_job_run(job);
> >
> > if (!exec_queue_killed_or_banned_or_wedged(q) &&
> > !xe_sched_job_is_error(job)) {
> > - if (!exec_queue_registered(q))
> > - register_exec_queue(q);
> > - if (!lr) /* LR jobs are emitted in the exec
> > IOCTL */
> > - q->ring_ops->emit_job(job);
> > - submit_exec_queue(q);
> > + if (is_pt_job(job)) {
> > + run_pt_job(job);
> > + } else {
> > + if (!exec_queue_registered(q))
> > + register_exec_queue(q);
> > + if (!lr) /* LR jobs are emitted in
> > the exec IOCTL */
> > + q->ring_ops->emit_job(job);
> > + submit_exec_queue(q);
> > + }
> > }
> >
> > - if (lr) {
> > + if (is_pt_job(job)) {
> > + xe_pt_job_ops_put(job->pt_update[0].pt_job_ops);
> > + dma_fence_put(job->fence); /* Drop ref from
> > xe_sched_job_arm */
> > + } else if (lr) {
> > xe_sched_job_set_error(job, -EOPNOTSUPP);
> > dma_fence_put(job->fence); /* Drop ref from
> > xe_sched_job_arm */
> > } else {
> > diff --git a/drivers/gpu/drm/xe/xe_migrate.c
> > b/drivers/gpu/drm/xe/xe_migrate.c
> > index 9084f5cbc02d..e444f3fae97c 100644
> > --- a/drivers/gpu/drm/xe/xe_migrate.c
> > +++ b/drivers/gpu/drm/xe/xe_migrate.c
> > @@ -58,18 +58,12 @@ struct xe_migrate {
> > * Protected by @job_mutex.
> > */
> > struct dma_fence *fence;
> > - /**
> > - * @vm_update_sa: For integrated, used to suballocate page-
> > tables
> > - * out of the pt_bo.
> > - */
> > - struct drm_suballoc_manager vm_update_sa;
> > /** @min_chunk_size: For dgfx, Minimum chunk size */
> > u64 min_chunk_size;
> > };
> >
> > #define MAX_PREEMPTDISABLE_TRANSFER SZ_8M /* Around 1ms. */
> > #define MAX_CCS_LIMITED_TRANSFER SZ_4M /* XE_PAGE_SIZE *
> > (FIELD_MAX(XE2_CCS_SIZE_MASK) + 1) */
> > -#define NUM_KERNEL_PDE 15
> > #define NUM_PT_SLOTS 32
> > #define LEVEL0_PAGE_TABLE_ENCODE_SIZE SZ_2M
> > #define MAX_NUM_PTE 512
> > @@ -107,7 +101,6 @@ static void xe_migrate_fini(void *arg)
> >
> > dma_fence_put(m->fence);
> > xe_bo_put(m->pt_bo);
> > - drm_suballoc_manager_fini(&m->vm_update_sa);
> > mutex_destroy(&m->job_mutex);
> > xe_vm_close_and_put(m->q->vm);
> > xe_exec_queue_put(m->q);
> > @@ -199,8 +192,6 @@ static int xe_migrate_prepare_vm(struct xe_tile
> > *tile, struct xe_migrate *m,
> > BUILD_BUG_ON(NUM_PT_SLOTS > SZ_2M/XE_PAGE_SIZE);
> > /* Must be a multiple of 64K to support all platforms */
> > BUILD_BUG_ON(NUM_PT_SLOTS * XE_PAGE_SIZE % SZ_64K);
> > - /* And one slot reserved for the 4KiB page table updates */
> > - BUILD_BUG_ON(!(NUM_KERNEL_PDE & 1));
> >
> > /* Need to be sure everything fits in the first PT, or
> > create more */
> > xe_tile_assert(tile, m->batch_base_ofs + batch->size <
> > SZ_2M);
> > @@ -333,8 +324,6 @@ static int xe_migrate_prepare_vm(struct xe_tile
> > *tile, struct xe_migrate *m,
> > /*
> > * Example layout created above, with root level = 3:
> > * [PT0...PT7]: kernel PT's for copy/clear; 64 or 4KiB PTE's
> > - * [PT8]: Kernel PT for VM_BIND, 4 KiB PTE's
> > - * [PT9...PT26]: Userspace PT's for VM_BIND, 4 KiB PTE's
> > * [PT27 = PDE 0] [PT28 = PDE 1] [PT29 = PDE 2] [PT30 & PT31
> > = 2M vram identity map]
> > *
> > * This makes the lowest part of the VM point to the
> > pagetables.
> > @@ -342,19 +331,10 @@ static int xe_migrate_prepare_vm(struct xe_tile
> > *tile, struct xe_migrate *m,
> > * and flushes, other parts of the VM can be used either for
> > copying and
> > * clearing.
> > *
> > - * For performance, the kernel reserves PDE's, so about 20
> > are left
> > - * for async VM updates.
> > - *
> > * To make it easier to work, each scratch PT is put in slot
> > (1 + PT #)
> > * everywhere, this allows lockless updates to scratch pages
> > by using
> > * the different addresses in VM.
> > */
> > -#define NUM_VMUSA_UNIT_PER_PAGE 32
> > -#define VM_SA_UPDATE_UNIT_SIZE (XE_PAGE_SIZE /
> > NUM_VMUSA_UNIT_PER_PAGE)
> > -#define NUM_VMUSA_WRITES_PER_UNIT (VM_SA_UPDATE_UNIT_SIZE /
> > sizeof(u64))
> > - drm_suballoc_manager_init(&m->vm_update_sa,
> > - (size_t)(map_ofs / XE_PAGE_SIZE -
> > NUM_KERNEL_PDE) *
> > - NUM_VMUSA_UNIT_PER_PAGE, 0);
> >
> > m->pt_bo = bo;
> > return 0;
> > @@ -1193,56 +1173,6 @@ struct dma_fence *xe_migrate_clear(struct
> > xe_migrate *m,
> > return fence;
> > }
> >
> > -static void write_pgtable(struct xe_tile *tile, struct xe_bb *bb,
> > u64 ppgtt_ofs,
> > - const struct xe_vm_pgtable_update_op
> > *pt_op,
> > - const struct xe_vm_pgtable_update *update,
> > - struct xe_migrate_pt_update *pt_update)
> > -{
> > - const struct xe_migrate_pt_update_ops *ops = pt_update->ops;
> > - struct xe_vm *vm = pt_update->vops->vm;
> > - u32 chunk;
> > - u32 ofs = update->ofs, size = update->qwords;
> > -
> > - /*
> > - * If we have 512 entries (max), we would populate it
> > ourselves,
> > - * and update the PDE above it to the new pointer.
> > - * The only time this can only happen if we have to update
> > the top
> > - * PDE. This requires a BO that is almost vm->size big.
> > - *
> > - * This shouldn't be possible in practice.. might change
> > when 16K
> > - * pages are used. Hence the assert.
> > - */
> > - xe_tile_assert(tile, update->qwords < MAX_NUM_PTE);
> > - if (!ppgtt_ofs)
> > - ppgtt_ofs = xe_migrate_vram_ofs(tile_to_xe(tile),
> > - xe_bo_addr(update-
> > >pt_bo, 0,
> > -
> > XE_PAGE_SIZE), false);
> > -
> > - do {
> > - u64 addr = ppgtt_ofs + ofs * 8;
> > -
> > - chunk = min(size, MAX_PTE_PER_SDI);
> > -
> > - /* Ensure populatefn can do memset64 by aligning bb-
> > >cs */
> > - if (!(bb->len & 1))
> > - bb->cs[bb->len++] = MI_NOOP;
> > -
> > - bb->cs[bb->len++] = MI_STORE_DATA_IMM |
> > MI_SDI_NUM_QW(chunk);
> > - bb->cs[bb->len++] = lower_32_bits(addr);
> > - bb->cs[bb->len++] = upper_32_bits(addr);
> > - if (pt_op->bind)
> > - ops->populate(tile, NULL, bb->cs + bb->len,
> > - ofs, chunk, update);
> > - else
> > - ops->clear(vm, tile, NULL, bb->cs + bb->len,
> > - ofs, chunk, update);
> > -
> > - bb->len += chunk * 2;
> > - ofs += chunk;
> > - size -= chunk;
> > - } while (size);
> > -}
> > -
> > struct xe_vm *xe_migrate_get_vm(struct xe_migrate *m)
> > {
> > return xe_vm_get(m->q->vm);
> > @@ -1258,7 +1188,18 @@ struct migrate_test_params {
> > container_of(_priv, struct migrate_test_params, base)
> > #endif
> >
> > -static void
> > +/**
> > + * __xe_migrate_update_pgtables_cpu() - Update a VM's PTEs via the
> > CPU
> > + * @vm: The VM being updated
> > + * @tile: The tile being updated
> > + * @ops: The migrate PT update ops
> > + * @pt_ops: The VM PT update ops
> > + * @num_ops: The number of VM PT update ops
> > + *
> > + * Execute the VM PT update ops array which results in a VM's PTEs
> > being updated
> > + * via the CPU.
> > + */
> > +void
> > __xe_migrate_update_pgtables_cpu(struct xe_vm *vm, struct xe_tile
> > *tile,
> > const struct
> > xe_migrate_pt_update_ops *ops,
> > struct xe_vm_pgtable_update_op
> > *pt_op,
> > @@ -1314,7 +1255,7 @@ xe_migrate_update_pgtables_cpu(struct
> > xe_migrate *m,
> > }
> >
> > __xe_migrate_update_pgtables_cpu(vm, m->tile, ops,
> > - pt_update_ops->ops,
> > + pt_update_ops->pt_job_ops-
> > >ops,
> > pt_update_ops->num_ops);
> >
> > return dma_fence_get_stub();
> > @@ -1327,161 +1268,19 @@ __xe_migrate_update_pgtables(struct
> > xe_migrate *m,
> > {
> > const struct xe_migrate_pt_update_ops *ops = pt_update->ops;
> > struct xe_tile *tile = m->tile;
> > - struct xe_gt *gt = tile->primary_gt;
> > - struct xe_device *xe = tile_to_xe(tile);
> > struct xe_sched_job *job;
> > struct dma_fence *fence;
> > - struct drm_suballoc *sa_bo = NULL;
> > - struct xe_bb *bb;
> > - u32 i, j, batch_size = 0, ppgtt_ofs, update_idx, page_ofs =
> > 0;
> > - u32 num_updates = 0, current_update = 0;
> > - u64 addr;
> > - int err = 0;
> > bool is_migrate = pt_update_ops->q == m->q;
> > - bool usm = is_migrate && xe->info.has_usm;
> > -
> > - for (i = 0; i < pt_update_ops->num_ops; ++i) {
> > - struct xe_vm_pgtable_update_op *pt_op =
> > &pt_update_ops->ops[i];
> > - struct xe_vm_pgtable_update *updates = pt_op-
> > >entries;
> > -
> > - num_updates += pt_op->num_entries;
> > - for (j = 0; j < pt_op->num_entries; ++j) {
> > - u32 num_cmds =
> > DIV_ROUND_UP(updates[j].qwords,
> > -
> > MAX_PTE_PER_SDI);
> > -
> > - /* align noop + MI_STORE_DATA_IMM cmd prefix
> > */
> > - batch_size += 4 * num_cmds +
> > updates[j].qwords * 2;
> > - }
> > - }
> > -
> > - /* fixed + PTE entries */
> > - if (IS_DGFX(xe))
> > - batch_size += 2;
> > - else
> > - batch_size += 6 * (num_updates / MAX_PTE_PER_SDI +
> > 1) +
> > - num_updates * 2;
> > -
> > - bb = xe_bb_new(gt, batch_size, usm);
> > - if (IS_ERR(bb))
> > - return ERR_CAST(bb);
> > -
> > - /* For sysmem PTE's, need to map them in our hole.. */
> > - if (!IS_DGFX(xe)) {
> > - u16 pat_index = xe->pat.idx[XE_CACHE_WB];
> > - u32 ptes, ofs;
> > -
> > - ppgtt_ofs = NUM_KERNEL_PDE - 1;
> > - if (!is_migrate) {
> > - u32 num_units = DIV_ROUND_UP(num_updates,
> > -
> > NUM_VMUSA_WRITES_PER_UNIT);
> > -
> > - if (num_units > m->vm_update_sa.size) {
> > - err = -ENOBUFS;
> > - goto err_bb;
> > - }
> > - sa_bo = drm_suballoc_new(&m->vm_update_sa,
> > num_units,
> > - GFP_KERNEL, true,
> > 0);
> > - if (IS_ERR(sa_bo)) {
> > - err = PTR_ERR(sa_bo);
> > - goto err_bb;
> > - }
> > -
> > - ppgtt_ofs = NUM_KERNEL_PDE +
> > - (drm_suballoc_soffset(sa_bo) /
> > - NUM_VMUSA_UNIT_PER_PAGE);
> > - page_ofs = (drm_suballoc_soffset(sa_bo) %
> > - NUM_VMUSA_UNIT_PER_PAGE) *
> > - VM_SA_UPDATE_UNIT_SIZE;
> > - }
> > -
> > - /* Map our PT's to gtt */
> > - i = 0;
> > - j = 0;
> > - ptes = num_updates;
> > - ofs = ppgtt_ofs * XE_PAGE_SIZE + page_ofs;
> > - while (ptes) {
> > - u32 chunk = min(MAX_PTE_PER_SDI, ptes);
> > - u32 idx = 0;
> > -
> > - bb->cs[bb->len++] = MI_STORE_DATA_IMM |
> > - MI_SDI_NUM_QW(chunk);
> > - bb->cs[bb->len++] = ofs;
> > - bb->cs[bb->len++] = 0; /* upper_32_bits */
> > -
> > - for (; i < pt_update_ops->num_ops; ++i) {
> > - struct xe_vm_pgtable_update_op
> > *pt_op =
> > - &pt_update_ops->ops[i];
> > - struct xe_vm_pgtable_update *updates
> > = pt_op->entries;
> > -
> > - for (; j < pt_op->num_entries; ++j,
> > ++current_update, ++idx) {
> > - struct xe_vm *vm =
> > pt_update->vops->vm;
> > - struct xe_bo *pt_bo =
> > updates[j].pt_bo;
> > -
> > - if (idx == chunk)
> > - goto next_cmd;
> > -
> > - xe_tile_assert(tile, pt_bo-
> > >size == SZ_4K);
> > -
> > - /* Map a PT at most once */
> > - if (pt_bo->update_index < 0)
> > - pt_bo->update_index
> > = current_update;
> > -
> > - addr = vm->pt_ops-
> > >pte_encode_bo(pt_bo, 0,
> > -
> > pat_index, 0);
> > - bb->cs[bb->len++] =
> > lower_32_bits(addr);
> > - bb->cs[bb->len++] =
> > upper_32_bits(addr);
> > - }
> > -
> > - j = 0;
> > - }
> > -
> > -next_cmd:
> > - ptes -= chunk;
> > - ofs += chunk * sizeof(u64);
> > - }
> > -
> > - bb->cs[bb->len++] = MI_BATCH_BUFFER_END;
> > - update_idx = bb->len;
> > -
> > - addr = xe_migrate_vm_addr(ppgtt_ofs, 0) +
> > - (page_ofs / sizeof(u64)) * XE_PAGE_SIZE;
> > - for (i = 0; i < pt_update_ops->num_ops; ++i) {
> > - struct xe_vm_pgtable_update_op *pt_op =
> > - &pt_update_ops->ops[i];
> > - struct xe_vm_pgtable_update *updates =
> > pt_op->entries;
> > -
> > - for (j = 0; j < pt_op->num_entries; ++j) {
> > - struct xe_bo *pt_bo =
> > updates[j].pt_bo;
> > -
> > - write_pgtable(tile, bb, addr +
> > - pt_bo->update_index *
> > XE_PAGE_SIZE,
> > - pt_op, &updates[j],
> > pt_update);
> > - }
> > - }
> > - } else {
> > - /* phys pages, no preamble required */
> > - bb->cs[bb->len++] = MI_BATCH_BUFFER_END;
> > - update_idx = bb->len;
> > -
> > - for (i = 0; i < pt_update_ops->num_ops; ++i) {
> > - struct xe_vm_pgtable_update_op *pt_op =
> > - &pt_update_ops->ops[i];
> > - struct xe_vm_pgtable_update *updates =
> > pt_op->entries;
> > -
> > - for (j = 0; j < pt_op->num_entries; ++j)
> > - write_pgtable(tile, bb, 0, pt_op,
> > &updates[j],
> > - pt_update);
> > - }
> > - }
> > + int err;
> >
> > - job = xe_bb_create_migration_job(pt_update_ops->q, bb,
> > - xe_migrate_batch_base(m,
> > usm),
> > - update_idx);
> > + job = xe_sched_job_create(pt_update_ops->q, NULL);
> > if (IS_ERR(job)) {
> > err = PTR_ERR(job);
> > - goto err_sa;
> > + goto err_out;
> > }
> >
> > + xe_tile_assert(tile, job->is_pt_job);
> > +
> > if (ops->pre_commit) {
> > pt_update->job = job;
> > err = ops->pre_commit(pt_update);
> > @@ -1491,6 +1290,12 @@ __xe_migrate_update_pgtables(struct xe_migrate
> > *m,
> > if (is_migrate)
> > mutex_lock(&m->job_mutex);
> >
> > + job->pt_update[0].vm = pt_update->vops->vm;
> > + job->pt_update[0].tile = tile;
> > + job->pt_update[0].ops = ops;
> > + job->pt_update[0].pt_job_ops =
> > + xe_pt_job_ops_get(pt_update_ops->pt_job_ops);
> > +
> > xe_sched_job_arm(job);
> > fence = dma_fence_get(&job->drm.s_fence->finished);
> > xe_sched_job_push(job);
> > @@ -1498,17 +1303,11 @@ __xe_migrate_update_pgtables(struct
> > xe_migrate *m,
> > if (is_migrate)
> > mutex_unlock(&m->job_mutex);
> >
> > - xe_bb_free(bb, fence);
> > - drm_suballoc_free(sa_bo, fence);
> > -
> > return fence;
> >
> > err_job:
> > xe_sched_job_put(job);
> > -err_sa:
> > - drm_suballoc_free(sa_bo, NULL);
> > -err_bb:
> > - xe_bb_free(bb, NULL);
> > +err_out:
> > return ERR_PTR(err);
> > }
> >
> > diff --git a/drivers/gpu/drm/xe/xe_migrate.h
> > b/drivers/gpu/drm/xe/xe_migrate.h
> > index b064455b604e..0986ffdd8d9a 100644
> > --- a/drivers/gpu/drm/xe/xe_migrate.h
> > +++ b/drivers/gpu/drm/xe/xe_migrate.h
> > @@ -22,6 +22,7 @@ struct xe_pt;
> > struct xe_tile;
> > struct xe_vm;
> > struct xe_vm_pgtable_update;
> > +struct xe_vm_pgtable_update_op;
> > struct xe_vma;
> >
> > /**
> > @@ -125,6 +126,11 @@ struct dma_fence *xe_migrate_clear(struct
> > xe_migrate *m,
> >
> > struct xe_vm *xe_migrate_get_vm(struct xe_migrate *m);
> >
> > +void __xe_migrate_update_pgtables_cpu(struct xe_vm *vm, struct
> > xe_tile *tile,
> > + const struct
> > xe_migrate_pt_update_ops *ops,
> > + struct xe_vm_pgtable_update_op
> > *pt_op,
> > + int num_ops);
> > +
> > struct dma_fence *
> > xe_migrate_update_pgtables(struct xe_migrate *m,
> > struct xe_migrate_pt_update *pt_update);
> > diff --git a/drivers/gpu/drm/xe/xe_pt.c b/drivers/gpu/drm/xe/xe_pt.c
> > index db1c363a65d5..1ad31f444b79 100644
> > --- a/drivers/gpu/drm/xe/xe_pt.c
> > +++ b/drivers/gpu/drm/xe/xe_pt.c
> > @@ -200,7 +200,9 @@ unsigned int xe_pt_shift(unsigned int level)
> > * and finally frees @pt. TODO: Can we remove the @flags argument?
> > */
> > void xe_pt_destroy(struct xe_pt *pt, u32 flags, struct llist_head
> > *deferred)
> > +
> > {
> > + bool added = false;
> > int i;
> >
> > if (!pt)
> > @@ -208,7 +210,18 @@ void xe_pt_destroy(struct xe_pt *pt, u32 flags,
> > struct llist_head *deferred)
> >
> > XE_WARN_ON(!list_empty(&pt->bo->ttm.base.gpuva.list));
> > xe_bo_unpin(pt->bo);
> > - xe_bo_put_deferred(pt->bo, deferred);
> > + xe_bo_put_deferred(pt->bo, deferred, &added);
> > + if (added) {
> > + /*
> > + * We need the VM present until the BO is destroyed
> > as it shares
> > + * a dma-resv and BO destroy is async. Reinit BO
> > refcount so
> > + * xe_bo_put_async can be used when the PT job ops
> > refcount goes
> > + * to zero.
> > + */
> > + xe_vm_get(pt->bo->vm);
> > + pt->bo->flags |= XE_BO_FLAG_PUT_VM_ASYNC;
> > + kref_init(&pt->bo->ttm.base.refcount);
> > + }
> >
> > if (pt->level > 0 && pt->num_live) {
> > struct xe_pt_dir *pt_dir = as_xe_pt_dir(pt);
> > @@ -361,7 +374,7 @@ xe_pt_new_shared(struct xe_walk_update *wupd,
> > struct xe_pt *parent,
> > entry->pt = parent;
> > entry->flags = 0;
> > entry->qwords = 0;
> > - entry->pt_bo->update_index = -1;
> > + entry->level = parent->level;
> >
> > if (alloc_entries) {
> > entry->pt_entries = kmalloc_array(XE_PDES,
> > @@ -1739,7 +1752,7 @@ xe_migrate_clear_pgtable_callback(struct xe_vm
> > *vm, struct xe_tile *tile,
> > u32 qword_ofs, u32 num_qwords,
> > const struct xe_vm_pgtable_update
> > *update)
> > {
> > - u64 empty = __xe_pt_empty_pte(tile, vm, update->pt->level);
> > + u64 empty = __xe_pt_empty_pte(tile, vm, update->level);
> > int i;
> >
> > if (map && map->is_iomem)
> > @@ -1805,13 +1818,20 @@ xe_pt_commit_prepare_unbind(struct xe_vma
> > *vma,
> > }
> > }
> >
> > +static struct xe_vm_pgtable_update_op *
> > +to_pt_op(struct xe_vm_pgtable_update_ops *pt_update_ops, u32
> > current_op)
> > +{
> > + return &pt_update_ops->pt_job_ops->ops[current_op];
> > +}
> > +
> > static void
> > xe_pt_update_ops_rfence_interval(struct xe_vm_pgtable_update_ops
> > *pt_update_ops,
> > u64 start, u64 end)
> > {
> > u64 last;
> > - u32 current_op = pt_update_ops->current_op;
> > - struct xe_vm_pgtable_update_op *pt_op = &pt_update_ops-
> > >ops[current_op];
> > + u32 current_op = pt_update_ops->pt_job_ops->current_op;
> > + struct xe_vm_pgtable_update_op *pt_op =
> > + to_pt_op(pt_update_ops, current_op);
> > int i, level = 0;
> >
> > for (i = 0; i < pt_op->num_entries; i++) {
> > @@ -1846,8 +1866,9 @@ static int bind_op_prepare(struct xe_vm *vm,
> > struct xe_tile *tile,
> > struct xe_vm_pgtable_update_ops
> > *pt_update_ops,
> > struct xe_vma *vma, bool
> > invalidate_on_bind)
> > {
> > - u32 current_op = pt_update_ops->current_op;
> > - struct xe_vm_pgtable_update_op *pt_op = &pt_update_ops-
> > >ops[current_op];
> > + u32 current_op = pt_update_ops->pt_job_ops->current_op;
> > + struct xe_vm_pgtable_update_op *pt_op =
> > + to_pt_op(pt_update_ops, current_op);
> > int err;
> >
> > xe_tile_assert(tile, !xe_vma_is_cpu_addr_mirror(vma));
> > @@ -1876,7 +1897,7 @@ static int bind_op_prepare(struct xe_vm *vm,
> > struct xe_tile *tile,
> > xe_pt_update_ops_rfence_interval(pt_update_ops,
> > xe_vma_start(vma),
> > xe_vma_end(vma));
> > - ++pt_update_ops->current_op;
> > + ++pt_update_ops->pt_job_ops->current_op;
> > pt_update_ops->needs_userptr_lock |=
> > xe_vma_is_userptr(vma);
> >
> > /*
> > @@ -1913,8 +1934,9 @@ static int bind_range_prepare(struct xe_vm *vm,
> > struct xe_tile *tile,
> > struct xe_vm_pgtable_update_ops
> > *pt_update_ops,
> > struct xe_vma *vma, struct
> > xe_svm_range *range)
> > {
> > - u32 current_op = pt_update_ops->current_op;
> > - struct xe_vm_pgtable_update_op *pt_op = &pt_update_ops-
> > >ops[current_op];
> > + u32 current_op = pt_update_ops->pt_job_ops->current_op;
> > + struct xe_vm_pgtable_update_op *pt_op =
> > + to_pt_op(pt_update_ops, current_op);
> > int err;
> >
> > xe_tile_assert(tile, xe_vma_is_cpu_addr_mirror(vma));
> > @@ -1938,7 +1960,7 @@ static int bind_range_prepare(struct xe_vm *vm,
> > struct xe_tile *tile,
> > xe_pt_update_ops_rfence_interval(pt_update_ops,
> > range-
> > >base.itree.start,
> > range-
> > >base.itree.last + 1);
> > - ++pt_update_ops->current_op;
> > + ++pt_update_ops->pt_job_ops->current_op;
> > pt_update_ops->needs_svm_lock = true;
> >
> > pt_op->vma = vma;
> > @@ -1955,8 +1977,9 @@ static int unbind_op_prepare(struct xe_tile
> > *tile,
> > struct xe_vm_pgtable_update_ops
> > *pt_update_ops,
> > struct xe_vma *vma)
> > {
> > - u32 current_op = pt_update_ops->current_op;
> > - struct xe_vm_pgtable_update_op *pt_op = &pt_update_ops-
> > >ops[current_op];
> > + u32 current_op = pt_update_ops->pt_job_ops->current_op;
> > + struct xe_vm_pgtable_update_op *pt_op =
> > + to_pt_op(pt_update_ops, current_op);
> > int err;
> >
> > if (!((vma->tile_present | vma->tile_staged) & BIT(tile-
> > >id)))
> > @@ -1984,7 +2007,7 @@ static int unbind_op_prepare(struct xe_tile
> > *tile,
> > pt_op->num_entries, false);
> > xe_pt_update_ops_rfence_interval(pt_update_ops,
> > xe_vma_start(vma),
> > xe_vma_end(vma));
> > - ++pt_update_ops->current_op;
> > + ++pt_update_ops->pt_job_ops->current_op;
> > pt_update_ops->needs_userptr_lock |= xe_vma_is_userptr(vma);
> > pt_update_ops->needs_invalidation = true;
> >
> > @@ -1998,8 +2021,9 @@ static int unbind_range_prepare(struct xe_vm
> > *vm,
> > struct xe_vm_pgtable_update_ops
> > *pt_update_ops,
> > struct xe_svm_range *range)
> > {
> > - u32 current_op = pt_update_ops->current_op;
> > - struct xe_vm_pgtable_update_op *pt_op = &pt_update_ops-
> > >ops[current_op];
> > + u32 current_op = pt_update_ops->pt_job_ops->current_op;
> > + struct xe_vm_pgtable_update_op *pt_op =
> > + to_pt_op(pt_update_ops, current_op);
> >
> > if (!(range->tile_present & BIT(tile->id)))
> > return 0;
> > @@ -2019,7 +2043,7 @@ static int unbind_range_prepare(struct xe_vm
> > *vm,
> > pt_op->num_entries, false);
> > xe_pt_update_ops_rfence_interval(pt_update_ops, range-
> > >base.itree.start,
> > range->base.itree.last +
> > 1);
> > - ++pt_update_ops->current_op;
> > + ++pt_update_ops->pt_job_ops->current_op;
> > pt_update_ops->needs_svm_lock = true;
> > pt_update_ops->needs_invalidation = true;
> >
> > @@ -2122,7 +2146,6 @@ static int op_prepare(struct xe_vm *vm,
> > static void
> > xe_pt_update_ops_init(struct xe_vm_pgtable_update_ops
> > *pt_update_ops)
> > {
> > - init_llist_head(&pt_update_ops->deferred);
> > pt_update_ops->start = ~0x0ull;
> > pt_update_ops->last = 0x0ull;
> > }
> > @@ -2163,7 +2186,7 @@ int xe_pt_update_ops_prepare(struct xe_tile
> > *tile, struct xe_vma_ops *vops)
> > return err;
> > }
> >
> > - xe_tile_assert(tile, pt_update_ops->current_op <=
> > + xe_tile_assert(tile, pt_update_ops->pt_job_ops->current_op
> > <=
> > pt_update_ops->num_ops);
> >
> > #ifdef TEST_VM_OPS_ERROR
> > @@ -2396,7 +2419,7 @@ xe_pt_update_ops_run(struct xe_tile *tile,
> > struct xe_vma_ops *vops)
> > lockdep_assert_held(&vm->lock);
> > xe_vm_assert_held(vm);
> >
> > - if (!pt_update_ops->current_op) {
> > + if (!pt_update_ops->pt_job_ops->current_op) {
> > xe_tile_assert(tile, xe_vm_in_fault_mode(vm));
> >
> > return dma_fence_get_stub();
> > @@ -2445,12 +2468,16 @@ xe_pt_update_ops_run(struct xe_tile *tile,
> > struct xe_vma_ops *vops)
> > goto free_rfence;
> > }
> >
> > - /* Point of no return - VM killed if failure after this */
> > - for (i = 0; i < pt_update_ops->current_op; ++i) {
> > - struct xe_vm_pgtable_update_op *pt_op =
> > &pt_update_ops->ops[i];
> > + /*
> > + * Point of no return - VM killed if failure after this
> > + */
> > + for (i = 0; i < pt_update_ops->pt_job_ops->current_op; ++i)
> > {
> > + struct xe_vm_pgtable_update_op *pt_op =
> > + to_pt_op(pt_update_ops, i);
> >
> > xe_pt_commit(pt_op->vma, pt_op->entries,
> > - pt_op->num_entries, &pt_update_ops-
> > >deferred);
> > + pt_op->num_entries,
> > + &pt_update_ops->pt_job_ops->deferred);
> > pt_op->vma = NULL; /* skip in
> > xe_pt_update_ops_abort */
> > }
> >
> > @@ -2530,27 +2557,19 @@ xe_pt_update_ops_run(struct xe_tile *tile,
> > struct xe_vma_ops *vops)
> > ALLOW_ERROR_INJECTION(xe_pt_update_ops_run, ERRNO);
> >
> > /**
> > - * xe_pt_update_ops_fini() - Finish PT update operations
> > - * @tile: Tile of PT update operations
> > - * @vops: VMA operations
> > + * xe_pt_update_ops_free() - Free PT update operations
> > + * @pt_op: Array of PT update operations
> > + * @num_ops: Number of PT update operations
> > *
> > - * Finish PT update operations by committing to destroy page table
> > memory
> > + * Free PT update operations
> > */
> > -void xe_pt_update_ops_fini(struct xe_tile *tile, struct xe_vma_ops
> > *vops)
> > +static void xe_pt_update_ops_free(struct xe_vm_pgtable_update_op
> > *pt_op,
> > + u32 num_ops)
> > {
> > - struct xe_vm_pgtable_update_ops *pt_update_ops =
> > - &vops->pt_update_ops[tile->id];
> > - int i;
> > -
> > - lockdep_assert_held(&vops->vm->lock);
> > - xe_vm_assert_held(vops->vm);
> > -
> > - for (i = 0; i < pt_update_ops->current_op; ++i) {
> > - struct xe_vm_pgtable_update_op *pt_op =
> > &pt_update_ops->ops[i];
> > + u32 i;
> >
> > + for (i = 0; i < num_ops; ++i, ++pt_op)
> > xe_pt_free_bind(pt_op->entries, pt_op->num_entries);
> > - }
> > - xe_bo_put_commit(&vops->pt_update_ops[tile->id].deferred);
> > }
> >
> > /**
> > @@ -2571,9 +2590,9 @@ void xe_pt_update_ops_abort(struct xe_tile
> > *tile, struct xe_vma_ops *vops)
> >
> > for (i = pt_update_ops->num_ops - 1; i >= 0; --i) {
> > struct xe_vm_pgtable_update_op *pt_op =
> > - &pt_update_ops->ops[i];
> > + to_pt_op(pt_update_ops, i);
> >
> > - if (!pt_op->vma || i >= pt_update_ops->current_op)
> > + if (!pt_op->vma || i >= pt_update_ops->pt_job_ops-
> > >current_op)
> > continue;
> >
> > if (pt_op->bind)
> > @@ -2584,6 +2603,89 @@ void xe_pt_update_ops_abort(struct xe_tile
> > *tile, struct xe_vma_ops *vops)
> > xe_pt_abort_unbind(pt_op->vma, pt_op-
> > >entries,
> > pt_op->num_entries);
> > }
> > +}
> > +
> > +/**
> > + * xe_pt_job_ops_alloc() - Allocate PT job ops
> > + * @num_ops: Number of VM PT update ops
> > + *
> > + * Allocate PT job ops and internal array of VM PT update ops.
> > + *
> > + * Return: Pointer to PT job ops or NULL
> > + */
> > +struct xe_pt_job_ops *xe_pt_job_ops_alloc(u32 num_ops)
> > +{
> > + struct xe_pt_job_ops *pt_job_ops;
> > +
> > + pt_job_ops = kmalloc(sizeof(*pt_job_ops), GFP_KERNEL);
> > + if (!pt_job_ops)
> > + return NULL;
> > +
> > + pt_job_ops->ops = kvmalloc_array(num_ops,
> > sizeof(*pt_job_ops->ops),
> > + GFP_KERNEL);
> > + if (!pt_job_ops->ops) {
> > + kvfree(pt_job_ops);
> > + return NULL;
> > + }
> > +
> > + pt_job_ops->current_op = 0;
> > + kref_init(&pt_job_ops->refcount);
> > + init_llist_head(&pt_job_ops->deferred);
> > +
> > + return pt_job_ops;
> > +}
> > +
> > +/**
> > + * xe_pt_job_ops_get() - Get PT job ops
> > + * @pt_job_ops: PT job ops to get
> > + *
> > + * Take a reference to PT job ops
> > + *
> > + * Return: Pointer to PT job ops or NULL
> > + */
> > +struct xe_pt_job_ops *xe_pt_job_ops_get(struct xe_pt_job_ops
> > *pt_job_ops)
> > +{
> > + if (pt_job_ops)
> > + kref_get(&pt_job_ops->refcount);
> > +
> > + return pt_job_ops;
> > +}
> > +
> > +static void xe_pt_job_ops_destroy(struct kref *ref)
> > +{
> > + struct xe_pt_job_ops *pt_job_ops =
> > + container_of(ref, struct xe_pt_job_ops, refcount);
> > + struct llist_node *freed;
> > + struct xe_bo *bo, *next;
> > +
> > + xe_pt_update_ops_free(pt_job_ops->ops,
> > + pt_job_ops->current_op);
> > +
> > + freed = llist_del_all(&pt_job_ops->deferred);
> > + if (freed) {
> > + llist_for_each_entry_safe(bo, next, freed, freed)
> > + /*
> > + * If called from run_job, we are in the
> > dma-fencing
> > + * path and cannot take dma-resv locks so
> > use an async
> > + * put.
> > + */
> > + xe_bo_put_async(bo);
> > + }
> > +
> > + kvfree(pt_job_ops->ops);
> > + kfree(pt_job_ops);
> > +}
> > +
> > +/**
> > + * xe_pt_job_ops_put() - Put PT job ops
> > + * @pt_job_ops: PT job ops to put
> > + *
> > + * Drop a reference to PT job ops
> > + */
> > +void xe_pt_job_ops_put(struct xe_pt_job_ops *pt_job_ops)
> > +{
> > + if (!pt_job_ops)
> > + return;
> >
> > - xe_pt_update_ops_fini(tile, vops);
> > + kref_put(&pt_job_ops->refcount, xe_pt_job_ops_destroy);
> > }
> > diff --git a/drivers/gpu/drm/xe/xe_pt.h b/drivers/gpu/drm/xe/xe_pt.h
> > index 5ecf003d513c..c9904573db82 100644
> > --- a/drivers/gpu/drm/xe/xe_pt.h
> > +++ b/drivers/gpu/drm/xe/xe_pt.h
> > @@ -41,11 +41,14 @@ void xe_pt_clear(struct xe_device *xe, struct
> > xe_pt *pt);
> > int xe_pt_update_ops_prepare(struct xe_tile *tile, struct xe_vma_ops
> > *vops);
> > struct dma_fence *xe_pt_update_ops_run(struct xe_tile *tile,
> > struct xe_vma_ops *vops);
> > -void xe_pt_update_ops_fini(struct xe_tile *tile, struct xe_vma_ops
> > *vops);
> > void xe_pt_update_ops_abort(struct xe_tile *tile, struct xe_vma_ops
> > *vops);
> >
> > bool xe_pt_zap_ptes(struct xe_tile *tile, struct xe_vma *vma);
> > bool xe_pt_zap_ptes_range(struct xe_tile *tile, struct xe_vm *vm,
> > struct xe_svm_range *range);
> >
> > +struct xe_pt_job_ops *xe_pt_job_ops_alloc(u32 num_ops);
> > +struct xe_pt_job_ops *xe_pt_job_ops_get(struct xe_pt_job_ops
> > *pt_job_ops);
> > +void xe_pt_job_ops_put(struct xe_pt_job_ops *pt_job_ops);
> > +
> > #endif
> > diff --git a/drivers/gpu/drm/xe/xe_pt_types.h
> > b/drivers/gpu/drm/xe/xe_pt_types.h
> > index 69eab6f37cfe..33d0d20e0ac6 100644
> > --- a/drivers/gpu/drm/xe/xe_pt_types.h
> > +++ b/drivers/gpu/drm/xe/xe_pt_types.h
> > @@ -70,6 +70,9 @@ struct xe_vm_pgtable_update {
> > /** @pt_entries: Newly added pagetable entries */
> > struct xe_pt_entry *pt_entries;
> >
> > + /** @level: level of update */
> > + unsigned int level;
> > +
> > /** @flags: Target flags */
> > u32 flags;
> > };
> > @@ -88,12 +91,28 @@ struct xe_vm_pgtable_update_op {
> > bool rebind;
> > };
> >
> > -/** struct xe_vm_pgtable_update_ops: page table update operations */
> > -struct xe_vm_pgtable_update_ops {
> > - /** @ops: operations */
> > - struct xe_vm_pgtable_update_op *ops;
> > +/**
> > + * struct xe_pt_job_ops: page table update operations dynamic
> > allocation
> > + *
> > + * This is the part of struct xe_vma_ops and struct
> > xe_vm_pgtable_update_ops
> > + * which is dynamically allocated as it must be available until the bind
> > job is
> > + * complete.
> > + */
> > +struct xe_pt_job_ops {
> > + /** @current_op: current operations */
> > + u32 current_op;
> > + /** @refcount: ref count ops allocation */
> > + struct kref refcount;
> > /** @deferred: deferred list to destroy PT entries */
> > struct llist_head deferred;
> > + /** @ops: operations */
> > + struct xe_vm_pgtable_update_op *ops;
> > +};
> > +
> > +/** struct xe_vm_pgtable_update_ops: page table update operations */
> > +struct xe_vm_pgtable_update_ops {
> > + /** @pt_job_ops: PT update operations dynamic allocation*/
> > + struct xe_pt_job_ops *pt_job_ops;
> > /** @q: exec queue for PT operations */
> > struct xe_exec_queue *q;
> > /** @start: start address of ops */
> > @@ -102,8 +121,6 @@ struct xe_vm_pgtable_update_ops {
> > u64 last;
> > /** @num_ops: number of operations */
> > u32 num_ops;
> > - /** @current_op: current operations */
> > - u32 current_op;
> > /** @needs_svm_lock: Needs SVM lock */
> > bool needs_svm_lock;
> > /** @needs_userptr_lock: Needs userptr lock */
> > diff --git a/drivers/gpu/drm/xe/xe_sched_job.c
> > b/drivers/gpu/drm/xe/xe_sched_job.c
> > index d21bf8f26964..09cdd14d9ef7 100644
> > --- a/drivers/gpu/drm/xe/xe_sched_job.c
> > +++ b/drivers/gpu/drm/xe/xe_sched_job.c
> > @@ -26,19 +26,22 @@ static struct kmem_cache
> > *xe_sched_job_parallel_slab;
> >
> > int __init xe_sched_job_module_init(void)
> > {
> > + struct xe_sched_job *job;
> > + size_t size;
> > +
> > + size = struct_size(job, ptrs, 1);
> > xe_sched_job_slab =
> > - kmem_cache_create("xe_sched_job",
> > - sizeof(struct xe_sched_job) +
> > - sizeof(struct xe_job_ptrs), 0,
> > + kmem_cache_create("xe_sched_job", size, 0,
> > SLAB_HWCACHE_ALIGN, NULL);
> > if (!xe_sched_job_slab)
> > return -ENOMEM;
> >
> > + size = max_t(size_t,
> > + struct_size(job, ptrs,
> > + XE_HW_ENGINE_MAX_INSTANCE),
> > + struct_size(job, pt_update, 1));
> > xe_sched_job_parallel_slab =
> > - kmem_cache_create("xe_sched_job_parallel",
> > - sizeof(struct xe_sched_job) +
> > - sizeof(struct xe_job_ptrs) *
> > - XE_HW_ENGINE_MAX_INSTANCE, 0,
> > + kmem_cache_create("xe_sched_job_parallel", size, 0,
> > SLAB_HWCACHE_ALIGN, NULL);
> > if (!xe_sched_job_parallel_slab) {
> > kmem_cache_destroy(xe_sched_job_slab);
> > @@ -84,7 +87,7 @@ static void xe_sched_job_free_fences(struct
> > xe_sched_job *job)
> > {
> > int i;
> >
> > - for (i = 0; i < job->q->width; ++i) {
> > + for (i = 0; !job->is_pt_job && i < job->q->width; ++i) {
> > struct xe_job_ptrs *ptrs = &job->ptrs[i];
> >
> > if (ptrs->lrc_fence)
> > @@ -118,33 +121,44 @@ struct xe_sched_job *xe_sched_job_create(struct
> > xe_exec_queue *q,
> > if (err)
> > goto err_free;
> >
> > - for (i = 0; i < q->width; ++i) {
> > - struct dma_fence *fence =
> > xe_lrc_alloc_seqno_fence();
> > - struct dma_fence_chain *chain;
> > -
> > - if (IS_ERR(fence)) {
> > - err = PTR_ERR(fence);
> > - goto err_sched_job;
> > - }
> > - job->ptrs[i].lrc_fence = fence;
> > -
> > - if (i + 1 == q->width)
> > - continue;
> > -
> > - chain = dma_fence_chain_alloc();
> > - if (!chain) {
> > + if (!batch_addr) {
> > + job->fence =
> > dma_fence_allocate_private_stub(ktime_get());
> > + if (!job->fence) {
> > err = -ENOMEM;
> > goto err_sched_job;
> > }
> > - job->ptrs[i].chain_fence = chain;
> > + job->is_pt_job = true;
> > + } else {
> > + for (i = 0; i < q->width; ++i) {
> > + struct dma_fence *fence =
> > xe_lrc_alloc_seqno_fence();
> > + struct dma_fence_chain *chain;
> > +
> > + if (IS_ERR(fence)) {
> > + err = PTR_ERR(fence);
> > + goto err_sched_job;
> > + }
> > + job->ptrs[i].lrc_fence = fence;
> > +
> > + if (i + 1 == q->width)
> > + continue;
> > +
> > + chain = dma_fence_chain_alloc();
> > + if (!chain) {
> > + err = -ENOMEM;
> > + goto err_sched_job;
> > + }
> > + job->ptrs[i].chain_fence = chain;
> > + }
> > }
> >
> > - width = q->width;
> > - if (is_migration)
> > - width = 2;
> > + if (batch_addr) {
> > + width = q->width;
> > + if (is_migration)
> > + width = 2;
> >
> > - for (i = 0; i < width; ++i)
> > - job->ptrs[i].batch_addr = batch_addr[i];
> > + for (i = 0; i < width; ++i)
> > + job->ptrs[i].batch_addr = batch_addr[i];
> > + }
> >
> > xe_pm_runtime_get_noresume(job_to_xe(job));
> > trace_xe_sched_job_create(job);
> > @@ -243,7 +257,7 @@ bool xe_sched_job_completed(struct xe_sched_job
> > *job)
> > void xe_sched_job_arm(struct xe_sched_job *job)
> > {
> > struct xe_exec_queue *q = job->q;
> > - struct dma_fence *fence, *prev;
> > + struct dma_fence *fence = job->fence, *prev;
> > struct xe_vm *vm = q->vm;
> > u64 seqno = 0;
> > int i;
> > @@ -263,6 +277,9 @@ void xe_sched_job_arm(struct xe_sched_job *job)
> > job->ring_ops_flush_tlb = true;
> > }
> >
> > + if (job->is_pt_job)
> > + goto arm;
> > +
> > /* Arm the pre-allocated fences */
> > for (i = 0; i < q->width; prev = fence, ++i) {
> > struct dma_fence_chain *chain;
> > @@ -283,6 +300,7 @@ void xe_sched_job_arm(struct xe_sched_job *job)
> > fence = &chain->base;
> > }
> >
> > +arm:
> > job->fence = dma_fence_get(fence); /* Pairs with put in
> > scheduler */
> > drm_sched_job_arm(&job->drm);
> > }
> > diff --git a/drivers/gpu/drm/xe/xe_sched_job_types.h
> > b/drivers/gpu/drm/xe/xe_sched_job_types.h
> > index dbf260dded8d..79a459f2a0a8 100644
> > --- a/drivers/gpu/drm/xe/xe_sched_job_types.h
> > +++ b/drivers/gpu/drm/xe/xe_sched_job_types.h
> > @@ -10,10 +10,29 @@
> >
> > #include <drm/gpu_scheduler.h>
> >
> > -struct xe_exec_queue;
> > struct dma_fence;
> > struct dma_fence_chain;
> >
> > +struct xe_exec_queue;
> > +struct xe_migrate_pt_update_ops;
> > +struct xe_pt_job_ops;
> > +struct xe_tile;
> > +struct xe_vm;
> > +
> > +/**
> > + * struct xe_pt_update_args - PT update arguments
> > + */
> > +struct xe_pt_update_args {
> > + /** @vm: VM */
> > + struct xe_vm *vm;
> > + /** @tile: Tile */
> > + struct xe_tile *tile;
> > + /** @ops: Migrate PT update ops */
> > + const struct xe_migrate_pt_update_ops *ops;
> > + /** @pt_job_ops: PT update ops */
> > + struct xe_pt_job_ops *pt_job_ops;
> > +};
> > +
> > /**
> > * struct xe_job_ptrs - Per hw engine instance data
> > */
> > @@ -58,8 +77,14 @@ struct xe_sched_job {
> > bool ring_ops_flush_tlb;
> > /** @ggtt: mapped in ggtt. */
> > bool ggtt;
> > - /** @ptrs: per instance pointers. */
> > - struct xe_job_ptrs ptrs[];
> > + /** @is_pt_job: is a PT job */
> > + bool is_pt_job;
> > + union {
> > + /** @ptrs: per instance pointers. */
> > + DECLARE_FLEX_ARRAY(struct xe_job_ptrs, ptrs);
> > + /** @pt_update: PT update arguments */
> > + DECLARE_FLEX_ARRAY(struct xe_pt_update_args,
> > pt_update);
> > + };
> > };
> >
> > struct xe_sched_job_snapshot {
> > diff --git a/drivers/gpu/drm/xe/xe_vm.c b/drivers/gpu/drm/xe/xe_vm.c
> > index 18f967ce1f1a..6fc01fdd7286 100644
> > --- a/drivers/gpu/drm/xe/xe_vm.c
> > +++ b/drivers/gpu/drm/xe/xe_vm.c
> > @@ -780,6 +780,19 @@ int xe_vm_userptr_check_repin(struct xe_vm *vm)
> > list_empty_careful(&vm->userptr.invalidated)) ? 0 :
> > -EAGAIN;
> > }
> >
> > +static void xe_vma_ops_init(struct xe_vma_ops *vops, struct xe_vm
> > *vm,
> > + struct xe_exec_queue *q,
> > + struct xe_sync_entry *syncs, u32
> > num_syncs)
> > +{
> > + memset(vops, 0, sizeof(*vops));
> > + INIT_LIST_HEAD(&vops->list);
> > + vops->vm = vm;
> > + vops->q = q;
> > + vops->syncs = syncs;
> > + vops->num_syncs = num_syncs;
> > + vops->flags = 0;
> > +}
> > +
> > static int xe_vma_ops_alloc(struct xe_vma_ops *vops, bool
> > array_of_binds)
> > {
> > int i;
> > @@ -788,11 +801,9 @@ static int xe_vma_ops_alloc(struct xe_vma_ops
> > *vops, bool array_of_binds)
> > if (!vops->pt_update_ops[i].num_ops)
> > continue;
> >
> > - vops->pt_update_ops[i].ops =
> > - kmalloc_array(vops-
> > >pt_update_ops[i].num_ops,
> > - sizeof(*vops-
> > >pt_update_ops[i].ops),
> > - GFP_KERNEL |
> > __GFP_RETRY_MAYFAIL | __GFP_NOWARN);
> > - if (!vops->pt_update_ops[i].ops)
> > + vops->pt_update_ops[i].pt_job_ops =
> > + xe_pt_job_ops_alloc(vops-
> > >pt_update_ops[i].num_ops);
> > + if (!vops->pt_update_ops[i].pt_job_ops)
> > return array_of_binds ? -ENOBUFS : -ENOMEM;
> > }
> >
> > @@ -828,7 +839,7 @@ static void xe_vma_ops_fini(struct xe_vma_ops
> > *vops)
> > xe_vma_svm_prefetch_ops_fini(vops);
> >
> > for (i = 0; i < XE_MAX_TILES_PER_DEVICE; ++i)
> > - kfree(vops->pt_update_ops[i].ops);
> > + xe_pt_job_ops_put(vops-
> > >pt_update_ops[i].pt_job_ops);
> > }
> >
> > static void xe_vma_ops_incr_pt_update_ops(struct xe_vma_ops *vops,
> > u8 tile_mask, int inc_val)
> > @@ -877,9 +888,6 @@ static int xe_vm_ops_add_rebind(struct xe_vma_ops
> > *vops, struct xe_vma *vma,
> >
> > static struct dma_fence *ops_execute(struct xe_vm *vm,
> > struct xe_vma_ops *vops);
> > -static void xe_vma_ops_init(struct xe_vma_ops *vops, struct xe_vm
> > *vm,
> > - struct xe_exec_queue *q,
> > - struct xe_sync_entry *syncs, u32
> > num_syncs);
> >
> > int xe_vm_rebind(struct xe_vm *vm, bool rebind_worker)
> > {
> > @@ -3163,13 +3171,6 @@ static struct dma_fence *ops_execute(struct
> > xe_vm *vm,
> > fence = &cf->base;
> > }
> >
> > - for_each_tile(tile, vm->xe, id) {
> > - if (!vops->pt_update_ops[id].num_ops)
> > - continue;
> > -
> > - xe_pt_update_ops_fini(tile, vops);
> > - }
> > -
> > return fence;
> >
> > err_out:
> > @@ -3447,19 +3448,6 @@ static int vm_bind_ioctl_signal_fences(struct
> > xe_vm *vm,
> > return err;
> > }
> >
> > -static void xe_vma_ops_init(struct xe_vma_ops *vops, struct xe_vm
> > *vm,
> > - struct xe_exec_queue *q,
> > - struct xe_sync_entry *syncs, u32
> > num_syncs)
> > -{
> > - memset(vops, 0, sizeof(*vops));
> > - INIT_LIST_HEAD(&vops->list);
> > - vops->vm = vm;
> > - vops->q = q;
> > - vops->syncs = syncs;
> > - vops->num_syncs = num_syncs;
> > - vops->flags = 0;
> > -}
> > -
> > static int xe_vm_bind_ioctl_validate_bo(struct xe_device *xe, struct
> > xe_bo *bo,
> > u64 addr, u64 range, u64
> > obj_offset,
> > u16 pat_index, u32 op, u32
> > bind_flags)
>