[PATCH v3 3/5] drm/xe: Convert multiple bind ops into single job
Zanoni, Paulo R
paulo.r.zanoni at intel.com
Thu May 30 00:32:12 UTC 2024
On Wed, 2024-05-29 at 11:31 -0700, Matthew Brost wrote:
> This aligns with the uAPI: an array of binds, or a single bind that
> results in multiple GPUVA ops, is considered a single atomic
> operation.
>
> The implementation is roughly:
> - xe_vma_ops is a list of xe_vma_op (GPUVA op)
> - each xe_vma_op resolves to 0-3 PT ops
> - xe_vma_ops creates a single job
> - if a failure occurs at any point during binding, xe_vma_ops contains
> the information necessary to unwind the PT and VMA (GPUVA) state
>
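(Side note for readers trying to picture the data model above: here is a
tiny standalone sketch, not the actual xe structs; every sketch_* name is
invented for illustration. It only shows the container/op/PT-op shape the
commit message describes.)

/*
 * Standalone sketch, not driver code: one container holds a list of
 * GPUVA-level ops, each op expands to 0-3 page-table ops, and the
 * whole container becomes a single job (and is unwound as a whole if
 * anything fails).
 */
#include <stdio.h>

#define SKETCH_MAX_PT_OPS_PER_VMA_OP 3

struct sketch_pt_op {
        unsigned long long addr;        /* PT entry the op would touch */
};

struct sketch_vma_op {
        struct sketch_pt_op pt_ops[SKETCH_MAX_PT_OPS_PER_VMA_OP];
        int num_pt_ops;                 /* 0-3 per GPUVA op */
        struct sketch_vma_op *next;     /* linked into the container */
};

struct sketch_vma_ops {
        struct sketch_vma_op *head;     /* all ops from one bind ioctl */
};

/* How many PT updates the single job would have to carry. */
static int sketch_total_pt_ops(const struct sketch_vma_ops *vops)
{
        const struct sketch_vma_op *op;
        int total = 0;

        for (op = vops->head; op; op = op->next)
                total += op->num_pt_ops;
        return total;
}

int main(void)
{
        struct sketch_vma_op unbind = { .num_pt_ops = 1 };
        struct sketch_vma_op bind = { .num_pt_ops = 3, .next = &unbind };
        struct sketch_vma_ops vops = { .head = &bind };

        printf("PT ops carried by the single job: %d\n",
               sketch_total_pt_ops(&vops));
        return 0;
}
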
> v2:
> - add missing dma-resv slot reservation (CI, testing)
>
> Cc: Oak Zeng <oak.zeng at intel.com>
> Cc: Thomas Hellström <thomas.hellstrom at linux.intel.com>
> Signed-off-by: Matthew Brost <matthew.brost at intel.com>
> ---
> drivers/gpu/drm/xe/xe_bo_types.h | 2 +
> drivers/gpu/drm/xe/xe_migrate.c | 296 ++++----
> drivers/gpu/drm/xe/xe_migrate.h | 32 +-
> drivers/gpu/drm/xe/xe_pt.c | 1108 +++++++++++++++++++-----------
> drivers/gpu/drm/xe/xe_pt.h | 14 +-
> drivers/gpu/drm/xe/xe_pt_types.h | 36 +
> drivers/gpu/drm/xe/xe_vm.c | 519 +++-----------
> drivers/gpu/drm/xe/xe_vm.h | 2 +
> drivers/gpu/drm/xe/xe_vm_types.h | 45 +-
> 9 files changed, 1032 insertions(+), 1022 deletions(-)
>
(snip)
>
> -/**
> - * __xe_pt_bind_vma() - Build and connect a page-table tree for the vma
> - * address range.
> - * @tile: The tile to bind for.
> - * @vma: The vma to bind.
> - * @q: The exec_queue with which to do pipelined page-table updates.
> - * @syncs: Entries to sync on before binding the built tree to the live vm tree.
> - * @num_syncs: Number of @sync entries.
> - * @rebind: Whether we're rebinding this vma to the same address range without
> - * an unbind in-between.
> - *
> - * This function builds a page-table tree (see xe_pt_stage_bind() for more
> - * information on page-table building), and the xe_vm_pgtable_update entries
> - * abstracting the operations needed to attach it to the main vm tree. It
> - * then takes the relevant locks and updates the metadata side of the main
> - * vm tree and submits the operations for pipelined attachment of the
> - * gpu page-table to the vm main tree, (which can be done either by the
> - * cpu and the GPU).
> - *
> - * Return: A valid dma-fence representing the pipelined attachment operation
> - * on success, an error pointer on error.
> - */
> -struct dma_fence *
> -__xe_pt_bind_vma(struct xe_tile *tile, struct xe_vma *vma, struct xe_exec_queue *q,
> - struct xe_sync_entry *syncs, u32 num_syncs,
> - bool rebind)
> -{
> - struct xe_vm_pgtable_update entries[XE_VM_MAX_LEVEL * 2 + 1];
> - struct xe_pt_migrate_pt_update bind_pt_update = {
> - .base = {
> - .ops = xe_vma_is_userptr(vma) ? &userptr_bind_ops : &bind_ops,
> - .vma = vma,
> - .tile_id = tile->id,
> - },
> - .bind = true,
> - };
> - struct xe_vm *vm = xe_vma_vm(vma);
> - u32 num_entries;
> - struct dma_fence *fence;
> - struct invalidation_fence *ifence = NULL;
> - struct xe_range_fence *rfence;
> - int err;
> -
> - bind_pt_update.locked = false;
> - xe_bo_assert_held(xe_vma_bo(vma));
> - xe_vm_assert_held(vm);
> -
> - vm_dbg(&xe_vma_vm(vma)->xe->drm,
> - "Preparing bind, with range [%llx...%llx) engine %p.\n",
> - xe_vma_start(vma), xe_vma_end(vma), q);
> -
> - err = xe_pt_prepare_bind(tile, vma, entries, &num_entries);
> - if (err)
> - goto err;
> -
> - err = dma_resv_reserve_fences(xe_vm_resv(vm), 1);
> - if (!err && !xe_vma_has_no_bo(vma) && !xe_vma_bo(vma)->vm)
> - err = dma_resv_reserve_fences(xe_vma_bo(vma)->ttm.base.resv, 1);
> - if (err)
> - goto err;
> -
> - xe_tile_assert(tile, num_entries <= ARRAY_SIZE(entries));
> -
> - xe_vm_dbg_print_entries(tile_to_xe(tile), entries, num_entries);
> - xe_pt_calc_rfence_interval(vma, &bind_pt_update, entries,
> - num_entries);
> -
> - /*
> - * If rebind, we have to invalidate TLB on !LR vms to invalidate
> - * cached PTEs point to freed memory. on LR vms this is done
> - * automatically when the context is re-enabled by the rebind worker,
> - * or in fault mode it was invalidated on PTE zapping.
> - *
> - * If !rebind, and scratch enabled VMs, there is a chance the scratch
> - * PTE is already cached in the TLB so it needs to be invalidated.
> - * on !LR VMs this is done in the ring ops preceding a batch, but on
> - * non-faulting LR, in particular on user-space batch buffer chaining,
> - * it needs to be done here.
> - */
> - if ((!rebind && xe_vm_has_scratch(vm) && xe_vm_in_preempt_fence_mode(vm))) {
> - ifence = kzalloc(sizeof(*ifence), GFP_KERNEL);
> - if (!ifence)
> - return ERR_PTR(-ENOMEM);
> - } else if (rebind && !xe_vm_in_lr_mode(vm)) {
> - /* We bump also if batch_invalidate_tlb is true */
> - vm->tlb_flush_seqno++;
This was the only place in the driver actually setting "tlb_flush_seqno"
to any value. Now the only remaining assignment to tlb_flush_seqno is in
xe_sched_job_arm(), where we have:
q->tlb_flush_seqno = vm->tlb_flush_seqno;
(but there seems to be no line ever initializing vm->tlb_flush_seqno).
Something may be wrong here: we never set an initial value, yet we copy
the value around and check it in an "if" statement.
The following compiles:
diff --git a/drivers/gpu/drm/xe/xe_exec_queue_types.h b/drivers/gpu/drm/xe/xe_exec_queue_types.h
index e81704c7c030a..dd51b59e4433f 100644
--- a/drivers/gpu/drm/xe/xe_exec_queue_types.h
+++ b/drivers/gpu/drm/xe/xe_exec_queue_types.h
@@ -136,11 +136,6 @@ struct xe_exec_queue {
const struct xe_ring_ops *ring_ops;
/** @entity: DRM sched entity for this exec queue (1 to 1 relationship) */
struct drm_sched_entity *entity;
- /**
- * @tlb_flush_seqno: The seqno of the last rebind tlb flush performed
- * Protected by @vm's resv. Unused if @vm == NULL.
- */
- u64 tlb_flush_seqno;
/** @old_run_ticks: prior hw engine class run time in ticks for this exec queue */
u64 old_run_ticks;
/** @run_ticks: hw engine class run time in ticks for this exec queue */
diff --git a/drivers/gpu/drm/xe/xe_sched_job.c b/drivers/gpu/drm/xe/xe_sched_job.c
index 29f3201d7dfac..6c3cb7e295ac7 100644
--- a/drivers/gpu/drm/xe/xe_sched_job.c
+++ b/drivers/gpu/drm/xe/xe_sched_job.c
@@ -254,9 +254,8 @@ void xe_sched_job_arm(struct xe_sched_job *job)
}
if (vm && !xe_sched_job_is_migration(q) && !xe_vm_in_lr_mode(vm) &&
- (vm->batch_invalidate_tlb || vm->tlb_flush_seqno != q->tlb_flush_seqno)) {
+ vm->batch_invalidate_tlb) {
xe_vm_assert_held(vm);
- q->tlb_flush_seqno = vm->tlb_flush_seqno;
job->ring_ops_flush_tlb = true;
}
diff --git a/drivers/gpu/drm/xe/xe_vm_types.h b/drivers/gpu/drm/xe/xe_vm_types.h
index 27d651093d307..03e839efb234e 100644
--- a/drivers/gpu/drm/xe/xe_vm_types.h
+++ b/drivers/gpu/drm/xe/xe_vm_types.h
@@ -263,11 +263,6 @@ struct xe_vm {
bool capture_once;
} error_capture;
- /**
- * @tlb_flush_seqno: Required TLB flush seqno for the next exec.
- * protected by the vm resv.
- */
- u64 tlb_flush_seqno;
/** @batch_invalidate_tlb: Always invalidate TLB before batch start */
bool batch_invalidate_tlb;
/** @xef: XE file handle for tracking this VM's drm client */
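To make the concern above concrete, here is a tiny standalone
illustration (fake_vm, fake_exec_queue and sketch_needs_ring_tlb_flush
are made-up stand-ins, not driver code): with no writer left for
vm->tlb_flush_seqno and both structs starting zero-initialized, the
seqno half of the condition can never fire, which would leave
batch_invalidate_tlb as the only part that matters, matching the
simplification in the diff above.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct fake_vm {
        uint64_t tlb_flush_seqno;       /* never written anywhere */
        bool batch_invalidate_tlb;
};

struct fake_exec_queue {
        uint64_t tlb_flush_seqno;
};

/* Mirrors the shape of the condition in xe_sched_job_arm(). */
static bool sketch_needs_ring_tlb_flush(struct fake_vm *vm,
                                        struct fake_exec_queue *q)
{
        if (vm->batch_invalidate_tlb ||
            vm->tlb_flush_seqno != q->tlb_flush_seqno) {
                q->tlb_flush_seqno = vm->tlb_flush_seqno;
                return true;
        }
        return false;
}

int main(void)
{
        /* Zero-initialized, like kzalloc()'d structs in the driver. */
        struct fake_vm vm = { 0 };
        struct fake_exec_queue q = { 0 };

        /* Prints 0: with nothing ever bumping vm.tlb_flush_seqno, the
         * seqno comparison never triggers a flush on its own. */
        printf("flush needed: %d\n", sketch_needs_ring_tlb_flush(&vm, &q));
        return 0;
}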