[PATCH 1/3] drm/xe/bo: Forward the decision to evict local objects during validation

Matthew Brost matthew.brost at intel.com
Mon Feb 26 21:36:15 UTC 2024


On Mon, Feb 26, 2024 at 05:44:53PM +0100, Thomas Hellström wrote:
> Currently we refuse evicting the VM's local objects. However that
> is necessary for some objects. Most notably completely unbound objects.
> Forward this decision to be per-object based in the TTM
> eviction_valuable() callback.
> 
> Fixes: 24f947d58fe5 ("drm/xe: Use DRM GPUVM helpers for external- and evicted objects")
> Cc: Thomas Hellström <thomas.hellstrom at linux.intel.com>
> Cc: Rodrigo Vivi <rodrigo.vivi at intel.com>
> Cc: Matthew Brost <matthew.brost at intel.com>
> Cc: Lucas De Marchi <lucas.demarchi at intel.com>
> Cc: Oded Gabbay <ogabbay at kernel.org>
> Signed-off-by: Thomas Hellström <thomas.hellstrom at linux.intel.com>
> ---
>  drivers/gpu/drm/xe/xe_bo.c       | 17 ++++++++++++++++-
>  drivers/gpu/drm/xe/xe_exec.c     |  2 ++
>  drivers/gpu/drm/xe/xe_vm.c       |  8 ++++++--
>  drivers/gpu/drm/xe/xe_vm_types.h | 10 ++++++++++
>  4 files changed, 34 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/gpu/drm/xe/xe_bo.c b/drivers/gpu/drm/xe/xe_bo.c
> index 76dfaf1cd200..143d93632253 100644
> --- a/drivers/gpu/drm/xe/xe_bo.c
> +++ b/drivers/gpu/drm/xe/xe_bo.c
> @@ -1049,6 +1049,21 @@ static void xe_ttm_bo_delete_mem_notify(struct ttm_buffer_object *ttm_bo)
>  	}
>  }
>  
> +static bool xe_bo_eviction_valuable(struct ttm_buffer_object *ttm_bo,
> +				    const struct ttm_place *place)
> +{
> +	if (xe_bo_is_xe_bo(ttm_bo)) {
> +		struct xe_bo *xe_bo = ttm_to_xe_bo(ttm_bo);
> +		struct xe_vm *vm = xe_bo->vm;
> +
> +		if (vm && !drm_gpuvm_is_extobj(&vm->gpuvm, &ttm_bo->base) &&

Is this not redundant - if xe_bo->vm is set then it not an external BO?

> +		    vm->is_validating)
> +			return false;
> +	}
> +
> +	return ttm_bo_eviction_valuable(ttm_bo, place);
> +}
> +
>  const struct ttm_device_funcs xe_ttm_funcs = {
>  	.ttm_tt_create = xe_ttm_tt_create,
>  	.ttm_tt_populate = xe_ttm_tt_populate,
> @@ -1059,7 +1074,7 @@ const struct ttm_device_funcs xe_ttm_funcs = {
>  	.io_mem_reserve = xe_ttm_io_mem_reserve,
>  	.io_mem_pfn = xe_ttm_io_mem_pfn,
>  	.release_notify = xe_ttm_bo_release_notify,
> -	.eviction_valuable = ttm_bo_eviction_valuable,
> +	.eviction_valuable = xe_bo_eviction_valuable,
>  	.delete_mem_notify = xe_ttm_bo_delete_mem_notify,
>  };
>  
> diff --git a/drivers/gpu/drm/xe/xe_exec.c b/drivers/gpu/drm/xe/xe_exec.c
> index 952496c6260d..e8eb09993485 100644
> --- a/drivers/gpu/drm/xe/xe_exec.c
> +++ b/drivers/gpu/drm/xe/xe_exec.c
> @@ -102,7 +102,9 @@ static int xe_exec_fn(struct drm_gpuvm_exec *vm_exec)
>  	int num_fences;
>  	int ret;
>  
> +	vm->is_validating = true;
>  	ret = drm_gpuvm_validate(vm_exec->vm, &vm_exec->exec);
> +	vm->is_validating = false;

Can't say I love this coding pattern but you address this in the kernel
doc for is_validating. Agree with updating TTM to avoid this.

Also this is kinda racey too - e.g. 2 VMs could set vm->is_validating at
the same time. We really want to check if BO being evicted VM's matches
the VM we are trying to validate in the current context, right?

With that - maybe we try to update TTM in this series too?

>  	if (ret)
>  		return ret;
>  
> diff --git a/drivers/gpu/drm/xe/xe_vm.c b/drivers/gpu/drm/xe/xe_vm.c
> index e3bde897f6e8..07b0cda12d52 100644
> --- a/drivers/gpu/drm/xe/xe_vm.c
> +++ b/drivers/gpu/drm/xe/xe_vm.c
> @@ -474,7 +474,7 @@ static int xe_gpuvm_validate(struct drm_gpuvm_bo *vm_bo, struct drm_exec *exec)
>  		list_move_tail(&gpuva_to_vma(gpuva)->combined_links.rebind,
>  			       &vm->rebind_list);
>  
> -	ret = xe_bo_validate(gem_to_xe_bo(vm_bo->obj), vm, false);
> +	ret = xe_bo_validate(gem_to_xe_bo(vm_bo->obj), vm, true);

I think we likely can flip the last argument to true in a few other
places too. Or maybe even drop the argument and either always set to
true or true if not an ext obj?

Matt

>  	if (ret)
>  		return ret;
>  
> @@ -515,7 +515,11 @@ static int xe_preempt_work_begin(struct drm_exec *exec, struct xe_vm *vm,
>  	if (err)
>  		return err;
>  
> -	return drm_gpuvm_validate(&vm->gpuvm, exec);
> +	vm->is_validating = true;
> +	err = drm_gpuvm_validate(&vm->gpuvm, exec);
> +	vm->is_validating = false;
> +
> +	return err;
>  }
>  
>  static void preempt_rebind_work_func(struct work_struct *w)
> diff --git a/drivers/gpu/drm/xe/xe_vm_types.h b/drivers/gpu/drm/xe/xe_vm_types.h
> index 7d4f810f9c04..76c49fba9cd3 100644
> --- a/drivers/gpu/drm/xe/xe_vm_types.h
> +++ b/drivers/gpu/drm/xe/xe_vm_types.h
> @@ -286,6 +286,16 @@ struct xe_vm {
>  
>  	/** @batch_invalidate_tlb: Always invalidate TLB before batch start */
>  	bool batch_invalidate_tlb;
> +
> +	/**
> +	 * @is_validaing: Whether we are validating the vm's local objects.
> +	 * This field is protected by the vm's resv. Note that this
> +	 * is needed only since TTM doesn't forward the ttm_operation_ctx to the
> +	 * eviction_valuable() callback, so if / when that is in place, this
> +	 * should be removed.
> +	 */
> +	bool is_validating;
> +
>  	/** @xef: XE file handle for tracking this VM's drm client */
>  	struct xe_file *xef;
>  };
> -- 
> 2.43.0
> 


More information about the Intel-xe mailing list