[PATCH 05/15] drm/xe: Introduce an xe_validation wrapper around drm_exec

Matthew Brost matthew.brost at intel.com
Thu Aug 14 02:33:58 UTC 2025


On Wed, Aug 13, 2025 at 12:51:11PM +0200, Thomas Hellström wrote:
> Introduce a validation wrapper xe_validation_guard() as a helper
> intended to be used around drm_exec transactions what perform
> validations. Once TTM can handle exhaustive eviction we could
> remove this wrapper or make it mostly a NO-OP unless other
> functionality is added to it.
> 
> Currently the wrapper takes a read lock upon entry and if the
> transaction hits an OOM, all locks are released and the
> transaction is retried with a write-lock. If all other
> validations participate in this scheme, the transaction with
> the write lock will be the only transaction validating and
> should have access to all available non-pinned memory.
> 
> There is currently a problem in that TTM converts -EDEADLOCKS to
> -ENOMEM, and with ww_mutex slowpath error injections, we can hit
> -ENOMEMs without having actually ran out of memory. We abuse
> ww_mutex internals to detect such situations until TTM is fixes
> to not convert the error code. In the meantime, injecting
> ww_mutex slowpath -EDEADLOCKs is a good way to test
> the implementation in the absence of real OOMs.
> 
> Just introduce the wrapper in this commit. It will be hooked up
> to the driver in following commits.
> 
> Signed-off-by: Thomas Hellström <thomas.hellstrom at linux.intel.com>
> ---
>  drivers/gpu/drm/xe/xe_validation.c | 199 +++++++++++++++++++++++++++++
>  drivers/gpu/drm/xe/xe_validation.h | 107 ++++++++++++++++
>  2 files changed, 306 insertions(+)
> 
> diff --git a/drivers/gpu/drm/xe/xe_validation.c b/drivers/gpu/drm/xe/xe_validation.c
> index cc0684d24e02..cd1424f04237 100644
> --- a/drivers/gpu/drm/xe/xe_validation.c
> +++ b/drivers/gpu/drm/xe/xe_validation.c
> @@ -5,6 +5,7 @@
>  #include "xe_bo.h"
>  #include <drm/drm_exec.h>
>  #include <drm/drm_gem.h>
> +#include <drm/drm_gpuvm.h>
>  
>  #include "xe_assert.h"
>  #include "xe_validation.h"
> @@ -47,3 +48,201 @@ void xe_validation_assert_exec(const struct xe_device *xe,
>  	}
>  }
>  #endif
> +
> +static int xe_validation_lock(struct xe_validation_ctx *ctx)
> +{
> +	struct xe_validation_device *val = ctx->val;
> +	int ret = 0;
> +
> +	if (ctx->flags & DRM_EXEC_INTERRUPTIBLE_WAIT) {
> +		if (ctx->request_exclusive)
> +			ret = down_write_killable(&val->lock);
> +		else
> +			ret = down_read_interruptible(&val->lock);
> +	} else {
> +		if (ctx->request_exclusive)
> +			down_write(&val->lock);
> +		else
> +			down_read(&val->lock);
> +	}
> +
> +	if (!ret) {
> +		ctx->lock_held = true;
> +		ctx->lock_held_exclusive = ctx->request_exclusive;
> +	}
> +
> +	return ret;
> +}
> +
> +static void xe_validation_unlock(struct xe_validation_ctx *ctx)
> +{
> +	if (!ctx->lock_held)
> +		return;
> +
> +	if (ctx->lock_held_exclusive)
> +		up_write(&ctx->val->lock);
> +	else
> +		up_read(&ctx->val->lock);
> +
> +	ctx->lock_held = false;
> +}
> +
> +/**
> + * xe_validation_ctx_init() - Initialize an xe_validation_ctx
> + * @ctx: The xe_validation_ctx to initialize.
> + * @val: The xe_validation_device representing the validation domain.
> + * @exec: The struct drm_exec to use for the transaction.
> + * @flags: The flags to use for drm_exec initialization.
> + * @nr: The number of anticipated buffer object locks. Forwarded to
> + * drm_exec initialization.
> + * @exclusive: Whether to use exclusive locking already on first validation.

The last two parameters of this function are always passed as 0 and
false in this series. Is it worth keeping them? I don’t see a case where
nr would ever be non-zero. exclusive is defensible, but it’s still
unused. Maybe drop both and reserve a bit in flags for a driver-defined
“exclusive.” That would make the call sites more readable—long argument
lists make it easy to forget what each parameter means or to transpose
them.

> + *
> + * Initialize and lock a an xe_validation transaction using the validation domain
> + * represented by @val. Also initialize the drm_exec object forwarding
> + * @flags and @nr to the drm_exec initialization. The @exclusive parameter should
> + * typically be set to false to avoid locking out other validators from the
> + * domain until an OOM is hit. For testing- or final attempt purposes it can,
> + * however, be set to true.
> + *
> + * Return: %0 on success, %-EINTR if interruptible initial locking failed with a
> + * signal pending.
> + */
> +int xe_validation_ctx_init(struct xe_validation_ctx *ctx, struct xe_validation_device *val,
> +			   struct drm_exec *exec, u32 flags, unsigned int nr,
> +			   bool exclusive)
> +{
> +	int ret;
> +
> +	ctx->exec = exec;
> +	ctx->val = val;
> +	ctx->lock_held = false;
> +	ctx->lock_held_exclusive = false;
> +	ctx->request_exclusive = exclusive;
> +	ctx->flags = flags;
> +	ctx->nr = nr;
> +
> +	ret = xe_validation_lock(ctx);
> +	if (ret)
> +		return ret;
> +
> +	drm_exec_init(exec, flags, nr);
> +
> +	return 0;
> +}
> +
> +#ifdef CONFIG_DEBUG_WW_MUTEX_SLOWPATH
> +/*
> + * This abuses both drm_exec and ww_mutex internals and should be
> + * replaced by checking for -EDEADLK when we can make TTM
> + * stop converting -EDEADLK to -ENOMEM.
> + * An alternative is to not have exhaustive eviction with
> + * CONFIG_DEBUG_WW_MUTEX_SLOWPATH until that happens.
> + */
> +static bool xe_validation_contention_injected(struct drm_exec *exec)
> +{
> +	return !!exec->ticket.contending_lock;
> +}
> +
> +#else
> +
> +static bool xe_validation_contention_injected(struct drm_exec *exec)
> +{
> +	return false;
> +}
> +
> +#endif
> +
> +static bool __xe_validation_should_retry(struct xe_validation_ctx *ctx, int ret)
> +{
> +	if (ret == -ENOMEM &&
> +	    ((ctx->request_exclusive &&
> +	      xe_validation_contention_injected(ctx->exec)) ||
> +	     !ctx->request_exclusive)) {
> +		ctx->request_exclusive = true;
> +		return true;
> +	}
> +
> +	return false;
> +}
> +
> +/**
> + * xe_validation_exec_lock() - Perform drm_gpuvm_exec_lock within a validation
> + * transaction.
> + * @ctx: An uninitialized xe_validation_ctx.
> + * @vm_exec: An initialized struct vm_exec.
> + * @val: The validation domain.
> + *
> + * The drm_gpuvm_exec_lock() function internally initializes its drm_exec
> + * transaction and therefore doesn't lend itself very well to be using
> + * xe_validation_ctx_init(). Provide a helper that takes an uninitialized
> + * xe_validation_ctx and calls drm_gpuvm_exec_lock() with OOM retry.
> + *
> + * Return: %0 on success, negative error code on failure.
> + */
> +int xe_validation_exec_lock(struct xe_validation_ctx *ctx,
> +			    struct drm_gpuvm_exec *vm_exec,
> +			    struct xe_validation_device *val)
> +{
> +	int ret;
> +
> +	memset(ctx, 0, sizeof(*ctx));
> +	ctx->exec = &vm_exec->exec;
> +	ctx->flags = vm_exec->flags;
> +	ctx->val = val;
> +retry:
> +	ret = xe_validation_lock(ctx);
> +	if (ret)
> +		return ret;
> +
> +	ret = drm_gpuvm_exec_lock(vm_exec);
> +	if (ret) {
> +		xe_validation_unlock(ctx);
> +		if (__xe_validation_should_retry(ctx, ret))
> +			goto retry;
> +	}
> +
> +	return ret;
> +}
> +
> +/**
> + * xe_validation_ctx_fini() - Finalize a validation transaction
> + * @ctx: The Validation transaction to finalize.
> + *
> + * Finalize a validation transaction and its related drm_exec transaction.
> + */
> +void xe_validation_ctx_fini(struct xe_validation_ctx *ctx)
> +{
> +	drm_exec_fini(ctx->exec);
> +	xe_validation_unlock(ctx);
> +}
> +
> +/**
> + * xe_validation_should_retry() - Determine if a validation transaction should retry
> + * @ctx: The validation transaction.
> + * @ret: Pointer to a return value variable.
> + *
> + * Determines whether a validation transaction should retry based on the
> + * internal transaction state and the return value pointed to by @ret.
> + * If a validation should be retried, the transaction is prepared for that,
> + * and the validation locked might be re-locked in exclusive mode, and *@ret
> + * is set to %0. If the re-locking errors, typically due to interruptible
> + * locking with signal pending, *@ret is instead set to -EINTR and the
> + * function returns %false.
> + *
> + * Return: %true if validation should be retried, %false otherwise.
> + */
> +bool xe_validation_should_retry(struct xe_validation_ctx *ctx, int *ret)
> +{
> +	if (__xe_validation_should_retry(ctx, *ret)) {
> +		drm_exec_fini(ctx->exec);
> +		*ret = 0;
> +		if (ctx->request_exclusive != ctx->lock_held_exclusive) {
> +			xe_validation_unlock(ctx);
> +			*ret = xe_validation_lock(ctx);
> +		}
> +		drm_exec_init(ctx->exec, ctx->flags, ctx->nr);
> +		return !*ret;
> +	}
> +
> +	return false;
> +}
> diff --git a/drivers/gpu/drm/xe/xe_validation.h b/drivers/gpu/drm/xe/xe_validation.h
> index db50feacad7a..a708c260cf18 100644
> --- a/drivers/gpu/drm/xe/xe_validation.h
> +++ b/drivers/gpu/drm/xe/xe_validation.h
> @@ -7,9 +7,11 @@
>  
>  #include <linux/dma-resv.h>
>  #include <linux/types.h>
> +#include <linux/rwsem.h>
>  
>  struct drm_exec;
>  struct drm_gem_object;
> +struct drm_gpuvm_exec;
>  struct xe_device;
>  
>  #ifdef CONFIG_PROVE_LOCKING
> @@ -66,4 +68,109 @@ void xe_validation_assert_exec(const struct xe_device *xe, const struct drm_exec
>  	} while (0)
>  #endif
>  
> +/**
> + * struct xe_validation_device - The domain for exhaustive eviction
> + * @lock: The lock used to exclude other processes from allocating graphics memory
> + *
> + * The struct xe_validation_device represents the domain for which we want to use
> + * exhaustive eviction. The @lock is typically grabbed in read mode for allocations
> + * but when graphics memory allocation fails, it is retried with the write mode held.
> + */
> +struct xe_validation_device {
> +	struct rw_semaphore lock;
> +};
> +
> +/**
> + * struct xe_validation_ctx - A struct drm_exec subclass with support for
> + * exhaustive eviction
> + * @exec: The drm_exec object base class. Note that we use a pointer instead of
> + * embedding to avoid diamond inheritance.
> + * @val: The exhaustive eviction domain.
> + * @lock_held: Whether The domain lock is currently held.
> + * @lock_held_exclusive: Whether the domain lock is held in exclusive mode.
> + * @request_exclusive: Whether to lock exclusively (write mode) the next time
> + * the domain lock is locked.
> + * @flags: The drm_exec flags used for drm_exec (re-)initialization.
> + * @nr: The drm_exec nr parameter used for drm_exec (re-)initializaiton.
> + */
> +struct xe_validation_ctx {
> +	struct drm_exec *exec;
> +	struct xe_validation_device *val;
> +	bool lock_held;
> +	bool lock_held_exclusive;
> +	bool request_exclusive;
> +	u32 flags;
> +	unsigned int nr;
> +};
> +
> +int xe_validation_ctx_init(struct xe_validation_ctx *ctx, struct xe_validation_device *val,
> +			   struct drm_exec *exec, u32 flags, unsigned int nr,
> +			   bool exclusive);
> +
> +int xe_validation_exec_lock(struct xe_validation_ctx *ctx, struct drm_gpuvm_exec *vm_exec,
> +			    struct xe_validation_device *val);
> +
> +void xe_validation_ctx_fini(struct xe_validation_ctx *ctx);
> +
> +bool xe_validation_should_retry(struct xe_validation_ctx *ctx, int *ret);
> +
> +/**
> + * xe_validation_retry_on_oom() - Retry on oom in an xe_validaton transaction
> + * @_ctx: Pointer to the xe_validation_ctx
> + * @_ret: The current error value possibly holding -ENOMEM
> + *
> + * Use this in way similar to drm_exec_retry_on_contention().
> + * If @_ret contains -ENOMEM the tranaction is restarted once in a way that
> + * blocks other transactions and allows exhastive eviction. If the transaction
> + * was already restarted once, Just return the -ENOMEM. May also set
> + * _ret to -EINTR if not retrying and waits are interruptible.
> + * May only be used within a drm_exec_until_all_locked() loop.
> + */
> +#define xe_validation_retry_on_oom(_ctx, _ret)				\
> +	do {								\
> +		if (xe_validation_should_retry(_ctx, _ret))		\
> +			goto *__drm_exec_retry_ptr;			\
> +	} while (0)
> +
> +/**
> + * xe_validation_device_init - Initialize a struct xe_validation_device
> + * @val: The xe_validation_device to init.
> + */
> +static inline void
> +xe_validation_device_init(struct xe_validation_device *val)
> +{
> +	init_rwsem(&val->lock);
> +}
> +
> +/*
> + * Make guard() and scoped_guard() work with xe_validation_ctx
> + * so that we can exit transactions without caring about the
> + * cleanup.
> + */
> +DEFINE_CLASS(xe_validation, struct xe_validation_ctx *,
> +	     if (!IS_ERR(_T)) xe_validation_ctx_fini(_T);,
> +	     ({_ret = xe_validation_ctx_init(_ctx, _val, _exec, _flags, 0, _excl);
> +	       _ret ? NULL : _ctx; }),
> +	     struct xe_validation_ctx *_ctx, struct xe_validation_device *_val,
> +	     struct drm_exec *_exec, u32 _flags, int _ret, bool _excl);
> +static inline void *class_xe_validation_lock_ptr(class_xe_validation_t *_T)
> +{return *_T; }
> +#define class_xe_validation_is_conditional false
> +
> +/**
> + * xe_validation_guard() - An auto-cleanup xe_validation_ctx transaction
> + * @_ctx: The xe_validation_ctx.
> + * @_val: The xe_validation_device.
> + * @_exec: The struct drm_exec object
> + * @_flags: Flags for the drm_exec transaction. See the struct drm_exec documention!
> + * @_ret: Return in / out parameter. May be set by this macro. Typicall 0 when called.
> + * @_excl: Whether to start in exclusive mode already in the first iteration.
> + *

Same comment as above on function xe_validation_ctx_init wrt to
arguments.

Matt

> + * This macro is will initiate a drm_exec transaction with additional support for
> + * exhaustive eviction.
> + */
> +#define xe_validation_guard(_ctx, _val, _exec, _flags, _ret, _excl)	\
> +	scoped_guard(xe_validation, _ctx, _val, _exec, _flags, _ret, _excl) \
> +	drm_exec_until_all_locked(_exec)
> +
>  #endif
> -- 
> 2.50.1
> 


More information about the Intel-xe mailing list