[PATCH 05/15] drm/xe: Introduce an xe_validation wrapper around drm_exec

Fri Aug 15 15:04:12 UTC 2025

On Wed, 2025-08-13 at 10:25 -0700, Matthew Brost wrote:
> On Wed, Aug 13, 2025 at 12:51:11PM +0200, Thomas Hellström wrote:
> > Introduce a validation wrapper xe_validation_guard() as a helper
> > intended to be used around drm_exec transactions what perform
> > validations. Once TTM can handle exhaustive eviction we could
> > remove this wrapper or make it mostly a NO-OP unless other
> > functionality is added to it.
> > 
> > Currently the wrapper takes a read lock upon entry and if the
> > transaction hits an OOM, all locks are released and the
> > transaction is retried with a write-lock. If all other
> > validations participate in this scheme, the transaction with
> > the write lock will be the only transaction validating and
> > should have access to all available non-pinned memory.
> > 
> > There is currently a problem in that TTM converts -EDEADLOCKS to
> > -ENOMEM, and with ww_mutex slowpath error injections, we can hit
> > -ENOMEMs without having actually ran out of memory. We abuse
> > ww_mutex internals to detect such situations until TTM is fixes
> > to not convert the error code. In the meantime, injecting
> > ww_mutex slowpath -EDEADLOCKs is a good way to test
> > the implementation in the absence of real OOMs.
> > 
> > Just introduce the wrapper in this commit. It will be hooked up
> > to the driver in following commits.
> > 
> > Signed-off-by: Thomas Hellström <thomas.hellstrom at linux.intel.com>
> > ---
> >  drivers/gpu/drm/xe/xe_validation.c | 199
> > +++++++++++++++++++++++++++++
> >  drivers/gpu/drm/xe/xe_validation.h | 107 ++++++++++++++++
> >  2 files changed, 306 insertions(+)
> > 
> > diff --git a/drivers/gpu/drm/xe/xe_validation.c
> > b/drivers/gpu/drm/xe/xe_validation.c
> > index cc0684d24e02..cd1424f04237 100644
> > --- a/drivers/gpu/drm/xe/xe_validation.c
> > +++ b/drivers/gpu/drm/xe/xe_validation.c
> > @@ -5,6 +5,7 @@
> >  #include "xe_bo.h"
> >  #include <drm/drm_exec.h>
> >  #include <drm/drm_gem.h>
> > +#include <drm/drm_gpuvm.h>
> >  
> >  #include "xe_assert.h"
> >  #include "xe_validation.h"
> > @@ -47,3 +48,201 @@ void xe_validation_assert_exec(const struct
> > xe_device *xe,
> >  	}
> >  }
> >  #endif
> > +
> > +static int xe_validation_lock(struct xe_validation_ctx *ctx)
> > +{
> > +	struct xe_validation_device *val = ctx->val;
> > +	int ret = 0;
> > +
> > +	if (ctx->flags & DRM_EXEC_INTERRUPTIBLE_WAIT) {
> > +		if (ctx->request_exclusive)
> > +			ret = down_write_killable(&val->lock);
> > +		else
> > +			ret = down_read_interruptible(&val->lock);
> > +	} else {
> > +		if (ctx->request_exclusive)
> > +			down_write(&val->lock);
> > +		else
> > +			down_read(&val->lock);
> > +	}
> > +
> > +	if (!ret) {
> > +		ctx->lock_held = true;
> > +		ctx->lock_held_exclusive = ctx->request_exclusive;
> > +	}
> > +
> > +	return ret;
> > +}
> > +
> > +static void xe_validation_unlock(struct xe_validation_ctx *ctx)
> > +{
> > +	if (!ctx->lock_held)
> > +		return;
> > +
> > +	if (ctx->lock_held_exclusive)
> > +		up_write(&ctx->val->lock);
> > +	else
> > +		up_read(&ctx->val->lock);
> > +
> > +	ctx->lock_held = false;
> > +}
> > +
> > +/**
> > + * xe_validation_ctx_init() - Initialize an xe_validation_ctx
> > + * @ctx: The xe_validation_ctx to initialize.
> > + * @val: The xe_validation_device representing the validation
> > domain.
> > + * @exec: The struct drm_exec to use for the transaction.
> > + * @flags: The flags to use for drm_exec initialization.
> > + * @nr: The number of anticipated buffer object locks. Forwarded
> > to
> > + * drm_exec initialization.
> > + * @exclusive: Whether to use exclusive locking already on first
> > validation.
> > + *
> > + * Initialize and lock a an xe_validation transaction using the
> > validation domain
> > + * represented by @val. Also initialize the drm_exec object
> > forwarding
> > + * @flags and @nr to the drm_exec initialization. The @exclusive
> > parameter should
> > + * typically be set to false to avoid locking out other validators
> > from the
> > + * domain until an OOM is hit. For testing- or final attempt
> > purposes it can,
> > + * however, be set to true.
> > + *
> > + * Return: %0 on success, %-EINTR if interruptible initial locking
> > failed with a
> > + * signal pending.
> > + */
> > +int xe_validation_ctx_init(struct xe_validation_ctx *ctx, struct
> > xe_validation_device *val,
> > +			   struct drm_exec *exec, u32 flags,
> > unsigned int nr,
> > +			   bool exclusive)
> > +{
> > +	int ret;
> > +
> > +	ctx->exec = exec;
> > +	ctx->val = val;
> > +	ctx->lock_held = false;
> > +	ctx->lock_held_exclusive = false;
> > +	ctx->request_exclusive = exclusive;
> > +	ctx->flags = flags;
> > +	ctx->nr = nr;
> > +
> > +	ret = xe_validation_lock(ctx);
> > +	if (ret)
> > +		return ret;
> > +
> > +	drm_exec_init(exec, flags, nr);
> > +
> > +	return 0;
> > +}
> > +
> > +#ifdef CONFIG_DEBUG_WW_MUTEX_SLOWPATH
> > +/*
> > + * This abuses both drm_exec and ww_mutex internals and should be
> > + * replaced by checking for -EDEADLK when we can make TTM
> > + * stop converting -EDEADLK to -ENOMEM.
> > + * An alternative is to not have exhaustive eviction with
> > + * CONFIG_DEBUG_WW_MUTEX_SLOWPATH until that happens.
> > + */
> 
> I vote to keep this the way you have it and live with the abuse until
> TTM is updated.
> 
> > +static bool xe_validation_contention_injected(struct drm_exec
> > *exec)
> > +{
> > +	return !!exec->ticket.contending_lock;
> > +}
> > +
> > +#else
> > +
> > +static bool xe_validation_contention_injected(struct drm_exec
> > *exec)
> > +{
> > +	return false;
> > +}
> > +
> > +#endif
> > +
> > +static bool __xe_validation_should_retry(struct xe_validation_ctx
> > *ctx, int ret)
> > +{
> > +	if (ret == -ENOMEM &&
> > +	    ((ctx->request_exclusive &&
> > +	      xe_validation_contention_injected(ctx->exec)) ||
> > +	     !ctx->request_exclusive)) {
> > +		ctx->request_exclusive = true;
> 
> Can the locking across multiple GPUs fall when request_exclusive is
> held
> and the GPUs are sharing dma-buffers? I suppose we'd need true WW
> locking throughout the stack (TTM) and a ticketed retry to guarantee
> foward progress.

Yes. Either that or widen the validation domain to be per driver rather
than per device.
With the drm_exec retry, in addition dma-buf needs to support passing a
drm_exec, but that otoh would be a layering violation. So I have
historically advocated for abstracting the ww mutex transaction with a
dma-buf locking context with callbacks rather than a drm_exec. Then
drm_exec could derive from that and keep its functionality for gem
objects.

> 
> > +		return true;
> > +	}
> > +
> > +	return false;
> > +}
> > +
> > +/**
> > + * xe_validation_exec_lock() - Perform drm_gpuvm_exec_lock within
> > a validation
> > + * transaction.
> > + * @ctx: An uninitialized xe_validation_ctx.
> > + * @vm_exec: An initialized struct vm_exec.
> > + * @val: The validation domain.
> > + *
> > + * The drm_gpuvm_exec_lock() function internally initializes its
> > drm_exec
> > + * transaction and therefore doesn't lend itself very well to be
> > using
> > + * xe_validation_ctx_init(). Provide a helper that takes an
> > uninitialized
> > + * xe_validation_ctx and calls drm_gpuvm_exec_lock() with OOM
> > retry.
> > + *
> > + * Return: %0 on success, negative error code on failure.
> > + */
> > +int xe_validation_exec_lock(struct xe_validation_ctx *ctx,
> > +			    struct drm_gpuvm_exec *vm_exec,
> > +			    struct xe_validation_device *val)
> > +{
> > +	int ret;
> > +
> > +	memset(ctx, 0, sizeof(*ctx));
> > +	ctx->exec = &vm_exec->exec;
> > +	ctx->flags = vm_exec->flags;
> > +	ctx->val = val;
> > +retry:
> > +	ret = xe_validation_lock(ctx);
> > +	if (ret)
> > +		return ret;
> > +
> > +	ret = drm_gpuvm_exec_lock(vm_exec);
> > +	if (ret) {
> > +		xe_validation_unlock(ctx);
> > +		if (__xe_validation_should_retry(ctx, ret))
> > +			goto retry;
> > +	}
> > +
> > +	return ret;
> > +}
> > +
> > +/**
> > + * xe_validation_ctx_fini() - Finalize a validation transaction
> > + * @ctx: The Validation transaction to finalize.
> > + *
> > + * Finalize a validation transaction and its related drm_exec
> > transaction.
> > + */
> > +void xe_validation_ctx_fini(struct xe_validation_ctx *ctx)
> > +{
> > +	drm_exec_fini(ctx->exec);
> > +	xe_validation_unlock(ctx);
> > +}
> > +
> > +/**
> > + * xe_validation_should_retry() - Determine if a validation
> > transaction should retry
> > + * @ctx: The validation transaction.
> > + * @ret: Pointer to a return value variable.
> > + *
> > + * Determines whether a validation transaction should retry based
> > on the
> > + * internal transaction state and the return value pointed to by
> > @ret.
> > + * If a validation should be retried, the transaction is prepared
> > for that,
> > + * and the validation locked might be re-locked in exclusive mode,
> > and *@ret
> > + * is set to %0. If the re-locking errors, typically due to
> > interruptible
> > + * locking with signal pending, *@ret is instead set to -EINTR and
> > the
> > + * function returns %false.
> > + *
> > + * Return: %true if validation should be retried, %false
> > otherwise.
> > + */
> > +bool xe_validation_should_retry(struct xe_validation_ctx *ctx, int
> > *ret)
> > +{
> > +	if (__xe_validation_should_retry(ctx, *ret)) {
> > +		drm_exec_fini(ctx->exec);
> > +		*ret = 0;
> > +		if (ctx->request_exclusive != ctx-
> > >lock_held_exclusive) {
> > +			xe_validation_unlock(ctx);
> > +			*ret = xe_validation_lock(ctx);
> > +		}
> > +		drm_exec_init(ctx->exec, ctx->flags, ctx->nr);
> > +		return !*ret;
> > +	}
> > +
> > +	return false;
> > +}
> > diff --git a/drivers/gpu/drm/xe/xe_validation.h
> > b/drivers/gpu/drm/xe/xe_validation.h
> > index db50feacad7a..a708c260cf18 100644
> > --- a/drivers/gpu/drm/xe/xe_validation.h
> > +++ b/drivers/gpu/drm/xe/xe_validation.h
> > @@ -7,9 +7,11 @@
> >  
> >  #include <linux/dma-resv.h>
> >  #include <linux/types.h>
> > +#include <linux/rwsem.h>
> >  
> >  struct drm_exec;
> >  struct drm_gem_object;
> > +struct drm_gpuvm_exec;
> >  struct xe_device;
> >  
> >  #ifdef CONFIG_PROVE_LOCKING
> > @@ -66,4 +68,109 @@ void xe_validation_assert_exec(const struct
> > xe_device *xe, const struct drm_exec
> >  	} while (0)
> >  #endif
> >  
> > +/**
> > + * struct xe_validation_device - The domain for exhaustive
> > eviction
> > + * @lock: The lock used to exclude other processes from allocating
> > graphics memory
> > + *
> > + * The struct xe_validation_device represents the domain for which
> > we want to use
> > + * exhaustive eviction. The @lock is typically grabbed in read
> > mode for allocations
> > + * but when graphics memory allocation fails, it is retried with
> > the write mode held.
> > + */
> > +struct xe_validation_device {
> > +	struct rw_semaphore lock;
> > +};
> > +
> > +/**
> > + * struct xe_validation_ctx - A struct drm_exec subclass with
> > support for
> > + * exhaustive eviction
> > + * @exec: The drm_exec object base class. Note that we use a
> > pointer instead of
> > + * embedding to avoid diamond inheritance.
> > + * @val: The exhaustive eviction domain.
> > + * @lock_held: Whether The domain lock is currently held.
> > + * @lock_held_exclusive: Whether the domain lock is held in
> > exclusive mode.
> > + * @request_exclusive: Whether to lock exclusively (write mode)
> > the next time
> > + * the domain lock is locked.
> > + * @flags: The drm_exec flags used for drm_exec (re-
> > )initialization.
> > + * @nr: The drm_exec nr parameter used for drm_exec (re-
> > )initializaiton.
> > + */
> > +struct xe_validation_ctx {
> > +	struct drm_exec *exec;
> > +	struct xe_validation_device *val;
> > +	bool lock_held;
> > +	bool lock_held_exclusive;
> > +	bool request_exclusive;
> > +	u32 flags;
> > +	unsigned int nr;
> > +};
> > +
> > +int xe_validation_ctx_init(struct xe_validation_ctx *ctx, struct
> > xe_validation_device *val,
> > +			   struct drm_exec *exec, u32 flags,
> > unsigned int nr,
> > +			   bool exclusive);
> > +
> > +int xe_validation_exec_lock(struct xe_validation_ctx *ctx, struct
> > drm_gpuvm_exec *vm_exec,
> > +			    struct xe_validation_device *val);
> > +
> > +void xe_validation_ctx_fini(struct xe_validation_ctx *ctx);
> > +
> > +bool xe_validation_should_retry(struct xe_validation_ctx *ctx, int
> > *ret);
> > +
> > +/**
> > + * xe_validation_retry_on_oom() - Retry on oom in an xe_validaton
> > transaction
> > + * @_ctx: Pointer to the xe_validation_ctx
> > + * @_ret: The current error value possibly holding -ENOMEM
> > + *
> > + * Use this in way similar to drm_exec_retry_on_contention().
> > + * If @_ret contains -ENOMEM the tranaction is restarted once in a
> > way that
> > + * blocks other transactions and allows exhastive eviction. If the
> > transaction
> > + * was already restarted once, Just return the -ENOMEM. May also
> > set
> > + * _ret to -EINTR if not retrying and waits are interruptible.
> > + * May only be used within a drm_exec_until_all_locked() loop.
> > + */
> > +#define xe_validation_retry_on_oom(_ctx,
> > _ret)				\
> > +	do
> > {								\
> > +		if (xe_validation_should_retry(_ctx,
> > _ret))		\
> > +			goto
> > *__drm_exec_retry_ptr;			\
> > +	} while (0)
> > +
> > +/**
> > + * xe_validation_device_init - Initialize a struct
> > xe_validation_device
> > + * @val: The xe_validation_device to init.
> > + */
> > +static inline void
> > +xe_validation_device_init(struct xe_validation_device *val)
> > +{
> > +	init_rwsem(&val->lock);
> > +}
> > +
> > +/*
> > + * Make guard() and scoped_guard() work with xe_validation_ctx
> > + * so that we can exit transactions without caring about the
> > + * cleanup.
> > + */
> > +DEFINE_CLASS(xe_validation, struct xe_validation_ctx *,
> > +	     if (!IS_ERR(_T)) xe_validation_ctx_fini(_T);,
> 
> I think this should be 'if (_T)', right?
> 
> > +	     ({_ret = xe_validation_ctx_init(_ctx, _val, _exec,
> > _flags, 0, _excl);
> > +	       _ret ? NULL : _ctx; }),
> 
> Or here '_ret ? ERR_PTR(_ret) : _ctx;'
> 
> One or the other if I correctly understand how DEFINE_CLASS works.

Right. good catch.

> 
> Matt
> 
> > +	     struct xe_validation_ctx *_ctx, struct
> > xe_validation_device *_val,
> > +	     struct drm_exec *_exec, u32 _flags, int _ret, bool
> > _excl);
> > +static inline void
> > *class_xe_validation_lock_ptr(class_xe_validation_t *_T)
> > +{return *_T; }
> > +#define class_xe_validation_is_conditional false
> > +
> > +/**
> > + * xe_validation_guard() - An auto-cleanup xe_validation_ctx
> > transaction
> > + * @_ctx: The xe_validation_ctx.
> > + * @_val: The xe_validation_device.
> > + * @_exec: The struct drm_exec object
> > + * @_flags: Flags for the drm_exec transaction. See the struct
> > drm_exec documention!
> > + * @_ret: Return in / out parameter. May be set by this macro.
> > Typicall 0 when called.
> > + * @_excl: Whether to start in exclusive mode already in the first
> > iteration.
> > + *
> > + * This macro is will initiate a drm_exec transaction with
> > additional support for
> > + * exhaustive eviction.
> > + */
> > +#define xe_validation_guard(_ctx, _val, _exec, _flags, _ret,
> > _excl)	\
> > +	scoped_guard(xe_validation, _ctx, _val, _exec, _flags,
> > _ret, _excl) \
> > +	drm_exec_until_all_locked(_exec)
> > +
> >  #endif
> > -- 
> > 2.50.1
> >