[PATCH v2 05/16] drm/xe: Introduce an xe_validation wrapper around drm_exec

Thomas Hellström thomas.hellstrom at linux.intel.com
Fri Aug 22 09:40:19 UTC 2025


Introduce a validation wrapper xe_validation_guard() as a helper
intended to be used around drm_exec transactions what perform
validations. Once TTM can handle exhaustive eviction we could
remove this wrapper or make it mostly a NO-OP unless other
functionality is added to it.

Currently the wrapper takes a read lock upon entry and if the
transaction hits an OOM, all locks are released and the
transaction is retried with a write-lock. If all other
validations participate in this scheme, the transaction with
the write lock will be the only transaction validating and
should have access to all available non-pinned memory.

There is currently a problem in that TTM converts -EDEADLOCKS to
-ENOMEM, and with ww_mutex slowpath error injections, we can hit
-ENOMEMs without having actually ran out of memory. We abuse
ww_mutex internals to detect such situations until TTM is fixes
to not convert the error code. In the meantime, injecting
ww_mutex slowpath -EDEADLOCKs is a good way to test
the implementation in the absence of real OOMs.

Just introduce the wrapper in this commit. It will be hooked up
to the driver in following commits.

v2:
- Mark class_xe_validation conditional so that the loop is
  skipped on initialization error.
- Argument sanitation (Matt Brost)
- Fix conditional execution of xe_validation_ctx_fini()
  (Matt Brost)
- Add a no_block mode for upcoming use in the CPU fault handler.

Signed-off-by: Thomas Hellström <thomas.hellstrom at linux.intel.com>
---
 drivers/gpu/drm/xe/xe_validation.c | 228 +++++++++++++++++++++++++++++
 drivers/gpu/drm/xe/xe_validation.h | 122 +++++++++++++++
 2 files changed, 350 insertions(+)

diff --git a/drivers/gpu/drm/xe/xe_validation.c b/drivers/gpu/drm/xe/xe_validation.c
index cc0684d24e02..b90fda3dd5f4 100644
--- a/drivers/gpu/drm/xe/xe_validation.c
+++ b/drivers/gpu/drm/xe/xe_validation.c
@@ -5,6 +5,7 @@
 #include "xe_bo.h"
 #include <drm/drm_exec.h>
 #include <drm/drm_gem.h>
+#include <drm/drm_gpuvm.h>
 
 #include "xe_assert.h"
 #include "xe_validation.h"
@@ -47,3 +48,230 @@ void xe_validation_assert_exec(const struct xe_device *xe,
 	}
 }
 #endif
+
+static int xe_validation_lock(struct xe_validation_ctx *ctx)
+{
+	struct xe_validation_device *val = ctx->val;
+	int ret = 0;
+
+	if (ctx->val_flags.interruptible) {
+		if (ctx->request_exclusive)
+			ret = down_write_killable(&val->lock);
+		else
+			ret = down_read_interruptible(&val->lock);
+	} else {
+		if (ctx->request_exclusive)
+			down_write(&val->lock);
+		else
+			down_read(&val->lock);
+	}
+
+	if (!ret) {
+		ctx->lock_held = true;
+		ctx->lock_held_exclusive = ctx->request_exclusive;
+	}
+
+	return ret;
+}
+
+static int xe_validation_trylock(struct xe_validation_ctx *ctx)
+{
+	struct xe_validation_device *val = ctx->val;
+	bool locked;
+
+	if (ctx->request_exclusive)
+		locked = down_write_trylock(&val->lock);
+	else
+		locked = down_read_trylock(&val->lock);
+
+	if (locked) {
+		ctx->lock_held = true;
+		ctx->lock_held_exclusive = ctx->request_exclusive;
+	}
+
+	return locked ? 0 : -EWOULDBLOCK;
+}
+
+static void xe_validation_unlock(struct xe_validation_ctx *ctx)
+{
+	if (!ctx->lock_held)
+		return;
+
+	if (ctx->lock_held_exclusive)
+		up_write(&ctx->val->lock);
+	else
+		up_read(&ctx->val->lock);
+
+	ctx->lock_held = false;
+}
+
+/**
+ * xe_validation_ctx_init() - Initialize an xe_validation_ctx
+ * @ctx: The xe_validation_ctx to initialize.
+ * @val: The xe_validation_device representing the validation domain.
+ * @exec: The struct drm_exec to use for the transaction. May be NULL.
+ * @flags: The flags to use for initialization.
+ *
+ * Initialize and lock a an xe_validation transaction using the validation domain
+ * represented by @val. Also initialize the drm_exec object forwarding parts of
+ * @flags to the drm_exec initialization. The @flags.exclusive flag should
+ * typically be set to false to avoid locking out other validators from the
+ * domain until an OOM is hit. For testing- or final attempt purposes it can,
+ * however, be set to true.
+ *
+ * Return: %0 on success, %-EINTR if interruptible initial locking failed with a
+ * signal pending. If @flags.no_block is set to true, a failed trylock
+ * returns %-EWOULDBLOCK.
+ */
+int xe_validation_ctx_init(struct xe_validation_ctx *ctx, struct xe_validation_device *val,
+			   struct drm_exec *exec, const struct xe_val_flags flags)
+{
+	int ret;
+
+	ctx->exec = exec;
+	ctx->val = val;
+	ctx->lock_held = false;
+	ctx->lock_held_exclusive = false;
+	ctx->request_exclusive = flags.exclusive;
+	ctx->val_flags = flags;
+	ctx->exec_flags = 0;
+	ctx->nr = 0;
+
+	if (flags.no_block)
+		ret = xe_validation_trylock(ctx);
+	else
+		ret = xe_validation_lock(ctx);
+	if (ret)
+		return ret;
+
+	if (exec) {
+		if (flags.interruptible)
+			ctx->exec_flags |= DRM_EXEC_INTERRUPTIBLE_WAIT;
+		if (flags.exec_ignore_duplicates)
+			ctx->exec_flags |= DRM_EXEC_IGNORE_DUPLICATES;
+		drm_exec_init(exec, ctx->exec_flags, ctx->nr);
+	}
+
+	return 0;
+}
+
+#ifdef CONFIG_DEBUG_WW_MUTEX_SLOWPATH
+/*
+ * This abuses both drm_exec and ww_mutex internals and should be
+ * replaced by checking for -EDEADLK when we can make TTM
+ * stop converting -EDEADLK to -ENOMEM.
+ * An alternative is to not have exhaustive eviction with
+ * CONFIG_DEBUG_WW_MUTEX_SLOWPATH until that happens.
+ */
+static bool xe_validation_contention_injected(struct drm_exec *exec)
+{
+	return !!exec->ticket.contending_lock;
+}
+
+#else
+
+static bool xe_validation_contention_injected(struct drm_exec *exec)
+{
+	return false;
+}
+
+#endif
+
+static bool __xe_validation_should_retry(struct xe_validation_ctx *ctx, int ret)
+{
+	if (ret == -ENOMEM &&
+	    ((ctx->request_exclusive &&
+	      xe_validation_contention_injected(ctx->exec)) ||
+	     !ctx->request_exclusive)) {
+		ctx->request_exclusive = true;
+		return true;
+	}
+
+	return false;
+}
+
+/**
+ * xe_validation_exec_lock() - Perform drm_gpuvm_exec_lock within a validation
+ * transaction.
+ * @ctx: An uninitialized xe_validation_ctx.
+ * @vm_exec: An initialized struct vm_exec.
+ * @val: The validation domain.
+ *
+ * The drm_gpuvm_exec_lock() function internally initializes its drm_exec
+ * transaction and therefore doesn't lend itself very well to be using
+ * xe_validation_ctx_init(). Provide a helper that takes an uninitialized
+ * xe_validation_ctx and calls drm_gpuvm_exec_lock() with OOM retry.
+ *
+ * Return: %0 on success, negative error code on failure.
+ */
+int xe_validation_exec_lock(struct xe_validation_ctx *ctx,
+			    struct drm_gpuvm_exec *vm_exec,
+			    struct xe_validation_device *val)
+{
+	int ret;
+
+	memset(ctx, 0, sizeof(*ctx));
+	ctx->exec = &vm_exec->exec;
+	ctx->exec_flags = vm_exec->flags;
+	ctx->val = val;
+	if (ctx->exec_flags & DRM_EXEC_INTERRUPTIBLE_WAIT)
+		ctx->val_flags.interruptible = 1;
+	if (ctx->exec_flags & DRM_EXEC_IGNORE_DUPLICATES)
+		ctx->val_flags.exec_ignore_duplicates = 1;
+retry:
+	ret = xe_validation_lock(ctx);
+	if (ret)
+		return ret;
+
+	ret = drm_gpuvm_exec_lock(vm_exec);
+	if (ret) {
+		xe_validation_unlock(ctx);
+		if (__xe_validation_should_retry(ctx, ret))
+			goto retry;
+	}
+
+	return ret;
+}
+
+/**
+ * xe_validation_ctx_fini() - Finalize a validation transaction
+ * @ctx: The Validation transaction to finalize.
+ *
+ * Finalize a validation transaction and its related drm_exec transaction.
+ */
+void xe_validation_ctx_fini(struct xe_validation_ctx *ctx)
+{
+	drm_exec_fini(ctx->exec);
+	xe_validation_unlock(ctx);
+}
+
+/**
+ * xe_validation_should_retry() - Determine if a validation transaction should retry
+ * @ctx: The validation transaction.
+ * @ret: Pointer to a return value variable.
+ *
+ * Determines whether a validation transaction should retry based on the
+ * internal transaction state and the return value pointed to by @ret.
+ * If a validation should be retried, the transaction is prepared for that,
+ * and the validation locked might be re-locked in exclusive mode, and *@ret
+ * is set to %0. If the re-locking errors, typically due to interruptible
+ * locking with signal pending, *@ret is instead set to -EINTR and the
+ * function returns %false.
+ *
+ * Return: %true if validation should be retried, %false otherwise.
+ */
+bool xe_validation_should_retry(struct xe_validation_ctx *ctx, int *ret)
+{
+	if (__xe_validation_should_retry(ctx, *ret)) {
+		drm_exec_fini(ctx->exec);
+		*ret = 0;
+		if (ctx->request_exclusive != ctx->lock_held_exclusive) {
+			xe_validation_unlock(ctx);
+			*ret = xe_validation_lock(ctx);
+		}
+		drm_exec_init(ctx->exec, ctx->exec_flags, ctx->nr);
+		return !*ret;
+	}
+
+	return false;
+}
diff --git a/drivers/gpu/drm/xe/xe_validation.h b/drivers/gpu/drm/xe/xe_validation.h
index db50feacad7a..36860974165e 100644
--- a/drivers/gpu/drm/xe/xe_validation.h
+++ b/drivers/gpu/drm/xe/xe_validation.h
@@ -7,9 +7,11 @@
 
 #include <linux/dma-resv.h>
 #include <linux/types.h>
+#include <linux/rwsem.h>
 
 struct drm_exec;
 struct drm_gem_object;
+struct drm_gpuvm_exec;
 struct xe_device;
 
 #ifdef CONFIG_PROVE_LOCKING
@@ -66,4 +68,124 @@ void xe_validation_assert_exec(const struct xe_device *xe, const struct drm_exec
 	} while (0)
 #endif
 
+/**
+ * struct xe_validation_device - The domain for exhaustive eviction
+ * @lock: The lock used to exclude other processes from allocating graphics memory
+ *
+ * The struct xe_validation_device represents the domain for which we want to use
+ * exhaustive eviction. The @lock is typically grabbed in read mode for allocations
+ * but when graphics memory allocation fails, it is retried with the write mode held.
+ */
+struct xe_validation_device {
+	struct rw_semaphore lock;
+};
+
+/**
+ * struct xe_val_flags - Flags for xe_validation_ctx_init().
+ * @exclusive: Start the validation transaction by locking out all other validators.
+ * @no_block:  Don't block on initialization.
+ * @interruptible: Block interruptible if blocking. Implies initializing the drm_exec
+ * context with the DRM_EXEC_INTERRUPTIBLE_WAIT flag.
+ * @exec_ignore_dupllicates: Initialize the drm_exec context with the
+ * DRM_EXEC_IGNORE_DUPLICATES flag.
+ */
+struct xe_val_flags {
+	u32 exclusive :1;
+	u32 no_block :1;
+	u32 interruptible :1;
+	u32 exec_ignore_duplicates :1;
+};
+
+/**
+ * struct xe_validation_ctx - A struct drm_exec subclass with support for
+ * exhaustive eviction
+ * @exec: The drm_exec object base class. Note that we use a pointer instead of
+ * embedding to avoid diamond inheritance.
+ * @val: The exhaustive eviction domain.
+ * @lock_held: Whether The domain lock is currently held.
+ * @lock_held_exclusive: Whether the domain lock is held in exclusive mode.
+ * @request_exclusive: Whether to lock exclusively (write mode) the next time
+ * the domain lock is locked.
+ * @flags: The drm_exec flags used for drm_exec (re-)initialization.
+ * @nr: The drm_exec nr parameter used for drm_exec (re-)initializaiton.
+ */
+struct xe_validation_ctx {
+	struct drm_exec *exec;
+	struct xe_validation_device *val;
+	struct xe_val_flags val_flags;
+	bool lock_held;
+	bool lock_held_exclusive;
+	bool request_exclusive;
+	u32 exec_flags;
+	unsigned int nr;
+};
+
+int xe_validation_ctx_init(struct xe_validation_ctx *ctx, struct xe_validation_device *val,
+			   struct drm_exec *exec, const struct xe_val_flags flags);
+
+int xe_validation_exec_lock(struct xe_validation_ctx *ctx, struct drm_gpuvm_exec *vm_exec,
+			    struct xe_validation_device *val);
+
+void xe_validation_ctx_fini(struct xe_validation_ctx *ctx);
+
+bool xe_validation_should_retry(struct xe_validation_ctx *ctx, int *ret);
+
+/**
+ * xe_validation_retry_on_oom() - Retry on oom in an xe_validaton transaction
+ * @_ctx: Pointer to the xe_validation_ctx
+ * @_ret: The current error value possibly holding -ENOMEM
+ *
+ * Use this in way similar to drm_exec_retry_on_contention().
+ * If @_ret contains -ENOMEM the tranaction is restarted once in a way that
+ * blocks other transactions and allows exhastive eviction. If the transaction
+ * was already restarted once, Just return the -ENOMEM. May also set
+ * _ret to -EINTR if not retrying and waits are interruptible.
+ * May only be used within a drm_exec_until_all_locked() loop.
+ */
+#define xe_validation_retry_on_oom(_ctx, _ret)				\
+	do {								\
+		if (xe_validation_should_retry(_ctx, _ret))		\
+			goto *__drm_exec_retry_ptr;			\
+	} while (0)
+
+/**
+ * xe_validation_device_init - Initialize a struct xe_validation_device
+ * @val: The xe_validation_device to init.
+ */
+static inline void
+xe_validation_device_init(struct xe_validation_device *val)
+{
+	init_rwsem(&val->lock);
+}
+
+/*
+ * Make guard() and scoped_guard() work with xe_validation_ctx
+ * so that we can exit transactions without caring about the
+ * cleanup.
+ */
+DEFINE_CLASS(xe_validation, struct xe_validation_ctx *,
+	     if (_T) xe_validation_ctx_fini(_T);,
+	     ({_ret = xe_validation_ctx_init(_ctx, _val, _exec, _flags);
+	       _ret ? NULL : _ctx; }),
+	     struct xe_validation_ctx *_ctx, struct xe_validation_device *_val,
+	     struct drm_exec *_exec, const struct xe_val_flags _flags, int _ret);
+static inline void *class_xe_validation_lock_ptr(class_xe_validation_t *_T)
+{return *_T; }
+#define class_xe_validation_is_conditional true
+
+/**
+ * xe_validation_guard() - An auto-cleanup xe_validation_ctx transaction
+ * @_ctx: The xe_validation_ctx.
+ * @_val: The xe_validation_device.
+ * @_exec: The struct drm_exec object
+ * @_flags: Flags for the xe_validation_ctx initialization.
+ * @_ret: Return in / out parameter. May be set by this macro. Typicall 0 when called.
+ *
+ * This macro is will initiate a drm_exec transaction with additional support for
+ * exhaustive eviction.
+ */
+#define xe_validation_guard(_ctx, _val, _exec, _flags, _ret)		\
+	scoped_guard(xe_validation, _ctx, _val, _exec, _flags, _ret) \
+	drm_exec_until_all_locked(_exec)
+
 #endif
-- 
2.50.1



More information about the Intel-xe mailing list