[PATCH v3] drm/xe: Add timeout to preempt fences
Niranjana Vishwanathapura
niranjana.vishwanathapura at intel.com
Thu Jun 27 07:08:37 UTC 2024
On Tue, Jun 25, 2024 at 05:41:37PM -0700, Matthew Brost wrote:
>To adhere to dma-fencing rules that fences must signal within a
>reasonable amount of time, add a 5 second timeout to preempt fences. If
>this timeout occurs, kill the associated VM as the failure is fatal to
>the VM.
>
>v2:
> - Add comment for smp_wmb (Checkpatch)
> - Fix kernel doc typo (Inspection)
> - Add comment for killed check (Niranjana)
>v3:
> - Drop smp_wmb (Matthew Auld)
> - Don't take vm->lock in preempt fence worker (Matthew Auld)
> - Drop RB given changes to patch
>
>Cc: Matthew Auld <matthew.auld at intel.com>
>Cc: Niranjana Vishwanathapura <niranjana.vishwanathapura at intel.com>
>Signed-off-by: Matthew Brost <matthew.brost at intel.com>
>---
> drivers/gpu/drm/xe/xe_exec_queue_types.h | 6 ++--
> drivers/gpu/drm/xe/xe_execlist.c | 3 +-
> drivers/gpu/drm/xe/xe_guc_submit.c | 39 ++++++++++++++++++++----
> drivers/gpu/drm/xe/xe_preempt_fence.c | 12 ++++++--
> drivers/gpu/drm/xe/xe_vm.c | 14 +++++++--
> drivers/gpu/drm/xe/xe_vm.h | 2 ++
> 6 files changed, 62 insertions(+), 14 deletions(-)
>
>diff --git a/drivers/gpu/drm/xe/xe_exec_queue_types.h b/drivers/gpu/drm/xe/xe_exec_queue_types.h
>index 201588ec33c3..ded9f9396429 100644
>--- a/drivers/gpu/drm/xe/xe_exec_queue_types.h
>+++ b/drivers/gpu/drm/xe/xe_exec_queue_types.h
>@@ -172,9 +172,11 @@ struct xe_exec_queue_ops {
> int (*suspend)(struct xe_exec_queue *q);
> /**
> * @suspend_wait: Wait for an exec queue to suspend executing, should be
>- * call after suspend.
>+ * called after suspend. In the dma-fencing path this must return within
>+ * a reasonable amount of time. An -ETIME return indicates an error
>+ * waiting for suspend, resulting in the associated VM being killed.
> */
>- void (*suspend_wait)(struct xe_exec_queue *q);
>+ int (*suspend_wait)(struct xe_exec_queue *q);
> /**
> * @resume: Resume exec queue execution, exec queue must be in a suspended
> * state and dma fence returned from most recent suspend call must be
>diff --git a/drivers/gpu/drm/xe/xe_execlist.c b/drivers/gpu/drm/xe/xe_execlist.c
>index db906117db6d..7502e3486eaf 100644
>--- a/drivers/gpu/drm/xe/xe_execlist.c
>+++ b/drivers/gpu/drm/xe/xe_execlist.c
>@@ -422,10 +422,11 @@ static int execlist_exec_queue_suspend(struct xe_exec_queue *q)
> return 0;
> }
>
>-static void execlist_exec_queue_suspend_wait(struct xe_exec_queue *q)
>+static int execlist_exec_queue_suspend_wait(struct xe_exec_queue *q)
>
> {
> /* NIY */
>+ return 0;
> }
>
> static void execlist_exec_queue_resume(struct xe_exec_queue *q)
>diff --git a/drivers/gpu/drm/xe/xe_guc_submit.c b/drivers/gpu/drm/xe/xe_guc_submit.c
>index 373447758a60..b06321897cf3 100644
>--- a/drivers/gpu/drm/xe/xe_guc_submit.c
>+++ b/drivers/gpu/drm/xe/xe_guc_submit.c
>@@ -1301,6 +1301,15 @@ static void __guc_exec_queue_process_msg_set_sched_props(struct xe_sched_msg *ms
> kfree(msg);
> }
>
>+static void __suspend_fence_signal(struct xe_exec_queue *q)
>+{
>+ if (!q->guc->suspend_pending)
>+ return;
>+
>+ q->guc->suspend_pending = false;
Hmm...don't we need WRITE_ONCE() here to write the suspend_pending flag
(along with READ_ONCE() elsewhere), given it is also checked in the GuC
callback handler etc.? It is possible that it is not an issue on our
platforms though. In any case, this patch is not introducing this issue,
so I am fine here.
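
Just to illustrate what I mean (a rough, hypothetical sketch only, not
something this patch needs to address), the annotated writer side would
look roughly like:

static void __suspend_fence_signal(struct xe_exec_queue *q)
{
	/*
	 * Sketch: annotate the flag shared across threads so the
	 * compiler cannot tear or cache the accesses.
	 */
	if (!READ_ONCE(q->guc->suspend_pending))
		return;

	WRITE_ONCE(q->guc->suspend_pending, false);
	wake_up(&q->guc->suspend_wait);
}

with matching READ_ONCE() wherever the flag is checked outside of
wait_event().
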
>+ wake_up(&q->guc->suspend_wait);
>+}
>+
> static void suspend_fence_signal(struct xe_exec_queue *q)
> {
> struct xe_guc *guc = exec_queue_to_guc(q);
>@@ -1310,9 +1319,7 @@ static void suspend_fence_signal(struct xe_exec_queue *q)
> guc_read_stopped(guc));
> xe_assert(xe, q->guc->suspend_pending);
>
>- q->guc->suspend_pending = false;
>- smp_wmb();
>- wake_up(&q->guc->suspend_wait);
>+ __suspend_fence_signal(q);
> }
>
> static void __guc_exec_queue_process_msg_suspend(struct xe_sched_msg *msg)
>@@ -1465,6 +1472,7 @@ static void guc_exec_queue_kill(struct xe_exec_queue *q)
> {
> trace_xe_exec_queue_kill(q);
> set_exec_queue_killed(q);
>+ __suspend_fence_signal(q);
> xe_guc_exec_queue_trigger_cleanup(q);
> }
>
>@@ -1561,12 +1569,31 @@ static int guc_exec_queue_suspend(struct xe_exec_queue *q)
> return 0;
> }
>
>-static void guc_exec_queue_suspend_wait(struct xe_exec_queue *q)
>+static int guc_exec_queue_suspend_wait(struct xe_exec_queue *q)
> {
> struct xe_guc *guc = exec_queue_to_guc(q);
>+ int ret;
>+
>+ /*
>+ * Likely don't need to check exec_queue_killed() as we clear
>+ * suspend_pending upon kill, but to be paranoid about races in which
>+ * suspend_pending is set after kill, also check killed here.
>+ */
>+ ret = wait_event_timeout(q->guc->suspend_wait,
>+ !q->guc->suspend_pending ||
>+ exec_queue_killed(q) ||
>+ guc_read_stopped(guc),
>+ HZ * 5);
>
>- wait_event(q->guc->suspend_wait, !q->guc->suspend_pending ||
>- guc_read_stopped(guc));
>+ if (!ret) {
>+ xe_gt_warn(guc_to_gt(guc),
>+ "Suspend fence, guc_id=%d, failed to respond",
>+ q->guc->id);
>+ /* XXX: Trigger GT reset? */
>+ return -ETIME;
>+ }
>+
>+ return 0;
> }
>
> static void guc_exec_queue_resume(struct xe_exec_queue *q)
>diff --git a/drivers/gpu/drm/xe/xe_preempt_fence.c b/drivers/gpu/drm/xe/xe_preempt_fence.c
>index e8b8ae5c6485..56e709d2fb30 100644
>--- a/drivers/gpu/drm/xe/xe_preempt_fence.c
>+++ b/drivers/gpu/drm/xe/xe_preempt_fence.c
>@@ -17,10 +17,16 @@ static void preempt_fence_work_func(struct work_struct *w)
> container_of(w, typeof(*pfence), preempt_work);
> struct xe_exec_queue *q = pfence->q;
>
>- if (pfence->error)
>+ if (pfence->error) {
> dma_fence_set_error(&pfence->base, pfence->error);
>- else
>- q->ops->suspend_wait(q);
>+ } else if (!q->ops->reset_status(q)) {
>+ int err = q->ops->suspend_wait(q);
>+
>+ if (err)
>+ dma_fence_set_error(&pfence->base, err);
>+ } else {
>+ dma_fence_set_error(&pfence->base, -ENOENT);
>+ }
>
> dma_fence_signal(&pfence->base);
> /*
>diff --git a/drivers/gpu/drm/xe/xe_vm.c b/drivers/gpu/drm/xe/xe_vm.c
>index 5b166fa03684..bf1764249724 100644
>--- a/drivers/gpu/drm/xe/xe_vm.c
>+++ b/drivers/gpu/drm/xe/xe_vm.c
>@@ -133,8 +133,10 @@ static int wait_for_existing_preempt_fences(struct xe_vm *vm)
> if (q->lr.pfence) {
> long timeout = dma_fence_wait(q->lr.pfence, false);
>
>- if (timeout < 0)
>+ /* Only -ETIME on the fence indicates the VM needs to be killed */
>+ if (timeout < 0 || q->lr.pfence->error == -ETIME)
> return -ETIME;
>+
> dma_fence_put(q->lr.pfence);
> q->lr.pfence = NULL;
> }
>@@ -311,7 +313,15 @@ int __xe_vm_userptr_needs_repin(struct xe_vm *vm)
>
> #define XE_VM_REBIND_RETRY_TIMEOUT_MS 1000
>
>-static void xe_vm_kill(struct xe_vm *vm, bool unlocked)
>+/**
>+ * xe_vm_kill() - VM Kill
>+ * @vm: The VM.
>+ * @unlocked: Flag indicating the VM's dma-resv is not held
>+ *
>+ * Kill the VM by setting the banned flag, indicating the VM is no longer
>+ * available. If in preempt fence mode, also kill all exec queues of the VM.
>+ */
>+void xe_vm_kill(struct xe_vm *vm, bool unlocked)
> {
> struct xe_exec_queue *q;
>
>diff --git a/drivers/gpu/drm/xe/xe_vm.h b/drivers/gpu/drm/xe/xe_vm.h
>index b481608b12f1..c864dba35e1d 100644
>--- a/drivers/gpu/drm/xe/xe_vm.h
>+++ b/drivers/gpu/drm/xe/xe_vm.h
>@@ -259,6 +259,8 @@ static inline struct dma_resv *xe_vm_resv(struct xe_vm *vm)
> return drm_gpuvm_resv(&vm->gpuvm);
> }
>
>+void xe_vm_kill(struct xe_vm *vm, bool unlocked);
>+
> /**
> * xe_vm_assert_held(vm) - Assert that the vm's reservation object is held.
> * @vm: The vm
>--
The changes in xe_vm.c and xe_vm.h are not needed.
With that taken care of,
Reviewed-by: Niranjana Vishwanathapura <niranjana.vishwanathapura at intel.com>
Niranjana
>2.34.1
>