[Intel-xe] [PATCH] drm/xe: Ban a VM if rebind worker hits an error
Matthew Brost
matthew.brost at intel.com
Sun May 21 18:21:08 UTC 2023
We cannot recover a VM if a rebind worker hits an error, ban the VM if
happens to ensure we do not attempt to place this VM on the hardware
again.
A follow up will inform the user if this happens.
Signed-off-by: Matthew Brost <matthew.brost at intel.com>
---
drivers/gpu/drm/xe/xe_engine.c | 5 +++++
drivers/gpu/drm/xe/xe_exec.c | 2 +-
drivers/gpu/drm/xe/xe_trace.h | 5 +++++
drivers/gpu/drm/xe/xe_vm.c | 27 +++++++++++++++++++++++----
drivers/gpu/drm/xe/xe_vm.h | 10 ++++++++++
drivers/gpu/drm/xe/xe_vm_madvise.c | 2 +-
drivers/gpu/drm/xe/xe_vm_types.h | 5 +++--
7 files changed, 48 insertions(+), 8 deletions(-)
diff --git a/drivers/gpu/drm/xe/xe_engine.c b/drivers/gpu/drm/xe/xe_engine.c
index 094ec17d3004..06c60e11cd85 100644
--- a/drivers/gpu/drm/xe/xe_engine.c
+++ b/drivers/gpu/drm/xe/xe_engine.c
@@ -594,6 +594,11 @@ int xe_engine_create_ioctl(struct drm_device *dev, void *data,
if (XE_IOCTL_ERR(xe, !vm))
return -ENOENT;
+ if (XE_IOCTL_ERR(xe, xe_vm_is_closed_or_banned(vm))) {
+ xe_vm_put(vm);
+ return -ENOENT;
+ }
+
e = xe_engine_create(xe, vm, logical_mask,
args->width, hwe, ENGINE_FLAG_PERSISTENT);
xe_vm_put(vm);
diff --git a/drivers/gpu/drm/xe/xe_exec.c b/drivers/gpu/drm/xe/xe_exec.c
index 3db1b159586e..c36b30696fad 100644
--- a/drivers/gpu/drm/xe/xe_exec.c
+++ b/drivers/gpu/drm/xe/xe_exec.c
@@ -292,7 +292,7 @@ int xe_exec_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
if (err)
goto err_unlock_list;
- if (xe_vm_is_closed(engine->vm)) {
+ if (xe_vm_is_closed_or_banned(engine->vm)) {
drm_warn(&xe->drm, "Trying to schedule after vm is closed\n");
err = -EIO;
goto err_engine_end;
diff --git a/drivers/gpu/drm/xe/xe_trace.h b/drivers/gpu/drm/xe/xe_trace.h
index 2f8eb7ebe9a7..eb89ac934394 100644
--- a/drivers/gpu/drm/xe/xe_trace.h
+++ b/drivers/gpu/drm/xe/xe_trace.h
@@ -472,6 +472,11 @@ DECLARE_EVENT_CLASS(xe_vm,
__entry->asid)
);
+DEFINE_EVENT(xe_vm, xe_vm_kill,
+ TP_PROTO(struct xe_vm *vm),
+ TP_ARGS(vm)
+);
+
DEFINE_EVENT(xe_vm, xe_vm_create,
TP_PROTO(struct xe_vm *vm),
TP_ARGS(vm)
diff --git a/drivers/gpu/drm/xe/xe_vm.c b/drivers/gpu/drm/xe/xe_vm.c
index a0306526b269..94b987a781c7 100644
--- a/drivers/gpu/drm/xe/xe_vm.c
+++ b/drivers/gpu/drm/xe/xe_vm.c
@@ -511,6 +511,24 @@ void xe_vm_unlock_dma_resv(struct xe_vm *vm,
#define XE_VM_REBIND_RETRY_TIMEOUT_MS 1000
+static void xe_vm_kill(struct xe_vm *vm)
+{
+ struct ww_acquire_ctx ww;
+ struct xe_engine *e;
+
+ lockdep_assert_held(&vm->lock);
+
+ xe_vm_lock(vm, &ww, 0, false);
+ vm->flags |= XE_VM_FLAG_BANNED;
+ trace_xe_vm_kill(vm);
+
+ list_for_each_entry(e, &vm->preempt.engines, compute.link)
+ e->ops->kill(e);
+ xe_vm_unlock(vm, &ww);
+
+ /* TODO: Inform user the VM is banned */
+}
+
static void preempt_rebind_work_func(struct work_struct *w)
{
struct xe_vm *vm = container_of(w, struct xe_vm, preempt.rebind_work);
@@ -530,7 +548,7 @@ static void preempt_rebind_work_func(struct work_struct *w)
XE_BUG_ON(!xe_vm_in_compute_mode(vm));
trace_xe_vm_rebind_worker_enter(vm);
- if (xe_vm_is_closed(vm)) {
+ if (xe_vm_is_closed_or_banned(vm)) {
trace_xe_vm_rebind_worker_exit(vm);
return;
}
@@ -659,11 +677,12 @@ static void preempt_rebind_work_func(struct work_struct *w)
goto retry;
}
}
+ if (err)
+ xe_vm_kill(vm);
up_write(&vm->lock);
free_preempt_fences(&preempt_fences);
- XE_WARN_ON(err < 0); /* TODO: Kill VM or put in error state */
trace_xe_vm_rebind_worker_exit(vm);
}
@@ -1070,7 +1089,7 @@ xe_vm_find_overlapping_vma(struct xe_vm *vm, const struct xe_vma *vma)
{
struct rb_node *node;
- if (xe_vm_is_closed(vm))
+ if (xe_vm_is_closed_or_banned(vm))
return NULL;
XE_BUG_ON(vma->end >= vm->size);
@@ -3016,7 +3035,7 @@ int xe_vm_bind_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
goto free_objs;
}
- if (XE_IOCTL_ERR(xe, xe_vm_is_closed(vm))) {
+ if (XE_IOCTL_ERR(xe, xe_vm_is_closed_or_banned(vm))) {
DRM_ERROR("VM closed while we began looking up?\n");
err = -ENOENT;
goto put_vm;
diff --git a/drivers/gpu/drm/xe/xe_vm.h b/drivers/gpu/drm/xe/xe_vm.h
index 748dc16ebed9..24b05aa5cbc7 100644
--- a/drivers/gpu/drm/xe/xe_vm.h
+++ b/drivers/gpu/drm/xe/xe_vm.h
@@ -49,6 +49,16 @@ static inline bool xe_vm_is_closed(struct xe_vm *vm)
return !vm->size;
}
+static inline bool xe_vm_is_banned(struct xe_vm *vm)
+{
+ return vm->flags & XE_VM_FLAG_BANNED;
+}
+
+static inline bool xe_vm_is_closed_or_banned(struct xe_vm *vm)
+{
+ return xe_vm_is_closed(vm) || xe_vm_is_banned(vm);
+}
+
struct xe_vma *
xe_vm_find_overlapping_vma(struct xe_vm *vm, const struct xe_vma *vma);
diff --git a/drivers/gpu/drm/xe/xe_vm_madvise.c b/drivers/gpu/drm/xe/xe_vm_madvise.c
index 29815852985a..e29bf13f54f1 100644
--- a/drivers/gpu/drm/xe/xe_vm_madvise.c
+++ b/drivers/gpu/drm/xe/xe_vm_madvise.c
@@ -311,7 +311,7 @@ int xe_vm_madvise_ioctl(struct drm_device *dev, void *data,
if (XE_IOCTL_ERR(xe, !vm))
return -EINVAL;
- if (XE_IOCTL_ERR(xe, xe_vm_is_closed(vm))) {
+ if (XE_IOCTL_ERR(xe, xe_vm_is_closed_or_banned(vm))) {
err = -ENOENT;
goto put_vm;
}
diff --git a/drivers/gpu/drm/xe/xe_vm_types.h b/drivers/gpu/drm/xe/xe_vm_types.h
index fada7896867f..84cd295e6fd6 100644
--- a/drivers/gpu/drm/xe/xe_vm_types.h
+++ b/drivers/gpu/drm/xe/xe_vm_types.h
@@ -178,8 +178,9 @@ struct xe_vm {
#define XE_VM_FLAG_MIGRATION BIT(3)
#define XE_VM_FLAG_SCRATCH_PAGE BIT(4)
#define XE_VM_FLAG_FAULT_MODE BIT(5)
-#define XE_VM_FLAG_GT_ID(flags) (((flags) >> 6) & 0x3)
-#define XE_VM_FLAG_SET_GT_ID(gt) ((gt)->info.id << 6)
+#define XE_VM_FLAG_BANNED BIT(6)
+#define XE_VM_FLAG_GT_ID(flags) (((flags) >> 7) & 0x3)
+#define XE_VM_FLAG_SET_GT_ID(gt) ((gt)->info.id << 7)
unsigned long flags;
/** @composite_fence_ctx: context composite fence */
--
2.34.1
More information about the Intel-xe
mailing list