[Intel-xe] [PATCH v2] RFC: drm/xe: Return correct error code for xe_wait_user_fence_ioctl
Zeng, Oak
oak.zeng at intel.com
Fri Nov 24 18:18:04 UTC 2023
This version seems good to me. See two small comments inline.
Oak
> -----Original Message-----
> From: Bommu, Krishnaiah <krishnaiah.bommu at intel.com>
> Sent: Tuesday, November 21, 2023 11:22 PM
> To: intel-xe at lists.freedesktop.org
> Cc: Bommu, Krishnaiah <krishnaiah.bommu at intel.com>; Zeng, Oak
> <oak.zeng at intel.com>; Kempczynski, Zbigniew
> <zbigniew.kempczynski at intel.com>; Brost, Matthew
> <matthew.brost at intel.com>
> Subject: [PATCH v2] RFC: drm/xe: Return correct error code for
> xe_wait_user_fence_ioctl
>
> return correct error code if exec_queue is reset/engine is hung
> remove the num_engines/instances members from drm_xe_wait_user_fence
> structure
> and add a exec_queue_id member
You also added a reset_status interface to exec_q_ops, need to mention it in description. Or if you want, I think this internal interface change can be a separate patch in this series. A small patch each only do one simple thing always help people to review.
>
> v2: Addressed the review comments
>
> Need to validated the changes
>
> Signed-off-by: Bommu Krishnaiah <krishnaiah.bommu at intel.com>
> Cc: Oak Zeng <oak.zeng at intel.com>
> Cc: Kempczynski Zbigniew <Zbigniew.Kempczynski at intel.com>
> Cc: Matthew Brost <matthew.brost at intel.com>
> ---
> drivers/gpu/drm/xe/xe_exec_queue_types.h | 2 +
> drivers/gpu/drm/xe/xe_execlist.c | 7 +++
> drivers/gpu/drm/xe/xe_guc_submit.c | 13 +++++
> drivers/gpu/drm/xe/xe_wait_user_fence.c | 60 +++++-------------------
> include/uapi/drm/xe_drm.h | 13 +----
> 5 files changed, 37 insertions(+), 58 deletions(-)
>
> diff --git a/drivers/gpu/drm/xe/xe_exec_queue_types.h
> b/drivers/gpu/drm/xe/xe_exec_queue_types.h
> index 5ba47a5cfdbd..84ccf7242247 100644
> --- a/drivers/gpu/drm/xe/xe_exec_queue_types.h
> +++ b/drivers/gpu/drm/xe/xe_exec_queue_types.h
> @@ -206,6 +206,8 @@ struct xe_exec_queue_ops {
> * signalled when this function is called.
> */
> void (*resume)(struct xe_exec_queue *q);
> + /** @reset_status: check exec queue reset status */
> + bool (*reset_status)(struct xe_exec_queue *q);
> };
>
> #endif
> diff --git a/drivers/gpu/drm/xe/xe_execlist.c
> b/drivers/gpu/drm/xe/xe_execlist.c
> index e8754adfc52a..3b390f047108 100644
> --- a/drivers/gpu/drm/xe/xe_execlist.c
> +++ b/drivers/gpu/drm/xe/xe_execlist.c
> @@ -446,6 +446,12 @@ static void execlist_exec_queue_resume(struct
> xe_exec_queue *q)
> /* NIY */
> }
>
> +static bool execlist_exec_queue_reset_status(struct xe_exec_queue *q)
> +{
> + /* NIY */
> + return false;
> +}
> +
> static const struct xe_exec_queue_ops execlist_exec_queue_ops = {
> .init = execlist_exec_queue_init,
> .kill = execlist_exec_queue_kill,
> @@ -457,6 +463,7 @@ static const struct xe_exec_queue_ops
> execlist_exec_queue_ops = {
> .suspend = execlist_exec_queue_suspend,
> .suspend_wait = execlist_exec_queue_suspend_wait,
> .resume = execlist_exec_queue_resume,
> + .reset_status = execlist_exec_queue_reset_status,
> };
>
> int xe_execlist_init(struct xe_gt *gt)
> diff --git a/drivers/gpu/drm/xe/xe_guc_submit.c
> b/drivers/gpu/drm/xe/xe_guc_submit.c
> index 9e9e925c7353..e13792e49a67 100644
> --- a/drivers/gpu/drm/xe/xe_guc_submit.c
> +++ b/drivers/gpu/drm/xe/xe_guc_submit.c
> @@ -852,6 +852,10 @@ static void simple_error_capture(struct xe_exec_queue
> *q)
> static void xe_guc_exec_queue_trigger_cleanup(struct xe_exec_queue *q)
> {
> struct xe_guc *guc = exec_queue_to_guc(q);
> + struct xe_device *xe = guc_to_xe(guc);
> +
> + /** to wakeup xe_wait_user_fence ioctl if exec queue is reset */
> + wake_up_all(&xe->ufence_wq);
>
> if (xe_exec_queue_is_lr(q))
> queue_work(guc_to_gt(guc)->ordered_wq, &q->guc->lr_tdr);
> @@ -1392,6 +1396,14 @@ static void guc_exec_queue_resume(struct
> xe_exec_queue *q)
> guc_exec_queue_add_msg(q, msg, RESUME);
> }
>
> +static bool guc_exec_queue_reset_status(struct xe_exec_queue *q)
> +{
> + if (exec_queue_registered(q))
Do you really need this check? If a queue is not registered yet, how can it be reset?
Oak
> + return exec_queue_reset(q);
> +
> + return false;
> +}
> +
> /*
> * All of these functions are an abstraction layer which other parts of XE can
> * use to trap into the GuC backend. All of these functions, aside from init,
> @@ -1409,6 +1421,7 @@ static const struct xe_exec_queue_ops
> guc_exec_queue_ops = {
> .suspend = guc_exec_queue_suspend,
> .suspend_wait = guc_exec_queue_suspend_wait,
> .resume = guc_exec_queue_resume,
> + .reset_status = guc_exec_queue_reset_status,
> };
>
> static void guc_exec_queue_stop(struct xe_guc *guc, struct xe_exec_queue *q)
> diff --git a/drivers/gpu/drm/xe/xe_wait_user_fence.c
> b/drivers/gpu/drm/xe/xe_wait_user_fence.c
> index 4d5c2555ce41..97af879ee923 100644
> --- a/drivers/gpu/drm/xe/xe_wait_user_fence.c
> +++ b/drivers/gpu/drm/xe/xe_wait_user_fence.c
> @@ -13,6 +13,7 @@
> #include "xe_device.h"
> #include "xe_gt.h"
> #include "xe_macros.h"
> +#include "xe_exec_queue.h"
>
> static int do_compare(u64 addr, u64 value, u64 mask, u16 op)
> {
> @@ -58,27 +59,6 @@ static const enum xe_engine_class
> user_to_xe_engine_class[] = {
> [DRM_XE_ENGINE_CLASS_COMPUTE] = XE_ENGINE_CLASS_COMPUTE,
> };
>
> -static int check_hw_engines(struct xe_device *xe,
> - struct drm_xe_engine_class_instance *eci,
> - int num_engines)
> -{
> - int i;
> -
> - for (i = 0; i < num_engines; ++i) {
> - enum xe_engine_class user_class =
> - user_to_xe_engine_class[eci[i].engine_class];
> -
> - if (eci[i].gt_id >= xe->info.tile_count)
> - return -EINVAL;
> -
> - if (!xe_gt_hw_engine(xe_device_get_gt(xe, eci[i].gt_id),
> - user_class, eci[i].engine_instance, true))
> - return -EINVAL;
> - }
> -
> - return 0;
> -}
> -
> #define VALID_FLAGS (DRM_XE_UFENCE_WAIT_FLAG_SOFT_OP | \
> DRM_XE_UFENCE_WAIT_FLAG_ABSTIME)
> #define MAX_OP DRM_XE_UFENCE_WAIT_OP_LTE
> @@ -130,14 +110,12 @@ int xe_wait_user_fence_ioctl(struct drm_device *dev,
> void *data,
> struct drm_file *file)
> {
> struct xe_device *xe = to_xe_device(dev);
> + struct xe_file *xef = to_xe_file(file);
> DEFINE_WAIT_FUNC(w_wait, woken_wake_function);
> struct drm_xe_wait_user_fence *args = data;
> - struct drm_xe_engine_class_instance
> eci[XE_HW_ENGINE_MAX_INSTANCE];
> - struct drm_xe_engine_class_instance __user *user_eci =
> - u64_to_user_ptr(args->instances);
> + struct xe_exec_queue *q = NULL;
> u64 addr = args->addr;
> int err;
> - bool no_engines = args->flags &
> DRM_XE_UFENCE_WAIT_FLAG_SOFT_OP;
> long timeout;
> ktime_t start;
>
> @@ -151,35 +129,17 @@ int xe_wait_user_fence_ioctl(struct drm_device *dev,
> void *data,
> if (XE_IOCTL_DBG(xe, args->op > MAX_OP))
> return -EINVAL;
>
> - if (XE_IOCTL_DBG(xe, no_engines &&
> - (args->num_engines || args->instances)))
> - return -EINVAL;
> -
> - if (XE_IOCTL_DBG(xe, !no_engines && !args->num_engines))
> - return -EINVAL;
> -
> if (XE_IOCTL_DBG(xe, addr & 0x7))
> return -EINVAL;
>
> - if (XE_IOCTL_DBG(xe, args->num_engines >
> XE_HW_ENGINE_MAX_INSTANCE))
> - return -EINVAL;
> -
> - if (!no_engines) {
> - err = copy_from_user(eci, user_eci,
> - sizeof(struct drm_xe_engine_class_instance) *
> - args->num_engines);
> - if (XE_IOCTL_DBG(xe, err))
> - return -EFAULT;
> -
> - if (XE_IOCTL_DBG(xe, check_hw_engines(xe, eci,
> - args->num_engines)))
> - return -EINVAL;
> - }
> -
> timeout = to_jiffies_timeout(xe, args);
>
> start = ktime_get();
>
> + q = xe_exec_queue_lookup(xef, args->exec_queue_id);
> + if (XE_IOCTL_DBG(xe, !q))
> + return -ENOENT;
> +
> /*
> * FIXME: Very simple implementation at the moment, single wait queue
> * for everything. Could be optimized to have a wait queue for every
> @@ -202,6 +162,12 @@ int xe_wait_user_fence_ioctl(struct drm_device *dev,
> void *data,
> break;
> }
>
> + if (q->ops->reset_status(q)) {
> + drm_info(&xe->drm, "exec gueue reset detected\n");
> + err = -EIO;
> + break;
> + }
> +
> timeout = wait_woken(&w_wait, TASK_INTERRUPTIBLE, timeout);
> }
> remove_wait_queue(&xe->ufence_wq, &w_wait);
> diff --git a/include/uapi/drm/xe_drm.h b/include/uapi/drm/xe_drm.h
> index 88f3aca02b08..ae2457dc30ad 100644
> --- a/include/uapi/drm/xe_drm.h
> +++ b/include/uapi/drm/xe_drm.h
> @@ -962,17 +962,8 @@ struct drm_xe_wait_user_fence {
> */
> __s64 timeout;
>
> - /**
> - * @num_engines: number of engine instances to wait on, must be zero
> - * when DRM_XE_UFENCE_WAIT_FLAG_SOFT_OP set
> - */
> - __u64 num_engines;
> -
> - /**
> - * @instances: user pointer to array of drm_xe_engine_class_instance to
> - * wait on, must be NULL when DRM_XE_UFENCE_WAIT_FLAG_SOFT_OP
> set
> - */
> - __u64 instances;
> + /** @exec_queue_id: exec_queue_id returned from
> xe_exec_queue_create_ioctl */
> + __u32 exec_queue_id;
>
> /** @reserved: Reserved */
> __u64 reserved[2];
> --
> 2.25.1
More information about the Intel-xe
mailing list