[Intel-xe] [PATCH] RFC: drm/xe: Return correct error code for xe_wait_user_fence_ioctl

Zeng, Oak oak.zeng at intel.com
Fri Nov 17 18:19:45 UTC 2023



> -----Original Message-----
> From: Bommu, Krishnaiah <krishnaiah.bommu at intel.com>
> Sent: Thursday, November 16, 2023 3:08 PM
> To: intel-xe at lists.freedesktop.org
> Cc: Bommu, Krishnaiah <krishnaiah.bommu at intel.com>; Zeng, Oak
> <oak.zeng at intel.com>; Kempczynski, Zbigniew
> <zbigniew.kempczynski at intel.com>
> Subject: [PATCH] RFC: drm/xe: Return correct error code for
> xe_wait_user_fence_ioctl
> 
> return correct error code if exec_queue is reset/engine is hung
> remove the num_engines/instances members from drm_xe_wait_user_fence
> structure
> and add a exec_queue_id member
> 
> Need to validated the changes
> 
> Signed-off-by: Bommu Krishnaiah <krishnaiah.bommu at intel.com>
> Cc: Oak Zeng <oak.zeng at intel.com>
> Cc: Kempczynski Zbigniew <Zbigniew.Kempczynski at intel.com>
> ---
>  drivers/gpu/drm/xe/xe_guc_submit.c      | 24 ++++------
>  drivers/gpu/drm/xe/xe_guc_submit.h      | 20 ++++++++
>  drivers/gpu/drm/xe/xe_wait_user_fence.c | 64 +++++++------------------
>  include/uapi/drm/xe_drm.h               | 16 ++-----
>  4 files changed, 50 insertions(+), 74 deletions(-)
> 
> diff --git a/drivers/gpu/drm/xe/xe_guc_submit.c
> b/drivers/gpu/drm/xe/xe_guc_submit.c
> index 9e9e925c7353..de2d2f7303d5 100644
> --- a/drivers/gpu/drm/xe/xe_guc_submit.c
> +++ b/drivers/gpu/drm/xe/xe_guc_submit.c
> @@ -42,21 +42,7 @@ exec_queue_to_guc(struct xe_exec_queue *q)
>  	return &q->gt->uc.guc;
>  }
> 
> -/*
> - * Helpers for engine state, using an atomic as some of the bits can transition
> - * as the same time (e.g. a suspend can be happning at the same time as
> schedule
> - * engine done being processed).
> - */
> -#define EXEC_QUEUE_STATE_REGISTERED		(1 << 0)
> -#define ENGINE_STATE_ENABLED		(1 << 1)
> -#define EXEC_QUEUE_STATE_PENDING_ENABLE	(1 << 2)
> -#define EXEC_QUEUE_STATE_PENDING_DISABLE	(1 << 3)
> -#define EXEC_QUEUE_STATE_DESTROYED		(1 << 4)
> -#define ENGINE_STATE_SUSPENDED		(1 << 5)
> -#define EXEC_QUEUE_STATE_RESET		(1 << 6)
> -#define ENGINE_STATE_KILLED		(1 << 7)


Why do you need to move those defines to the .h file? I don't see them used outside of xe_guc_submit.c.
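
If the only thing xe_wait_user_fence.c needs is the two helpers, exporting just the prototypes should be enough, with the EXEC_QUEUE_STATE_*/ENGINE_STATE_* bits staying private to xe_guc_submit.c, e.g. (sketch):

	/* xe_guc_submit.h: export only the predicates, keep the state bit
	 * defines in xe_guc_submit.c */
	bool exec_queue_registered(struct xe_exec_queue *q);
	bool exec_queue_reset(struct xe_exec_queue *q);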

> -
> -static bool exec_queue_registered(struct xe_exec_queue *q)
> +bool exec_queue_registered(struct xe_exec_queue *q)
>  {
>  	return atomic_read(&q->guc->state) &
> EXEC_QUEUE_STATE_REGISTERED;
>  }
> @@ -151,7 +137,7 @@ static void clear_exec_queue_suspended(struct
> xe_exec_queue *q)
>  	atomic_and(~ENGINE_STATE_SUSPENDED, &q->guc->state);
>  }
> 
> -static bool exec_queue_reset(struct xe_exec_queue *q)
> +bool exec_queue_reset(struct xe_exec_queue *q)
>  {
>  	return atomic_read(&q->guc->state) & EXEC_QUEUE_STATE_RESET;
>  }
> @@ -1681,6 +1667,9 @@ int xe_guc_exec_queue_reset_handler(struct xe_guc
> *guc, u32 *msg, u32 len)
>  	if (!exec_queue_banned(q))
>  		xe_guc_exec_queue_trigger_cleanup(q);
> 
> +	/* to wakeup xe_wait_user_fence ioctl if exec queue is reset */
> +	wake_up_all(&xe->ufence_wq);
> +
>  	return 0;
>  }
> 
> @@ -1708,6 +1697,9 @@ int
> xe_guc_exec_queue_memory_cat_error_handler(struct xe_guc *guc, u32 *msg,
>  	if (!exec_queue_banned(q))
>  		xe_guc_exec_queue_trigger_cleanup(q);
> 
> +	/* to wakeup xe_wait_user_fence ioctl if exec queue is reset */
> +	wake_up_all(&xe->ufence_wq);
> +
>  	return 0;
>  }
> 
> diff --git a/drivers/gpu/drm/xe/xe_guc_submit.h
> b/drivers/gpu/drm/xe/xe_guc_submit.h
> index fc97869c5b86..4a0566e7e9f7 100644
> --- a/drivers/gpu/drm/xe/xe_guc_submit.h
> +++ b/drivers/gpu/drm/xe/xe_guc_submit.h
> @@ -8,12 +8,32 @@
> 
>  #include <linux/types.h>
> 
> +#include "xe_exec_queue_types.h"
> +#include "xe_guc_exec_queue_types.h"
> +
> +/*
> + * Helpers for engine state, using an atomic as some of the bits can transition
> + * as the same time (e.g. a suspend can be happning at the same time as
> schedule
> + * engine done being processed).
> + */
> +#define EXEC_QUEUE_STATE_REGISTERED	(1 << 0)
> +#define ENGINE_STATE_ENABLED		(1 << 1)
> +#define EXEC_QUEUE_STATE_PENDING_ENABLE	(1 << 2)
> +#define EXEC_QUEUE_STATE_PENDING_DISABLE	(1 << 3)
> +#define EXEC_QUEUE_STATE_DESTROYED	(1 << 4)
> +#define ENGINE_STATE_SUSPENDED		(1 << 5)
> +#define EXEC_QUEUE_STATE_RESET		(1 << 6)
> +#define ENGINE_STATE_KILLED		(1 << 7)
> +
>  struct drm_printer;
>  struct xe_exec_queue;
>  struct xe_guc;
> 
>  int xe_guc_submit_init(struct xe_guc *guc);
> 
> +bool exec_queue_reset(struct xe_exec_queue *q);
> +bool exec_queue_registered(struct xe_exec_queue *q);
> +
>  int xe_guc_submit_reset_prepare(struct xe_guc *guc);
>  void xe_guc_submit_reset_wait(struct xe_guc *guc);
>  int xe_guc_submit_stop(struct xe_guc *guc);
> diff --git a/drivers/gpu/drm/xe/xe_wait_user_fence.c
> b/drivers/gpu/drm/xe/xe_wait_user_fence.c
> index 78686908f7fb..5ee1062a4623 100644
> --- a/drivers/gpu/drm/xe/xe_wait_user_fence.c
> +++ b/drivers/gpu/drm/xe/xe_wait_user_fence.c
> @@ -13,6 +13,10 @@
>  #include "xe_device.h"
>  #include "xe_gt.h"
>  #include "xe_macros.h"
> +#include "xe_guc_submit.h"
> +#include "xe_exec_queue.h"
> +#include "xe_exec_queue_types.h"
> +#include "xe_guc_exec_queue_types.h"
> 
>  static int do_compare(u64 addr, u64 value, u64 mask, u16 op)
>  {
> @@ -58,27 +62,6 @@ static const enum xe_engine_class
> user_to_xe_engine_class[] = {
>  	[DRM_XE_ENGINE_CLASS_COMPUTE] = XE_ENGINE_CLASS_COMPUTE,
>  };
> 
> -static int check_hw_engines(struct xe_device *xe,
> -			    struct drm_xe_engine_class_instance *eci,
> -			    int num_engines)
> -{
> -	int i;
> -
> -	for (i = 0; i < num_engines; ++i) {
> -		enum xe_engine_class user_class =
> -			user_to_xe_engine_class[eci[i].engine_class];
> -
> -		if (eci[i].gt_id >= xe->info.tile_count)
> -			return -EINVAL;
> -
> -		if (!xe_gt_hw_engine(xe_device_get_gt(xe, eci[i].gt_id),
> -				     user_class, eci[i].engine_instance, true))
> -			return -EINVAL;
> -	}
> -
> -	return 0;
> -}
> -
>  #define VALID_FLAGS	(DRM_XE_UFENCE_WAIT_SOFT_OP | \
>  			 DRM_XE_UFENCE_WAIT_ABSTIME)
>  #define MAX_OP		DRM_XE_UFENCE_WAIT_LTE
> @@ -130,14 +113,12 @@ int xe_wait_user_fence_ioctl(struct drm_device *dev,
> void *data,
>  			     struct drm_file *file)
>  {
>  	struct xe_device *xe = to_xe_device(dev);
> +	struct xe_file *xef = to_xe_file(file);
>  	DEFINE_WAIT_FUNC(w_wait, woken_wake_function);
>  	struct drm_xe_wait_user_fence *args = data;
> -	struct drm_xe_engine_class_instance
> eci[XE_HW_ENGINE_MAX_INSTANCE];
> -	struct drm_xe_engine_class_instance __user *user_eci =
> -		u64_to_user_ptr(args->instances);
> +	struct xe_exec_queue *q = NULL;
>  	u64 addr = args->addr;
>  	int err;
> -	bool no_engines = args->flags & DRM_XE_UFENCE_WAIT_SOFT_OP;
>  	long timeout;
>  	ktime_t start;
> 
> @@ -151,35 +132,17 @@ int xe_wait_user_fence_ioctl(struct drm_device *dev,
> void *data,
>  	if (XE_IOCTL_DBG(xe, args->op > MAX_OP))
>  		return -EINVAL;
> 
> -	if (XE_IOCTL_DBG(xe, no_engines &&
> -			 (args->num_engines || args->instances)))
> -		return -EINVAL;
> -
> -	if (XE_IOCTL_DBG(xe, !no_engines && !args->num_engines))
> -		return -EINVAL;
> -
>  	if (XE_IOCTL_DBG(xe, addr & 0x7))
>  		return -EINVAL;
> 
> -	if (XE_IOCTL_DBG(xe, args->num_engines >
> XE_HW_ENGINE_MAX_INSTANCE))
> -		return -EINVAL;
> -
> -	if (!no_engines) {
> -		err = copy_from_user(eci, user_eci,
> -				     sizeof(struct drm_xe_engine_class_instance) *
> -			     args->num_engines);
> -		if (XE_IOCTL_DBG(xe, err))
> -			return -EFAULT;
> -
> -		if (XE_IOCTL_DBG(xe, check_hw_engines(xe, eci,
> -						      args->num_engines)))
> -			return -EINVAL;
> -	}
> -
>  	timeout = to_jiffies_timeout(xe, args);
> 
>  	start = ktime_get();
> 
> +	q = xe_exec_queue_lookup(xef, args->exec_queue_id);
> +	if (XE_IOCTL_DBG(xe, !q))
> +		return -ENOENT;
> +
>  	/*
>  	 * FIXME: Very simple implementation at the moment, single wait queue
>  	 * for everything. Could be optimized to have a wait queue for every
> @@ -203,6 +166,13 @@ int xe_wait_user_fence_ioctl(struct drm_device *dev,
> void *data,
>  		}
> 
>  		timeout = wait_woken(&w_wait, TASK_INTERRUPTIBLE, timeout);
> +
> +		if (exec_queue_registered(q)) {
> +			if (exec_queue_reset(q)) {
> +				drm_info(&xe->drm, "exec gueue reset
> detected\n");
> +				err = -EIO;


You also need to break out of the loop here.

And I think it is better to check for the reset before wait_woken; that way, even if the queue is reset *before* you wait, we can still detect it.
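
Something along these lines is what I have in mind (just an untested sketch to show the ordering; the rest of the loop body is approximated from the existing ioctl rather than copied exactly):

	for (;;) {
		/* Check the reset state first so a reset that happened before
		 * we started waiting is still caught, and bail out once the
		 * error is recorded. */
		if (exec_queue_registered(q) && exec_queue_reset(q)) {
			drm_info(&xe->drm, "exec queue reset detected\n");
			err = -EIO;
			break;
		}

		err = do_compare(addr, args->value, args->mask, args->op);
		if (err <= 0)
			break;

		if (signal_pending(current)) {
			err = -ERESTARTSYS;
			break;
		}

		if (!timeout) {
			err = -ETIME;
			break;
		}

		timeout = wait_woken(&w_wait, TASK_INTERRUPTIBLE, timeout);
	}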

> +			}
> +		}
>  	}
>  	remove_wait_queue(&xe->ufence_wq, &w_wait);
> 
> diff --git a/include/uapi/drm/xe_drm.h b/include/uapi/drm/xe_drm.h
> index e007dbefd627..0bcd7914db36 100644
> --- a/include/uapi/drm/xe_drm.h
> +++ b/include/uapi/drm/xe_drm.h
> @@ -132,8 +132,7 @@ struct drm_xe_engine_class_instance {
>  #define DRM_XE_ENGINE_CLASS_VIDEO_ENHANCE	3
>  #define DRM_XE_ENGINE_CLASS_COMPUTE		4
>  	/*
> -	 * Kernel only classes (not actual hardware engine class). Used for
> -	 * creating ordered queues of VM bind operations.
> +	 * Used for creating ordered queues of VM bind operations.


This comment change can be in a separate patch.
>  	 */
>  #define DRM_XE_ENGINE_CLASS_VM_BIND_ASYNC	5
>  #define DRM_XE_ENGINE_CLASS_VM_BIND_SYNC	6
> @@ -955,16 +954,11 @@ struct drm_xe_wait_user_fence {
>  	__s64 timeout;
> 
>  	/**
> -	 * @num_engines: number of engine instances to wait on, must be zero
> -	 * when DRM_XE_UFENCE_WAIT_SOFT_OP set
> -	 */
> -	__u64 num_engines;
> -
> -	/**
> -	 * @instances: user pointer to array of drm_xe_engine_class_instance to
> -	 * wait on, must be NULL when DRM_XE_UFENCE_WAIT_SOFT_OP set
> +	 * @exec_queue_id: exec_queue_id, must be of class
> DRM_XE_ENGINE_CLASS_VM_BIND

I don't think this is true. The wait_user_fence API can be used to wait for the completion of a vm_bind operation, but it can also be used to wait for the completion of other work submitted through drm_ioctl_xe_exec. In the latter case, exec_queue_id can be of a class other than VM_BIND.
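
For example, user space should be able to do something like this for a fence written by a batch submitted on a normal (non VM_BIND) exec queue (rough sketch; field names are from the proposed struct above, the ioctl/op names are from memory so double-check against the uapi header, and fence_addr/fence_value/timeout_ns/exec_queue_id are placeholders):

	struct drm_xe_wait_user_fence wait = {
		.addr = fence_addr,		/* 8-byte aligned, written by the batch */
		.op = DRM_XE_UFENCE_WAIT_EQ,
		.value = fence_value,
		.mask = ~0ull,
		.timeout = timeout_ns,
		.exec_queue_id = exec_queue_id,	/* from exec_queue_create */
	};

	err = ioctl(fd, DRM_IOCTL_XE_WAIT_USER_FENCE, &wait);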


> +	 * and exec queue must have same vm_id. If zero, the default VM bind
> engine
> +	 * is used.

The exec_queue_id here is what is returned from the exec_queue_create API. Xe KMD should be able to check internally whether it is a valid queue id. As mentioned above, this API is not only for vm_bind...

Also check: do we need a vm_id parameter for this API? Need @Brost, Matthew's input here. What is the purpose of the vm_id parameter? I looked at the wait_user_fence ioctl implementation: if a vm is passed in, we end up using vm->async_ops.error_capture.wq (instead of xe->ufence_wq). I feel this is more complicated than necessary, but I don't really know what the use case is here.

Oak 

>  	 */
> -	__u64 instances;
> +	__u32 exec_queue_id;
> 
>  	/** @reserved: Reserved */
>  	__u64 reserved[2];
> --
> 2.25.1


