[Intel-xe] [PATCH v2] RFC: drm/xe: Return correct error code for xe_wait_user_fence_ioctl

Matthew Brost matthew.brost at intel.com
Wed Nov 29 09:11:28 UTC 2023


On Tue, Nov 28, 2023 at 04:25:14PM -0500, Rodrigo Vivi wrote:
> On Wed, Nov 22, 2023 at 09:51:42AM +0530, Bommu Krishnaiah wrote:
> > Return the correct error code if the exec_queue is reset or the engine is hung.
> > Remove the num_engines/instances members from the drm_xe_wait_user_fence
> > structure and add an exec_queue_id member.
> > 
> > v2: Addressed the review comments
> > 
> > Need to validate the changes.
> > 
> > Signed-off-by: Bommu Krishnaiah <krishnaiah.bommu at intel.com>
> > Cc: Oak Zeng <oak.zeng at intel.com>
> > Cc: Kempczynski Zbigniew <Zbigniew.Kempczynski at intel.com>
> > Cc: Matthew Brost <matthew.brost at intel.com>
> 
> Cc: Francois Dugast <francois.dugast at intel.com>
> 
> This patch here needs to replace my patch in Francois' series:
> https://lore.kernel.org/all/20231122143833.7-12-francois.dugast@intel.com/
> 
> I finally saw some sense in some engine information in there.
> 
> Brost, thoughts?
> 

I think the uAPI change here makes sense: an exec_queue_id is included, so if
that exec_queue is killed one way or another, any waiters on ufences using
that exec_queue are woken.
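
For anyone wiring this up from userspace, the wait would then look roughly like
the sketch below. Untested; the DRM_IOCTL_XE_WAIT_USER_FENCE and
DRM_XE_UFENCE_WAIT_OP_EQ names are taken from the current xe_drm.h and may
still shift before merge, and fd, fence_va, expected, timeout_ns and
exec_queue_id are placeholders:

	/* needs <sys/ioctl.h>, <errno.h> and drm/xe_drm.h */
	struct drm_xe_wait_user_fence wait = {
		.addr = (__u64)(uintptr_t)fence_va,	/* 8-byte aligned ufence address */
		.op = DRM_XE_UFENCE_WAIT_OP_EQ,		/* wake when *addr == value */
		.flags = 0,				/* relative timeout */
		.value = expected,
		.mask = ~0ull,				/* compare all 64 bits */
		.timeout = timeout_ns,
		.exec_queue_id = exec_queue_id,		/* id from exec queue create */
	};
	int ret = ioctl(fd, DRM_IOCTL_XE_WAIT_USER_FENCE, &wait);
	/* with this patch, ret == -1 && errno == EIO once the queue is reset/killed */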

Matt

> > ---
> >  drivers/gpu/drm/xe/xe_exec_queue_types.h |  2 +
> >  drivers/gpu/drm/xe/xe_execlist.c         |  7 +++
> >  drivers/gpu/drm/xe/xe_guc_submit.c       | 13 +++++
> >  drivers/gpu/drm/xe/xe_wait_user_fence.c  | 60 +++++-------------------
> >  include/uapi/drm/xe_drm.h                | 13 +----
> >  5 files changed, 37 insertions(+), 58 deletions(-)
> > 
> > diff --git a/drivers/gpu/drm/xe/xe_exec_queue_types.h b/drivers/gpu/drm/xe/xe_exec_queue_types.h
> > index 5ba47a5cfdbd..84ccf7242247 100644
> > --- a/drivers/gpu/drm/xe/xe_exec_queue_types.h
> > +++ b/drivers/gpu/drm/xe/xe_exec_queue_types.h
> > @@ -206,6 +206,8 @@ struct xe_exec_queue_ops {
> >  	 * signalled when this function is called.
> >  	 */
> >  	void (*resume)(struct xe_exec_queue *q);
> > +	/** @reset_status: check exec queue reset status */
> > +	bool (*reset_status)(struct xe_exec_queue *q);
> >  };
> >  
> >  #endif
> > diff --git a/drivers/gpu/drm/xe/xe_execlist.c b/drivers/gpu/drm/xe/xe_execlist.c
> > index e8754adfc52a..3b390f047108 100644
> > --- a/drivers/gpu/drm/xe/xe_execlist.c
> > +++ b/drivers/gpu/drm/xe/xe_execlist.c
> > @@ -446,6 +446,12 @@ static void execlist_exec_queue_resume(struct xe_exec_queue *q)
> >  	/* NIY */
> >  }
> >  
> > +static bool execlist_exec_queue_reset_status(struct xe_exec_queue *q)
> > +{
> > +	 /* NIY */
> > +	 return false;
> > +}
> > +
> >  static const struct xe_exec_queue_ops execlist_exec_queue_ops = {
> >  	.init = execlist_exec_queue_init,
> >  	.kill = execlist_exec_queue_kill,
> > @@ -457,6 +463,7 @@ static const struct xe_exec_queue_ops execlist_exec_queue_ops = {
> >  	.suspend = execlist_exec_queue_suspend,
> >  	.suspend_wait = execlist_exec_queue_suspend_wait,
> >  	.resume = execlist_exec_queue_resume,
> > +	.reset_status = execlist_exec_queue_reset_status,
> >  };
> >  
> >  int xe_execlist_init(struct xe_gt *gt)
> > diff --git a/drivers/gpu/drm/xe/xe_guc_submit.c b/drivers/gpu/drm/xe/xe_guc_submit.c
> > index 9e9e925c7353..e13792e49a67 100644
> > --- a/drivers/gpu/drm/xe/xe_guc_submit.c
> > +++ b/drivers/gpu/drm/xe/xe_guc_submit.c
> > @@ -852,6 +852,10 @@ static void simple_error_capture(struct xe_exec_queue *q)
> >  static void xe_guc_exec_queue_trigger_cleanup(struct xe_exec_queue *q)
> >  {
> >  	struct xe_guc *guc = exec_queue_to_guc(q);
> > +	struct xe_device *xe = guc_to_xe(guc);
> > +
> > +	/* Wake up xe_wait_user_fence ioctl waiters if the exec queue is reset */
> > +	wake_up_all(&xe->ufence_wq);
> >  
> >  	if (xe_exec_queue_is_lr(q))
> >  		queue_work(guc_to_gt(guc)->ordered_wq, &q->guc->lr_tdr);
> > @@ -1392,6 +1396,14 @@ static void guc_exec_queue_resume(struct xe_exec_queue *q)
> >  	guc_exec_queue_add_msg(q, msg, RESUME);
> >  }
> >  
> > +static bool guc_exec_queue_reset_status(struct xe_exec_queue *q)
> > +{
> > +	if (exec_queue_registered(q))
> > +		return exec_queue_reset(q);
> 
> probably just rename the original function and use that directly.
> 
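
For illustration, one way that rename/merge could land is the one-liner below;
it only reuses the exec_queue_registered() / exec_queue_reset() helpers already
visible in this hunk (just a sketch, not compile-tested):

	static bool guc_exec_queue_reset_status(struct xe_exec_queue *q)
	{
		/* collapse the registered check into the existing reset-state helper */
		return exec_queue_registered(q) && exec_queue_reset(q);
	}
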
> > +
> > +	return false;
> > +}
> > +
> >  /*
> >   * All of these functions are an abstraction layer which other parts of XE can
> >   * use to trap into the GuC backend. All of these functions, aside from init,
> > @@ -1409,6 +1421,7 @@ static const struct xe_exec_queue_ops guc_exec_queue_ops = {
> >  	.suspend = guc_exec_queue_suspend,
> >  	.suspend_wait = guc_exec_queue_suspend_wait,
> >  	.resume = guc_exec_queue_resume,
> > +	.reset_status = guc_exec_queue_reset_status,
> >  };
> >  
> >  static void guc_exec_queue_stop(struct xe_guc *guc, struct xe_exec_queue *q)
> > diff --git a/drivers/gpu/drm/xe/xe_wait_user_fence.c b/drivers/gpu/drm/xe/xe_wait_user_fence.c
> > index 4d5c2555ce41..97af879ee923 100644
> > --- a/drivers/gpu/drm/xe/xe_wait_user_fence.c
> > +++ b/drivers/gpu/drm/xe/xe_wait_user_fence.c
> > @@ -13,6 +13,7 @@
> >  #include "xe_device.h"
> >  #include "xe_gt.h"
> >  #include "xe_macros.h"
> > +#include "xe_exec_queue.h"
> >  
> >  static int do_compare(u64 addr, u64 value, u64 mask, u16 op)
> >  {
> > @@ -58,27 +59,6 @@ static const enum xe_engine_class user_to_xe_engine_class[] = {
> >  	[DRM_XE_ENGINE_CLASS_COMPUTE] = XE_ENGINE_CLASS_COMPUTE,
> >  };
> >  
> > -static int check_hw_engines(struct xe_device *xe,
> > -			    struct drm_xe_engine_class_instance *eci,
> > -			    int num_engines)
> > -{
> > -	int i;
> > -
> > -	for (i = 0; i < num_engines; ++i) {
> > -		enum xe_engine_class user_class =
> > -			user_to_xe_engine_class[eci[i].engine_class];
> > -
> > -		if (eci[i].gt_id >= xe->info.tile_count)
> > -			return -EINVAL;
> > -
> > -		if (!xe_gt_hw_engine(xe_device_get_gt(xe, eci[i].gt_id),
> > -				     user_class, eci[i].engine_instance, true))
> > -			return -EINVAL;
> > -	}
> > -
> > -	return 0;
> > -}
> > -
> >  #define VALID_FLAGS	(DRM_XE_UFENCE_WAIT_FLAG_SOFT_OP | \
> >  			 DRM_XE_UFENCE_WAIT_FLAG_ABSTIME)
> >  #define MAX_OP		DRM_XE_UFENCE_WAIT_OP_LTE
> > @@ -130,14 +110,12 @@ int xe_wait_user_fence_ioctl(struct drm_device *dev, void *data,
> >  			     struct drm_file *file)
> >  {
> >  	struct xe_device *xe = to_xe_device(dev);
> > +	struct xe_file *xef = to_xe_file(file);
> >  	DEFINE_WAIT_FUNC(w_wait, woken_wake_function);
> >  	struct drm_xe_wait_user_fence *args = data;
> > -	struct drm_xe_engine_class_instance eci[XE_HW_ENGINE_MAX_INSTANCE];
> > -	struct drm_xe_engine_class_instance __user *user_eci =
> > -		u64_to_user_ptr(args->instances);
> > +	struct xe_exec_queue *q = NULL;
> >  	u64 addr = args->addr;
> >  	int err;
> > -	bool no_engines = args->flags & DRM_XE_UFENCE_WAIT_FLAG_SOFT_OP;
> >  	long timeout;
> >  	ktime_t start;
> >  
> > @@ -151,35 +129,17 @@ int xe_wait_user_fence_ioctl(struct drm_device *dev, void *data,
> >  	if (XE_IOCTL_DBG(xe, args->op > MAX_OP))
> >  		return -EINVAL;
> >  
> > -	if (XE_IOCTL_DBG(xe, no_engines &&
> > -			 (args->num_engines || args->instances)))
> > -		return -EINVAL;
> > -
> > -	if (XE_IOCTL_DBG(xe, !no_engines && !args->num_engines))
> > -		return -EINVAL;
> > -
> >  	if (XE_IOCTL_DBG(xe, addr & 0x7))
> >  		return -EINVAL;
> >  
> > -	if (XE_IOCTL_DBG(xe, args->num_engines > XE_HW_ENGINE_MAX_INSTANCE))
> > -		return -EINVAL;
> > -
> > -	if (!no_engines) {
> > -		err = copy_from_user(eci, user_eci,
> > -				     sizeof(struct drm_xe_engine_class_instance) *
> > -			     args->num_engines);
> > -		if (XE_IOCTL_DBG(xe, err))
> > -			return -EFAULT;
> > -
> > -		if (XE_IOCTL_DBG(xe, check_hw_engines(xe, eci,
> > -						      args->num_engines)))
> > -			return -EINVAL;
> > -	}
> > -
> >  	timeout = to_jiffies_timeout(xe, args);
> >  
> >  	start = ktime_get();
> >  
> > +	q = xe_exec_queue_lookup(xef, args->exec_queue_id);
> > +	if (XE_IOCTL_DBG(xe, !q))
> > +		return -ENOENT;
> > +
> >  	/*
> >  	 * FIXME: Very simple implementation at the moment, single wait queue
> >  	 * for everything. Could be optimized to have a wait queue for every
> > @@ -202,6 +162,12 @@ int xe_wait_user_fence_ioctl(struct drm_device *dev, void *data,
> >  			break;
> >  		}
> >  
> > +		if (q->ops->reset_status(q)) {
> > +			drm_info(&xe->drm, "exec queue reset detected\n");
> > +			err = -EIO;
> > +			break;
> > +		}
> > +
> >  		timeout = wait_woken(&w_wait, TASK_INTERRUPTIBLE, timeout);
> >  	}
> >  	remove_wait_queue(&xe->ufence_wq, &w_wait);
> > diff --git a/include/uapi/drm/xe_drm.h b/include/uapi/drm/xe_drm.h
> > index 88f3aca02b08..ae2457dc30ad 100644
> > --- a/include/uapi/drm/xe_drm.h
> > +++ b/include/uapi/drm/xe_drm.h
> > @@ -962,17 +962,8 @@ struct drm_xe_wait_user_fence {
> >  	 */
> >  	__s64 timeout;
> >  
> > -	/**
> > -	 * @num_engines: number of engine instances to wait on, must be zero
> > -	 * when DRM_XE_UFENCE_WAIT_FLAG_SOFT_OP set
> > -	 */
> > -	__u64 num_engines;
> > -
> > -	/**
> > -	 * @instances: user pointer to array of drm_xe_engine_class_instance to
> > -	 * wait on, must be NULL when DRM_XE_UFENCE_WAIT_FLAG_SOFT_OP set
> > -	 */
> > -	__u64 instances;
> > +	/** @exec_queue_id: exec_queue_id returned from xe_exec_queue_create_ioctl */
> > +	__u32 exec_queue_id;
> >  
> >  	/** @reserved: Reserved */
> >  	__u64 reserved[2];
> > -- 
> > 2.25.1
> > 

