[PATCH] drm/xe: Kill exec queues synchronously

Matthew Brost matthew.brost at intel.com
Tue Jul 23 15:22:15 UTC 2024


On Tue, Jul 23, 2024 at 08:14:54AM -0700, Matthew Brost wrote:
> Upon killing an exec queue, wait for the kill to complete to ensure the
> exec queue is no longer running on the GPU, for overall safety. This may
> include waiting for a GT reset to complete.
> 
> Fixes: dd08ebf6c352 ("drm/xe: Introduce a new DRM driver for Intel GPUs")
> Signed-off-by: Matthew Brost <matthew.brost at intel.com>

Ignore this patch; the idea is right but the implementation is wrong. Need
to rethink this.
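
For context, the contract the patch below tries to establish is that
q->ops->kill() only returns once the exec queue is definitely off the
hardware, which means it can sleep for several seconds and may end up
waiting on a GT reset. A rough caller-side sketch of what that implies
(example_teardown() is a hypothetical helper; only the ->kill()/->fini()
ops come from the patch):

	/* Hypothetical caller sketch, not part of the patch. */
	static void example_teardown(struct xe_exec_queue *q)
	{
		/*
		 * Drop any VM / exec locks before this point: the kill may
		 * block for up to 5 seconds waiting on the schedule disable
		 * and, on timeout, trigger and wait for a full GT reset.
		 */
		q->ops->kill(q);
		q->ops->fini(q);
	}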

Matt

> ---
>  drivers/gpu/drm/xe/xe_exec_queue_types.h |  6 +++++-
>  drivers/gpu/drm/xe/xe_guc_submit.c       | 13 +++++++++++++
>  2 files changed, 18 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/gpu/drm/xe/xe_exec_queue_types.h b/drivers/gpu/drm/xe/xe_exec_queue_types.h
> index 1408b02eea53..147e9407ce9b 100644
> --- a/drivers/gpu/drm/xe/xe_exec_queue_types.h
> +++ b/drivers/gpu/drm/xe/xe_exec_queue_types.h
> @@ -152,7 +152,11 @@ struct xe_exec_queue {
>  struct xe_exec_queue_ops {
>  	/** @init: Initialize exec queue for submission backend */
>  	int (*init)(struct xe_exec_queue *q);
> -	/** @kill: Kill inflight submissions for backend */
> +	/**
> +	 * @kill: Kill in-flight submissions for the backend and wait
> +	 * synchronously for the kill. For safety, never call this while
> +	 * holding any locks, as a device reset may be the kill's last resort.
> +	 */
>  	void (*kill)(struct xe_exec_queue *q);
>  	/** @fini: Fini exec queue for submission backend */
>  	void (*fini)(struct xe_exec_queue *q);
> diff --git a/drivers/gpu/drm/xe/xe_guc_submit.c b/drivers/gpu/drm/xe/xe_guc_submit.c
> index da2ead86b9ae..df03fdb83dbd 100644
> --- a/drivers/gpu/drm/xe/xe_guc_submit.c
> +++ b/drivers/gpu/drm/xe/xe_guc_submit.c
> @@ -1496,10 +1496,23 @@ static int guc_exec_queue_init(struct xe_exec_queue *q)
>  
>  static void guc_exec_queue_kill(struct xe_exec_queue *q)
>  {
> +	struct xe_guc *guc = exec_queue_to_guc(q);
> +	struct xe_device *xe = guc_to_xe(guc);
> +	int ret;
> +
>  	trace_xe_exec_queue_kill(q);
>  	set_exec_queue_killed(q);
>  	__suspend_fence_signal(q);
>  	xe_guc_exec_queue_trigger_cleanup(q);
> +
> +	ret = wait_event_timeout(guc->ct.wq,
> +				 !exec_queue_pending_disable(q) ||
> +				 guc_read_stopped(guc), HZ * 5);
> +	if (!ret) {
> +		drm_warn(&xe->drm, "Schedule disable failed to respond upon kill\n");
> +		xe_gt_reset_async(q->gt);
> +	}
> +	xe_guc_submit_reset_wait(guc);
>  }
>  
>  static void guc_exec_queue_add_msg(struct xe_exec_queue *q, struct xe_sched_msg *msg,
> -- 
> 2.34.1
> 

