[PATCH 5/5] drm/xe: Allow suspend / resume to be safely called multiple times
Cavitt, Jonathan
jonathan.cavitt at intel.com
Fri Aug 9 21:02:06 UTC 2024
-----Original Message-----
From: Intel-xe <intel-xe-bounces at lists.freedesktop.org> On Behalf Of Matthew Brost
Sent: Friday, August 9, 2024 12:19 PM
To: intel-xe at lists.freedesktop.org
Cc: Dugast, Francois <francois.dugast at intel.com>
Subject: [PATCH 5/5] drm/xe: Allow suspend / resume to be safely called multiple times
>
> Switching modes between LR and dma-fence can result in multiple calls to
> suspend / resume. Make these calls safe while still enforcing call
> order.
>
> Signed-off-by: Matthew Brost <matthew.brost at intel.com>
> ---
> drivers/gpu/drm/xe/xe_guc_submit.c | 40 ++++++++++++++++++++++++------
> 1 file changed, 33 insertions(+), 7 deletions(-)
>
> diff --git a/drivers/gpu/drm/xe/xe_guc_submit.c b/drivers/gpu/drm/xe/xe_guc_submit.c
> index a9af33ff8aa7..c5a824981362 100644
> --- a/drivers/gpu/drm/xe/xe_guc_submit.c
> +++ b/drivers/gpu/drm/xe/xe_guc_submit.c
> @@ -1388,6 +1388,8 @@ static void __guc_exec_queue_process_msg_resume(struct xe_sched_msg *msg)
> #define SET_SCHED_PROPS 2
> #define SUSPEND 3
> #define RESUME 4
> +#define OPCODE_MASK 0xf
> +#define MSG_LOCKED (1 << 8)
>
> static void guc_exec_queue_process_msg(struct xe_sched_msg *msg)
> {
> @@ -1432,7 +1434,7 @@ static int guc_exec_queue_init(struct xe_exec_queue *q)
> struct xe_device *xe = guc_to_xe(guc);
> struct xe_guc_exec_queue *ge;
> long timeout;
> - int err;
> + int err, i;
>
> xe_assert(xe, xe_device_uc_enabled(guc_to_xe(guc)));
>
> @@ -1444,6 +1446,9 @@ static int guc_exec_queue_init(struct xe_exec_queue *q)
> ge->q = q;
> init_waitqueue_head(&ge->suspend_wait);
>
> + for (i = 0; i < MAX_STATIC_MSG_TYPE; ++i)
> + INIT_LIST_HEAD(&ge->static_msgs[i].link);
> +
> timeout = (q->vm && xe_vm_in_lr_mode(q->vm)) ? MAX_SCHEDULE_TIMEOUT :
> msecs_to_jiffies(q->sched_props.job_timeout_ms);
> err = xe_sched_init(&ge->sched, &drm_sched_ops, &xe_sched_ops,
> @@ -1506,11 +1511,26 @@ static void guc_exec_queue_add_msg(struct xe_exec_queue *q, struct xe_sched_msg
> xe_pm_runtime_get_noresume(guc_to_xe(exec_queue_to_guc(q)));
>
> INIT_LIST_HEAD(&msg->link);
> - msg->opcode = opcode;
> + msg->opcode = opcode & OPCODE_MASK;
> msg->private_data = q;
>
> trace_xe_sched_msg_add(msg);
> - xe_sched_add_msg(&q->guc->sched, msg);
> + if (opcode & MSG_LOCKED)
> + xe_sched_add_msg_locked(&q->guc->sched, msg);
> + else
> + xe_sched_add_msg(&q->guc->sched, msg);
> +}
It seems like we only ever call this guc_exec_queue_add_msg function with
the MSG_LOCKED flag set in the opcode in the suspend and resume cases. On paper,
that means we could probably just check whether the opcode is >= 2 and cover all
the current cases. However, using the new explicit MSG_LOCKED flag in the opcode
is definitely more scalable, so this is less a suggestion and more an
observation. No changes needed, as far as I can tell.
Reviewed-by: Jonathan Cavitt <jonathan.cavitt at intel.com>
-Jonathan Cavitt
> +
> +static bool guc_exec_queue_try_add_msg(struct xe_exec_queue *q,
> + struct xe_sched_msg *msg,
> + u32 opcode)
> +{
> + if (!list_empty(&msg->link))
> + return false;
> +
> + guc_exec_queue_add_msg(q, msg, opcode | MSG_LOCKED);
> +
> + return true;
> }
>
> #define STATIC_MSG_CLEANUP 0
> @@ -1584,13 +1604,16 @@ static int guc_exec_queue_set_preempt_timeout(struct xe_exec_queue *q,
>
> static int guc_exec_queue_suspend(struct xe_exec_queue *q)
> {
> + struct xe_gpu_scheduler *sched = &q->guc->sched;
> struct xe_sched_msg *msg = q->guc->static_msgs + STATIC_MSG_SUSPEND;
>
> - if (exec_queue_killed_or_banned_or_wedged(q) || q->guc->suspend_pending)
> + if (exec_queue_killed_or_banned_or_wedged(q))
> return -EINVAL;
>
> - q->guc->suspend_pending = true;
> - guc_exec_queue_add_msg(q, msg, SUSPEND);
> + xe_sched_msg_lock(sched);
> + if (guc_exec_queue_try_add_msg(q, msg, SUSPEND))
> + q->guc->suspend_pending = true;
> + xe_sched_msg_unlock(sched);
>
> return 0;
> }
> @@ -1624,13 +1647,16 @@ static int guc_exec_queue_suspend_wait(struct xe_exec_queue *q)
>
> static void guc_exec_queue_resume(struct xe_exec_queue *q)
> {
> + struct xe_gpu_scheduler *sched = &q->guc->sched;
> struct xe_sched_msg *msg = q->guc->static_msgs + STATIC_MSG_RESUME;
> struct xe_guc *guc = exec_queue_to_guc(q);
> struct xe_device *xe = guc_to_xe(guc);
>
> xe_assert(xe, !q->guc->suspend_pending);
>
> - guc_exec_queue_add_msg(q, msg, RESUME);
> + xe_sched_msg_lock(sched);
> + guc_exec_queue_try_add_msg(q, msg, RESUME);
> + xe_sched_msg_unlock(sched);
> }
>
> static bool guc_exec_queue_reset_status(struct xe_exec_queue *q)
> --
> 2.34.1
>
>
More information about the Intel-xe
mailing list