[PATCH v3 1/1] drm/xe: Build PM into GuC CT layer
Matthew Brost
matthew.brost at intel.com
Thu Jul 18 18:00:52 UTC 2024
On Thu, Jul 18, 2024 at 06:52:10PM +0100, Matthew Auld wrote:
> On 18/07/2024 18:34, Matthew Brost wrote:
> > Take PM ref when any G2H are outstanding, drop when none are
> > outstanding.
> >
> > To safely ensure we have PM ref when in the GuC CT layer, a PM ref needs
> > to be held when scheduler messages are pending too.
> >
> > v2:
> > - Add outer PM protections to xe_file_close (CI)
> > v3:
> > - Only take PM ref 0->1 and drop on 1->0 (Matthew Auld)
> >
> > Cc: Matthew Auld <matthew.auld at intel.com>
> > Cc: Rodrigo Vivi <rodrigo.vivi at intel.com>
> > Cc: Nirmoy Das <nirmoy.das at intel.com>
> > Signed-off-by: Matthew Brost <matthew.brost at intel.com>
> > ---
> > drivers/gpu/drm/xe/xe_device.c | 4 ++++
> > drivers/gpu/drm/xe/xe_guc_ct.c | 8 +++++++-
> > drivers/gpu/drm/xe/xe_guc_submit.c | 4 ++++
> > 3 files changed, 15 insertions(+), 1 deletion(-)
> >
> > diff --git a/drivers/gpu/drm/xe/xe_device.c b/drivers/gpu/drm/xe/xe_device.c
> > index 06cebaffb451..b68ab474e1a0 100644
> > --- a/drivers/gpu/drm/xe/xe_device.c
> > +++ b/drivers/gpu/drm/xe/xe_device.c
> > @@ -101,6 +101,8 @@ static void xe_file_close(struct drm_device *dev, struct drm_file *file)
> > struct xe_exec_queue *q;
> > unsigned long idx;
> > + xe_pm_runtime_get(xe);
> > +
> > /*
> > * No need for exec_queue.lock here as there is no contention for it
> > * when FD is closing as IOCTLs presumably can't be modifying the
> > @@ -126,6 +128,8 @@ static void xe_file_close(struct drm_device *dev, struct drm_file *file)
> > xe_drm_client_put(xef->client);
> > kfree(xef);
> > +
> > + xe_pm_runtime_put(xe);
> > }
> > static const struct drm_ioctl_desc xe_ioctls[] = {
> > diff --git a/drivers/gpu/drm/xe/xe_guc_ct.c b/drivers/gpu/drm/xe/xe_guc_ct.c
> > index 7d2e937da1d8..e1d179fb7f43 100644
> > --- a/drivers/gpu/drm/xe/xe_guc_ct.c
> > +++ b/drivers/gpu/drm/xe/xe_guc_ct.c
> > @@ -327,6 +327,8 @@ static void xe_guc_ct_set_state(struct xe_guc_ct *ct,
> > xe_gt_assert(ct_to_gt(ct), ct->g2h_outstanding == 0 ||
> > state == XE_GUC_CT_STATE_STOPPED);
> > + if (ct->g2h_outstanding)
> > + xe_pm_runtime_put(ct_to_xe(ct));
> > ct->g2h_outstanding = 0;
> > ct->state = state;
> > @@ -499,6 +501,9 @@ static void __g2h_reserve_space(struct xe_guc_ct *ct, u32 g2h_len, u32 num_g2h)
> > if (g2h_len) {
>
> So we can also have guc messages for which we don't expect a response
> (g2h_len=0)? How do we track those? Or maybe the GuC itself will take care
> of processing them as we suspend it?
>
We don't expect a response when g2h_len == 0. The GuC can still send
unsolicited G2H (e.g. engine reset, page fault, etc.), but those only
arrive while jobs are running, and we hold PM refs elsewhere for those
jobs.
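To illustrate the idea (sketch only; the helpers around the job are
hypothetical, not the actual xe call sites):

/*
 * Sketch: a runtime PM reference is held for the lifetime of any
 * in-flight job, so unsolicited G2H such as engine resets or page
 * faults can only show up while the device is already awake.
 */
static void example_job_begin(struct xe_device *xe)
{
	xe_pm_runtime_get(xe);		/* job in flight -> hold PM ref */
}

static void example_job_end(struct xe_device *xe)
{
	xe_pm_runtime_put(xe);		/* job retired -> drop PM ref */
}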
> Anyway,
> Reviewed-by: Matthew Auld <matthew.auld at intel.com>
Thanks. Noticed one other nit: going to add an assert on 'num_g2h' here
to make sure callers of this function know what they are doing, because
if they don't we will leak PM refs with the way this patch is written.
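Roughly what I have in mind (a sketch against the hunk above; exact
placement may differ in the final version):

	if (g2h_len) {
		lockdep_assert_held(&ct->fast_lock);

		/*
		 * Callers reserving G2H space must also report how many
		 * G2H responses they expect, otherwise the PM ref taken
		 * on the 0->1 transition below is never dropped.
		 */
		xe_gt_assert(ct_to_gt(ct), num_g2h);

		if (!ct->g2h_outstanding)
			xe_pm_runtime_get_noresume(ct_to_xe(ct));

		ct->ctbs.g2h.info.space -= g2h_len;
		ct->g2h_outstanding += num_g2h;
	}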
Matt
>
> > lockdep_assert_held(&ct->fast_lock);
> > + if (!ct->g2h_outstanding)
> > + xe_pm_runtime_get_noresume(ct_to_xe(ct));
> > +
> > ct->ctbs.g2h.info.space -= g2h_len;
> > ct->g2h_outstanding += num_g2h;
> > }
> > @@ -511,7 +516,8 @@ static void __g2h_release_space(struct xe_guc_ct *ct, u32 g2h_len)
> > ct->ctbs.g2h.info.size - ct->ctbs.g2h.info.resv_space);
> > ct->ctbs.g2h.info.space += g2h_len;
> > - --ct->g2h_outstanding;
> > + if (!--ct->g2h_outstanding)
> > + xe_pm_runtime_put(ct_to_xe(ct));
> > }
> > static void g2h_release_space(struct xe_guc_ct *ct, u32 g2h_len)
> > diff --git a/drivers/gpu/drm/xe/xe_guc_submit.c b/drivers/gpu/drm/xe/xe_guc_submit.c
> > index 860405527115..993d0344dc88 100644
> > --- a/drivers/gpu/drm/xe/xe_guc_submit.c
> > +++ b/drivers/gpu/drm/xe/xe_guc_submit.c
> > @@ -1402,6 +1402,8 @@ static void guc_exec_queue_process_msg(struct xe_sched_msg *msg)
> > default:
> > XE_WARN_ON("Unknown message type");
> > }
> > +
> > + xe_pm_runtime_put(guc_to_xe(exec_queue_to_guc(msg->private_data)));
> > }
> > static const struct drm_sched_backend_ops drm_sched_ops = {
> > @@ -1492,6 +1494,8 @@ static void guc_exec_queue_kill(struct xe_exec_queue *q)
> > static void guc_exec_queue_add_msg(struct xe_exec_queue *q, struct xe_sched_msg *msg,
> > u32 opcode)
> > {
> > + xe_pm_runtime_get_noresume(guc_to_xe(exec_queue_to_guc(q)));
> > +
> > INIT_LIST_HEAD(&msg->link);
> > msg->opcode = opcode;
> > msg->private_data = q;