FIXME Re: [PATCH 13/17] drm/xe/oa: Add OAC support

Dixit, Ashutosh ashutosh.dixit at intel.com
Sat Jan 20 02:52:27 UTC 2024


On Tue, 19 Dec 2023 20:59:29 -0800, Umesh Nerlige Ramappa wrote:
>

Hi Umesh,

> On Thu, Dec 07, 2023 at 10:43:25PM -0800, Ashutosh Dixit wrote:
> > Similar to OAR, allow userspace to execute MI_REPORT_PERF_COUNT on compute
> > engines of a specified exec queue.
> >
> > Signed-off-by: Ashutosh Dixit <ashutosh.dixit at intel.com>
> > ---
> > drivers/gpu/drm/xe/regs/xe_engine_regs.h |  1 +
> > drivers/gpu/drm/xe/regs/xe_oa_regs.h     |  3 +
> > drivers/gpu/drm/xe/xe_oa.c               | 81 +++++++++++++++++++++++-
> > 3 files changed, 82 insertions(+), 3 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/xe/regs/xe_engine_regs.h b/drivers/gpu/drm/xe/regs/xe_engine_regs.h
> > index 76c0938df05f3..045f9773f01f4 100644
> > --- a/drivers/gpu/drm/xe/regs/xe_engine_regs.h
> > +++ b/drivers/gpu/drm/xe/regs/xe_engine_regs.h
> > @@ -73,6 +73,7 @@
> >
> > #define RING_CONTEXT_CONTROL(base)		XE_REG((base) + 0x244, XE_REG_OPTION_MASKED)
> > #define	  CTX_CTRL_OAC_CONTEXT_ENABLE		REG_BIT(8)
> > +#define	  CTX_CTRL_RUN_ALONE			REG_BIT(7)
> > #define	  CTX_CTRL_INHIBIT_SYN_CTX_SWITCH	REG_BIT(3)
> > #define	  CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT	REG_BIT(0)
> >
> > diff --git a/drivers/gpu/drm/xe/regs/xe_oa_regs.h b/drivers/gpu/drm/xe/regs/xe_oa_regs.h
> > index 7e2e875ccf80a..b66cd95b795e7 100644
> > --- a/drivers/gpu/drm/xe/regs/xe_oa_regs.h
> > +++ b/drivers/gpu/drm/xe/regs/xe_oa_regs.h
> > @@ -74,6 +74,9 @@
> > #define  OAG_OASTATUS_BUFFER_OVERFLOW	REG_BIT(1)
> > #define  OAG_OASTATUS_REPORT_LOST	REG_BIT(0)
> >
> > +/* OAC unit */
> > +#define OAC_OACONTROL			XE_REG(0x15114)
> > +
> > /* OAM unit */
> > #define OAM_HEAD_POINTER_OFFSET			(0x1a0)
> > #define OAM_TAIL_POINTER_OFFSET			(0x1a4)
> > diff --git a/drivers/gpu/drm/xe/xe_oa.c b/drivers/gpu/drm/xe/xe_oa.c
> > index 9d653d7722d1a..42f32d4359f2c 100644
> > --- a/drivers/gpu/drm/xe/xe_oa.c
> > +++ b/drivers/gpu/drm/xe/xe_oa.c
> > @@ -449,6 +449,19 @@ u32 __format_to_oactrl(const struct xe_oa_format *format, int counter_sel_mask)
> >		REG_FIELD_PREP(OA_OACONTROL_COUNTER_SIZE_MASK, format->counter_size);
> > }
> >
> > +static u32 __oa_ccs_select(struct xe_oa_stream *stream)
> > +{
> > +	u32 val;
> > +
> > +	if (stream->hwe->class != XE_ENGINE_CLASS_COMPUTE)
> > +		return 0;
> > +
> > +	val = REG_FIELD_PREP(OAG_OACONTROL_OA_CCS_SELECT_MASK, stream->hwe->instance);
> > +	xe_assert(stream->oa->xe,
> > +		  REG_FIELD_GET(OAG_OACONTROL_OA_CCS_SELECT_MASK, val) == stream->hwe->instance);
>
> Why is there a need to do REG_FIELD_GET? I thought the REG_FIELD_PREP is
> just a bitwise operation. Are you expecting coherency issues?

No, the check is that hwe->instance can fit in 3 bits
(OAG_OACONTROL_OA_CCS_SELECT_MASK).

>
> > +}
> > +
> > static void xe_oa_enable(struct xe_oa_stream *stream)
> > {
> >	const struct xe_oa_format *format = stream->oa_buffer.format;
> > @@ -463,7 +476,7 @@ static void xe_oa_enable(struct xe_oa_stream *stream)
> >
> >	regs = __oa_regs(stream);
> >	val = __format_to_oactrl(format, regs->oa_ctrl_counter_select_mask) |
> > -		OAG_OACONTROL_OA_COUNTER_ENABLE;
> > +		__oa_ccs_select(stream) | OAG_OACONTROL_OA_COUNTER_ENABLE;
> >
> >	xe_mmio_write32(stream->gt, regs->oa_ctrl, val);
> > }
> > @@ -762,6 +775,64 @@ static int xe_oa_configure_oar_context(struct xe_oa_stream *stream, bool enable)
> >	return xe_oa_modify_self(stream, regs_lri, ARRAY_SIZE(regs_lri));
> > }
> >
> > +static int xe_oa_configure_oac_context(struct xe_oa_stream *stream, bool enable)
> > +{
> > +	const struct xe_oa_format *format = stream->oa_buffer.format;
> > +	struct xe_lrc *lrc = &stream->exec_q->lrc[0];
> > +	u32 regs_offset = xe_lrc_regs_offset(lrc) / sizeof(u32);
> > +	u32 oacontrol = __format_to_oactrl(format, OAR_OACONTROL_COUNTER_SEL_MASK) |
> > +		(enable ? OAR_OACONTROL_COUNTER_ENABLE : 0);
> > +	struct flex regs_context[] = {
> > +		{
> > +			OACTXCONTROL(stream->hwe->mmio_base),
> > +			stream->oa->ctx_oactxctrl_offset[stream->hwe->class] + 1,
> > +			enable ? OA_COUNTER_RESUME : 0,
> > +		},
> > +		{
> > +			RING_CONTEXT_CONTROL(stream->hwe->mmio_base),
> > +			regs_offset + CTX_CONTEXT_CONTROL,
> > +			_MASKED_FIELD(CTX_CTRL_OAC_CONTEXT_ENABLE,
> > +				      enable ? CTX_CTRL_OAC_CONTEXT_ENABLE : 0) |
> > +			_MASKED_FIELD(CTX_CTRL_RUN_ALONE,
> > +				      enable ? CTX_CTRL_RUN_ALONE : 0),
> > +		},
> > +	};
> > +	/* Offsets in regs_lri are not used since this configuration is applied using LRI */
> > +	struct flex regs_lri[] = {
> > +		{
> > +			OAC_OACONTROL,
> > +			OAR_OAC_OACONTROL_OFFSET + 1,
> > +			oacontrol,
> > +		},
> > +	};
> > +	int err;
> > +
> > +	/* Set ccs select to enable programming of OAC_OACONTROL */
> > +	xe_mmio_write32(stream->gt, __oa_regs(stream)->oa_ctrl, __oa_ccs_select(stream));
> > +
> > +	/* Modify stream hwe context image with regs_context */
> > +	err = xe_oa_modify_context(stream, &stream->exec_q->lrc[0],
> > +				   regs_context, ARRAY_SIZE(regs_context));
> > +	if (err)
> > +		return err;
> > +
> > +	/* Apply regs_lri using LRI */
> > +	return xe_oa_modify_self(stream, regs_lri, ARRAY_SIZE(regs_lri));
>
> I think in i915, for execlist scheduling, there was a kernel context that
> was scheduled and when it ran, it would indicate that all other contexts
> are done executing - kinda GPU idle. The modify self was (IMO) only needed
> to update the kernel context in this scenario.

Hmm, in i915 the modify_self uses the "pinned context" (not the kernel
context) afaict. Anyway there are significant differences between this code
in Xe vs the same code in i915 (due to the exclusive use of
stream->k_exec_q in Xe).

> GuC does not have a concept of kernel context (at least not in i915, not
> sure if things changed in XE). If so, all the modify self can be dropped
> (both from OAR and OAC).

I tried replacing modify_self with an MMIO write to
OAR_OACONTROL/OAC_OACONTROL, but it doesn't work. So it seems those
registers can only be programmed via MI_LOAD_REGISTER commands, not by
xe_mmio_write32.

So what I have done instead is, just rename modify_self and refactor the
code a bit.

> Otherwise, this is good too, so
>
> Reviewed-by: Umesh Nerlige Ramappa <umesh.nerlige.ramappa at intel.com>
>
> The other query I have (unrelated to this patch) is if we need the
> PWR_CLK_STATE state configured in all contexts. It's a bit hazy if that was
> only needed for older gens or if it is applicable to newer
> platforms. (gen12_configure_all_contexts() in i915).

Lionel confirmed that PWR_CLK_STATE is not needed for Gen12+ (only needed
for Gen11 and older).

Thanks.
--
Ashutosh

> > +}
> > +
> > +static int xe_oa_configure_oa_context(struct xe_oa_stream *stream, bool enable)
> > +{
> > +	switch (stream->hwe->class) {
> > +	case XE_ENGINE_CLASS_RENDER:
> > +		return xe_oa_configure_oar_context(stream, enable);
> > +	case XE_ENGINE_CLASS_COMPUTE:
> > +		return xe_oa_configure_oac_context(stream, enable);
> > +	default:
> > +		/* Video engines do not support MI_REPORT_PERF_COUNT */
> > +		return 0;
> > +	}
> > +}
> > +
> > #define HAS_OA_BPC_REPORTING(xe) (GRAPHICS_VERx100(xe) >= 1255)
> >
> > static void xe_oa_disable_metric_set(struct xe_oa_stream *stream)
> > @@ -781,7 +852,7 @@ static void xe_oa_disable_metric_set(struct xe_oa_stream *stream)
> >
> >	/* disable the context save/restore or OAR counters */
> >	if (stream->exec_q)
> > -		xe_oa_configure_oar_context(stream, false);
> > +		xe_oa_configure_oa_context(stream, false);
> >
> >	/* Make sure we disable noa to save power. */
> >	xe_mmio_rmw32(stream->gt, RPM_CONFIG1, GT_NOA_ENABLE, 0);
> > @@ -978,8 +1049,9 @@ static int xe_oa_enable_metric_set(struct xe_oa_stream *stream)
> >
> >	xe_mmio_rmw32(stream->gt, XELPMP_SQCNT1, 0, sqcnt1);
> >
> > +	/* Configure OAR/OAC */
> >	if (stream->exec_q) {
> > -		ret = xe_oa_configure_oar_context(stream, true);
> > +		ret = xe_oa_configure_oa_context(stream, true);
> >		if (ret)
> >			return ret;
> >	}
> > @@ -1636,6 +1708,9 @@ int xe_oa_stream_open_ioctl(struct drm_device *dev, void *data, struct drm_file
> >		param.exec_q = xe_exec_queue_lookup(xef, param.exec_queue_id);
> >		if (XE_IOCTL_DBG(oa->xe, !param.exec_q))
> >			return -ENOENT;
> > +
> > +		if (param.exec_q->width > 1)
> > +			drm_dbg(&oa->xe->drm, "exec_q->width > 1, programming only exec_q->lrc[0]\n");
> >	}
> >
> >	/*
> > --
> > 2.41.0
> >


More information about the Intel-xe mailing list