[PATCH 2/2] xe/oa: Use MI_LOAD_REGISTER_IMMEDIATE to enable OAR/OAC
Dixit, Ashutosh
ashutosh.dixit at intel.com
Wed Dec 18 03:32:19 UTC 2024
On Mon, 16 Dec 2024 16:58:18 -0800, Umesh Nerlige Ramappa wrote:
>
Hi Umesh,
> To enable OAR/OAC, a bit in RING_CONTEXT_CONTROL needs to be set.
> Setting this bit causes the context image size to change and, if not
> done correctly, can cause undesired hangs.
>
> The current code uses a separate exec_queue to modify this bit, which is
> error-prone. As per HW recommendation, submit MI_LOAD_REGISTER_IMM to
> the target hardware context to modify the relevant bit.
>
> In v2, an attempt to submit everything to the user-queue was
> made, but it failed the unprivileged-single-ctx-counters test. It
> appears that the OACTXCONTROL must be modified from a remote context.
>
> In summary,
> - the OACTXCONTROL is always modified from the k_exec_q
> - the LRI registers are always modified from the user-queue
> - emit_oa_config picks user-queue if available, else it uses k_exec_q.
Thanks for the detailed explanation. So this now brings the exec_q use in
Xe in line with i915.
I have a couple of nitpicks below, which are entirely my personal
preference; you don't necessarily need to follow them, so feel free to
ignore them.
Also 'checkpatch --strict' is complaining about some whitespace errors,
so please fix those too.
>
> v2:
> (Matt)
> - set job->ggtt to true if job creation is successful
> - unlock vm on job error
>
> (Ashutosh)
> - don't wait on job submission
> - use kernel exec queue where possible
I think we need a Fixes: here and also Cc: stable for both patches.
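Something like the below, with the sha/subject of the commit which
introduced the issue filled in:

	Fixes: <sha> ("<subject of the offending commit>")
	Cc: stable@vger.kernel.org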
>
> Signed-off-by: Umesh Nerlige Ramappa <umesh.nerlige.ramappa at intel.com>
> ---
> drivers/gpu/drm/xe/xe_oa.c | 71 +++++++++++++++++++++++++++-----------
> 1 file changed, 51 insertions(+), 20 deletions(-)
>
> diff --git a/drivers/gpu/drm/xe/xe_oa.c b/drivers/gpu/drm/xe/xe_oa.c
> index 8dd55798ab31..d6298c6fa78b 100644
> --- a/drivers/gpu/drm/xe/xe_oa.c
> +++ b/drivers/gpu/drm/xe/xe_oa.c
> @@ -596,19 +596,39 @@ static __poll_t xe_oa_poll(struct file *file, poll_table *wait)
> return ret;
> }
>
> -static struct dma_fence *xe_oa_submit_bb(struct xe_oa_stream *stream, enum xe_oa_submit_deps deps,
> +static void xe_oa_lock_vma(struct xe_exec_queue *q)
> +{
> + if (q->vm) {
> + down_read(&q->vm->lock);
> + xe_vm_lock(q->vm, false);
> + }
> +}
> +
> +static void xe_oa_unlock_vma(struct xe_exec_queue *q)
> +{
> + if (q->vm) {
> + xe_vm_unlock(q->vm);
> + up_read(&q->vm->lock);
> + }
> +}
> +
> +static struct dma_fence *xe_oa_submit_bb(struct xe_oa_stream *stream,
> + struct xe_exec_queue *q,
> + enum xe_oa_submit_deps deps,
> struct xe_bb *bb)
> {
> struct xe_sched_job *job;
> struct dma_fence *fence;
> int err = 0;
>
> - /* Kernel configuration is issued on stream->k_exec_q, not stream->exec_q */
> - job = xe_bb_create_job(stream->k_exec_q, bb);
> + xe_oa_lock_vma(q);
> +
> + job = xe_bb_create_job(q, bb);
> if (IS_ERR(job)) {
> err = PTR_ERR(job);
> goto exit;
> }
> + job->ggtt = true;
>
> if (deps == XE_OA_SUBMIT_ADD_DEPS) {
> for (int i = 0; i < stream->num_syncs && !err; i++)
> @@ -623,10 +643,14 @@ static struct dma_fence *xe_oa_submit_bb(struct xe_oa_stream *stream, enum xe_oa
> fence = dma_fence_get(&job->drm.s_fence->finished);
> xe_sched_job_push(job);
>
> + xe_oa_unlock_vma(q);
> +
> return fence;
> err_put_job:
> xe_sched_job_put(job);
> exit:
> + xe_oa_unlock_vma(q);
> +
nit: prefer no empty line here
> return ERR_PTR(err);
> }
>
> @@ -692,6 +716,7 @@ static void xe_oa_store_flex(struct xe_oa_stream *stream, struct xe_lrc *lrc,
> static int xe_oa_modify_ctx_image(struct xe_oa_stream *stream, struct xe_lrc *lrc,
> const struct flex *flex, u32 count)
> {
> + struct xe_exec_queue *q = stream->k_exec_q;
So, for the purpose of this series, I am going to ignore that this one is
submitted on k_exec_q, whereas the other two are submitted on exec_q.
I think the concern would be the k_exec_q batch getting delayed wrt the
other batches, in which case we'd see zero counter values. But the only
user of this functionality is Mesa, and fwiu Mesa enables OA on an exec_q
before starting submissions on it (they only ever saw hangs after OA was
disabled, never while it was enabled). The HW engine is therefore idle,
and it is highly improbable that the k_exec_q batch will get significantly
stalled behind exec_q batches (if at all).
So for now let's ignore this and get this series merged first. Then we can
see if we need to tackle this multiple exec_q issue.
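If it does turn out we need to order the k_exec_q batch behind the user
queue, one option might be to add the last exec_q fence as a dependency of
the k_exec_q job in xe_oa_submit_bb() before pushing it. A rough sketch
only (the xe_exec_queue_last_fence_get() usage here is an assumption,
untested):

	/*
	 * Sketch: make the k_exec_q job wait for whatever was last
	 * submitted on the user exec_q before it runs.
	 * drm_sched_job_add_dependency() consumes the fence reference.
	 */
	if (stream->exec_q) {
		struct dma_fence *fence =
			xe_exec_queue_last_fence_get(stream->exec_q,
						     stream->exec_q->vm);

		err = drm_sched_job_add_dependency(&job->drm, fence);
		if (err)
			goto err_put_job;
	}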
> struct dma_fence *fence;
> struct xe_bb *bb;
> int err;
> @@ -704,7 +729,7 @@ static int xe_oa_modify_ctx_image(struct xe_oa_stream *stream, struct xe_lrc *lr
>
> xe_oa_store_flex(stream, lrc, bb, flex, count);
>
> - fence = xe_oa_submit_bb(stream, XE_OA_SUBMIT_NO_DEPS, bb);
> + fence = xe_oa_submit_bb(stream, q, XE_OA_SUBMIT_NO_DEPS, bb);
> if (IS_ERR(fence)) {
> err = PTR_ERR(fence);
> goto free_bb;
> @@ -719,21 +744,22 @@ static int xe_oa_modify_ctx_image(struct xe_oa_stream *stream, struct xe_lrc *lr
> return err;
> }
>
> -static int xe_oa_load_with_lri(struct xe_oa_stream *stream, struct xe_oa_reg *reg_lri)
> +static int xe_oa_load_with_lri(struct xe_oa_stream *stream, struct xe_oa_reg *reg_lri, u32 count)
> {
> + struct xe_exec_queue *q = stream->exec_q;
> struct dma_fence *fence;
> struct xe_bb *bb;
> int err;
>
> - bb = xe_bb_new(stream->gt, 3, false);
> + bb = xe_bb_new(stream->gt, 2 * count + 1, false);
> if (IS_ERR(bb)) {
> err = PTR_ERR(bb);
> goto exit;
> }
>
> - write_cs_mi_lri(bb, reg_lri, 1);
> + write_cs_mi_lri(bb, reg_lri, count);
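As a note for readers, the 2 * count + 1 sizing follows from the MI_LRI
encoding: one header dword plus an (offset, value) dword pair per
register. Roughly what write_cs_mi_lri() ends up emitting (a sketch which
ignores the chunking done for large register counts):

	bb->cs[bb->len++] = MI_LOAD_REGISTER_IMM | MI_LRI_NUM_REGS(count) |
			    MI_LRI_FORCE_POSTED;
	for (i = 0; i < count; i++) {
		bb->cs[bb->len++] = reg_lri[i].addr.addr;
		bb->cs[bb->len++] = reg_lri[i].value;
	}

i.e. 2 * count + 1 dwords total, matching the xe_bb_new() size above.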
>
> - fence = xe_oa_submit_bb(stream, XE_OA_SUBMIT_NO_DEPS, bb);
> + fence = xe_oa_submit_bb(stream, q, XE_OA_SUBMIT_NO_DEPS, bb);
> if (IS_ERR(fence)) {
> err = PTR_ERR(fence);
> goto free_bb;
> @@ -751,8 +777,6 @@ static int xe_oa_load_with_lri(struct xe_oa_stream *stream, struct xe_oa_reg *re
> static int xe_oa_configure_oar_context(struct xe_oa_stream *stream, bool enable)
> {
> const struct xe_oa_format *format = stream->oa_buffer.format;
> - struct xe_lrc *lrc = stream->exec_q->lrc[0];
> - u32 regs_offset = xe_lrc_regs_offset(lrc) / sizeof(u32);
> u32 oacontrol = __format_to_oactrl(format, OAR_OACONTROL_COUNTER_SEL_MASK) |
> (enable ? OAR_OACONTROL_COUNTER_ENABLE : 0);
>
> @@ -762,13 +786,17 @@ static int xe_oa_configure_oar_context(struct xe_oa_stream *stream, bool enable)
> stream->oa->ctx_oactxctrl_offset[stream->hwe->class] + 1,
> enable ? OA_COUNTER_RESUME : 0,
> },
> + };
> + struct xe_oa_reg reg_lri[] = {
> + {
> + OAR_OACONTROL,
> + oacontrol,
> + },
> {
> RING_CONTEXT_CONTROL(stream->hwe->mmio_base),
> - regs_offset + CTX_CONTEXT_CONTROL,
> _MASKED_BIT_ENABLE(CTX_CTRL_OAC_CONTEXT_ENABLE),
So one thing I think we should do here (and also for OAC) is undo the
effect of 0c8650b09a36 ("drm/xe/oa: Don't reset OAC_CONTEXT_ENABLE on OA
stream close"), i.e. set the bit(s) to 0 when enable is 0. That is part of
the reason for this patch too: that OA can be cleanly enabled/disabled on
an exec_q.
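i.e. use the same masked-field pattern as for RUN_ALONE below, something
like:

	_MASKED_FIELD(CTX_CTRL_OAC_CONTEXT_ENABLE,
		      enable ? CTX_CTRL_OAC_CONTEXT_ENABLE : 0),

so that disabling the stream actually clears the bit again.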
> }, };
> - struct xe_oa_reg reg_lri = { OAR_OACONTROL, oacontrol };
> int err;
>
> /* Modify stream hwe context image with regs_context */
> @@ -778,14 +806,12 @@ static int xe_oa_configure_oar_context(struct xe_oa_stream *stream, bool enable)
> return err;
>
> /* Apply reg_lri using LRI */
> - return xe_oa_load_with_lri(stream, &reg_lri);
> + return xe_oa_load_with_lri(stream, reg_lri, ARRAY_SIZE(reg_lri));
> }
>
> static int xe_oa_configure_oac_context(struct xe_oa_stream *stream, bool enable)
> {
> const struct xe_oa_format *format = stream->oa_buffer.format;
> - struct xe_lrc *lrc = stream->exec_q->lrc[0];
> - u32 regs_offset = xe_lrc_regs_offset(lrc) / sizeof(u32);
> u32 oacontrol = __format_to_oactrl(format, OAR_OACONTROL_COUNTER_SEL_MASK) |
> (enable ? OAR_OACONTROL_COUNTER_ENABLE : 0);
> struct flex regs_context[] = {
> @@ -794,14 +820,18 @@ static int xe_oa_configure_oac_context(struct xe_oa_stream *stream, bool enable)
> stream->oa->ctx_oactxctrl_offset[stream->hwe->class] + 1,
> enable ? OA_COUNTER_RESUME : 0,
> },
> + };
> + struct xe_oa_reg reg_lri[] = {
> + {
> + OAC_OACONTROL,
> + oacontrol
> + },
> {
> RING_CONTEXT_CONTROL(stream->hwe->mmio_base),
> - regs_offset + CTX_CONTEXT_CONTROL,
> _MASKED_BIT_ENABLE(CTX_CTRL_OAC_CONTEXT_ENABLE) |
> _MASKED_FIELD(CTX_CTRL_RUN_ALONE, enable ? CTX_CTRL_RUN_ALONE : 0),
> },
> - };
> - struct xe_oa_reg reg_lri = { OAC_OACONTROL, oacontrol };
> + };
checkpatch error here I believe.
> int err;
>
> /* Set ccs select to enable programming of OAC_OACONTROL */
> @@ -815,7 +845,7 @@ static int xe_oa_configure_oac_context(struct xe_oa_stream *stream, bool enable)
> return err;
>
> /* Apply reg_lri using LRI */
> - return xe_oa_load_with_lri(stream, &reg_lri);
> + return xe_oa_load_with_lri(stream, reg_lri, ARRAY_SIZE(reg_lri));
> }
>
> static int xe_oa_configure_oa_context(struct xe_oa_stream *stream, bool enable)
> @@ -1015,6 +1045,7 @@ static const struct dma_fence_ops xe_oa_fence_ops = {
> static int xe_oa_emit_oa_config(struct xe_oa_stream *stream, struct xe_oa_config *config)
> {
> #define NOA_PROGRAM_ADDITIONAL_DELAY_US 500
> + struct xe_exec_queue *q = stream->exec_q ? : stream->k_exec_q;
nit: prefer no space between ? and :
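i.e.:

	struct xe_exec_queue *q = stream->exec_q ?: stream->k_exec_q;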
> struct xe_oa_config_bo *oa_bo;
> struct xe_oa_fence *ofence;
> int i, err, num_signal = 0;
> @@ -1033,7 +1064,7 @@ static int xe_oa_emit_oa_config(struct xe_oa_stream *stream, struct xe_oa_config
> }
>
> /* Emit OA configuration batch */
> - fence = xe_oa_submit_bb(stream, XE_OA_SUBMIT_ADD_DEPS, oa_bo->bb);
> + fence = xe_oa_submit_bb(stream, q, XE_OA_SUBMIT_ADD_DEPS, oa_bo->bb);
> if (IS_ERR(fence)) {
> err = PTR_ERR(fence);
> goto exit;
> --
> 2.34.1
>
Rest looks good. I'll take a quick look at the next version before R-b'ing.
Thanks.
--
Ashutosh