[Intel-gfx] [PATCH 2/2] drm/i915/perf: fix ctx_id read with GuC & ICL
Michel Thierry
michel.thierry at intel.com
Thu May 31 20:46:19 UTC 2018
On 5/31/2018 12:56 PM, Lionel Landwerlin wrote:
> One thing we didn't really understand about the OA report is that the
> ContextID field (dword 2) is copy of the context descriptor (dword 1).
>
> On Gen8->10 and without using GuC we didn't notice the issue because
> we only checked the 21bits of the ContextID field in the OA reports
> which matches exactly the hw_id stored into the context descriptor.
>
> When using GuC submission we have an issue of a non matching hw_id
> because GuC uses bit 20 of the hw_id to signal proxy submission. This
> change makes introduces a mask to compare only the relevant bits.
Choose one: makes or introduces ;)
>
> On ICL the context descriptor format has changed and we failed to
> address this. On top of using a mask we also need to shift the bits
> properly.
>
Someone is going to complain this should be two patches (one to address
the GuC-ness and one for Gen11), but not me.
Reviewed-by: Michel Thierry <michel.thierry at intel.com>
> Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin at intel.com>
> Fixes: 1de401c08fa805 ("drm/i915/perf: enable perf support on ICL")
> Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=104252
> BSpec: 1237
> Testcase: igt/perf/gen8-unprivileged-single-ctx-counters
> ---
> drivers/gpu/drm/i915/i915_drv.h | 1 +
> drivers/gpu/drm/i915/i915_perf.c | 123 ++++++++++++++++++++++++-------
> drivers/gpu/drm/i915/intel_lrc.c | 5 ++
> 3 files changed, 101 insertions(+), 28 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index 58ab9259fb73..0ccda488a8db 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -1953,6 +1953,7 @@ struct drm_i915_private {
>
> struct intel_context *pinned_ctx;
> u32 specific_ctx_id;
> + u32 specific_ctx_id_mask;
>
> struct hrtimer poll_check_timer;
> wait_queue_head_t poll_wq;
> diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c
> index 4a62024cbf85..d5e5d4635f1f 100644
> --- a/drivers/gpu/drm/i915/i915_perf.c
> +++ b/drivers/gpu/drm/i915/i915_perf.c
> @@ -738,12 +738,7 @@ static int gen8_append_oa_reports(struct i915_perf_stream *stream,
> continue;
> }
>
> - /*
> - * XXX: Just keep the lower 21 bits for now since I'm not
> - * entirely sure if the HW touches any of the higher bits in
> - * this field
> - */
> - ctx_id = report32[2] & 0x1fffff;
> + ctx_id = report32[2] & dev_priv->perf.oa.specific_ctx_id_mask;
>
> /*
> * Squash whatever is in the CTX_ID field if it's marked as
> @@ -1204,6 +1199,36 @@ static int i915_oa_read(struct i915_perf_stream *stream,
> return dev_priv->perf.oa.ops.read(stream, buf, count, offset);
> }
>
> +static int oa_get_render_lrca(struct drm_i915_private *i915,
> + struct i915_gem_context *ctx,
> + u32 *lrca)
> +{
> + struct intel_engine_cs *engine = i915->engine[RCS];
> + struct intel_context *ce;
> + int ret;
> +
> + ret = i915_mutex_lock_interruptible(&i915->drm);
> + if (ret)
> + return ret;
> +
> + /*
> + * As the ID is the gtt offset of the context's vma we
> + * pin the vma to ensure the ID remains fixed.
> + *
> + * NB: implied RCS engine...
> + */
> + ce = intel_context_pin(ctx, engine);
> + mutex_unlock(&i915->drm.struct_mutex);
> + if (IS_ERR(ce))
> + return PTR_ERR(ce);
> +
> + i915->perf.oa.pinned_ctx = ce;
> +
> + *lrca = i915_ggtt_offset(ce->state);
> +
> + return 0;
> +}
> +
> /**
> * oa_get_render_ctx_id - determine and hold ctx hw id
> * @stream: An i915-perf stream opened for OA metrics
> @@ -1216,40 +1241,81 @@ static int i915_oa_read(struct i915_perf_stream *stream,
> */
> static int oa_get_render_ctx_id(struct i915_perf_stream *stream)
> {
> - struct drm_i915_private *dev_priv = stream->dev_priv;
> + struct drm_i915_private *i915 = stream->dev_priv;
>
> - if (HAS_LOGICAL_RING_CONTEXTS(dev_priv)) {
> - dev_priv->perf.oa.specific_ctx_id = stream->ctx->hw_id;
> - } else {
> - struct intel_engine_cs *engine = dev_priv->engine[RCS];
> - struct intel_context *ce;
> + switch (INTEL_GEN(i915)) {
> + case 7: {
> int ret;
>
> - ret = i915_mutex_lock_interruptible(&dev_priv->drm);
> + ret = oa_get_render_lrca(i915, stream->ctx,
> + &i915->perf.oa.specific_ctx_id);
> if (ret)
> return ret;
>
> /*
> - * As the ID is the gtt offset of the context's vma we
> - * pin the vma to ensure the ID remains fixed.
> - *
> - * NB: implied RCS engine...
> + * On Haswell we don't do any post processing of the reports
> + * and don't need to use the mask.
> */
> - ce = intel_context_pin(stream->ctx, engine);
> - mutex_unlock(&dev_priv->drm.struct_mutex);
> - if (IS_ERR(ce))
> - return PTR_ERR(ce);
> + i915->perf.oa.specific_ctx_id_mask = 0;
> + break;
> + }
>
> - dev_priv->perf.oa.pinned_ctx = ce;
> + case 8:
> + case 9:
> + case 10:
> + if (USES_GUC_SUBMISSION(i915)) {
> + u32 lrca;
> + int ret;
>
> - /*
> - * Explicitly track the ID (instead of calling
> - * i915_ggtt_offset() on the fly) considering the difference
> - * with gen8+ and execlists
> - */
> - dev_priv->perf.oa.specific_ctx_id = i915_ggtt_offset(ce->state);
> + ret = oa_get_render_lrca(i915, stream->ctx, &lrca);
> + if (ret)
> + return ret;
> +
> + /*
> + * The LRCA is aligned to a page. As a result the
> + * lower 12bits are always at 0 and reused in the
> + * context descriptor for some flags. They won't be
> + * part of the context ID in the OA reports, so squash
> + * those lower bits.
> + */
> + i915->perf.oa.specific_ctx_id =
> + (lrca + LRC_HEADER_PAGES * PAGE_SIZE) >> 12;
> +
> + /*
> + * GuC uses the top bit to signal proxy submission, so
> + * ignore that bit if using GuC.
> + */
> + i915->perf.oa.specific_ctx_id_mask =
> + (1U << (GEN8_CTX_ID_WIDTH - 1)) - 1;
> + } else {
> + i915->perf.oa.specific_ctx_id = stream->ctx->hw_id;
> + i915->perf.oa.specific_ctx_id_mask =
> + (1U << GEN8_CTX_ID_WIDTH) - 1;
> + }
> + break;
> +
> + case 11: {
> + struct intel_engine_cs *engine = i915->engine[RCS];
> +
> + i915->perf.oa.specific_ctx_id =
> + stream->ctx->hw_id << (GEN11_SW_CTX_ID_SHIFT - 32) |
> + engine->instance << (GEN11_ENGINE_INSTANCE_SHIFT - 32) |
> + engine->class << (GEN11_ENGINE_INSTANCE_SHIFT - 32);
> + i915->perf.oa.specific_ctx_id_mask =
> + ((1U << GEN11_SW_CTX_ID_WIDTH) - 1) << (GEN11_SW_CTX_ID_SHIFT - 32) |
> + ((1U << GEN11_ENGINE_INSTANCE_WIDTH) - 1) << (GEN11_ENGINE_INSTANCE_SHIFT - 32) |
> + ((1 << GEN11_ENGINE_CLASS_WIDTH) - 1) << (GEN11_ENGINE_CLASS_SHIFT - 32);
> + break;
> }
>
> + default:
> + MISSING_CASE(INTEL_GEN(i915));
> + }
> +
> + DRM_DEBUG_DRIVER("filtering on ctx_id=0x%x ctx_id_mask=0x%x\n",
> + i915->perf.oa.specific_ctx_id,
> + i915->perf.oa.specific_ctx_id_mask);
> +
> return 0;
> }
>
> @@ -1266,6 +1332,7 @@ static void oa_put_render_ctx_id(struct i915_perf_stream *stream)
> struct intel_context *ce;
>
> dev_priv->perf.oa.specific_ctx_id = INVALID_CTX_ID;
> + dev_priv->perf.oa.specific_ctx_id_mask = 0;
>
> ce = fetch_and_zero(&dev_priv->perf.oa.pinned_ctx);
> if (ce) {
> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> index 94f9c4795190..7d63c6d2f687 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/intel_lrc.c
> @@ -233,6 +233,11 @@ intel_lr_context_descriptor_update(struct i915_gem_context *ctx,
> /* bits 12-31 */
> GEM_BUG_ON(desc & GENMASK_ULL(63, 32));
>
> + /*
> + * The following 32bits are copied into the OA reports (dword 2).
> + * Consider updating oa_get_render_ctx_id in i915_perf.c when changing
> + * anything below.
> + */
> if (INTEL_GEN(ctx->i915) >= 11) {
> GEM_BUG_ON(ctx->hw_id >= BIT(GEN11_SW_CTX_ID_WIDTH));
> desc |= (u64)ctx->hw_id << GEN11_SW_CTX_ID_SHIFT;
>
More information about the Intel-gfx
mailing list