[Intel-gfx] [PATCH v3 2/2] drm/i915/perf: fix ctx_id read with GuC & ICL

Sat Jun 2 08:23:25 UTC 2018

Quoting Lionel Landwerlin (2018-06-02 01:40:29)
> One thing we didn't really understand about the OA report is that the
> ContextID field (dword 2) is a copy of the context descriptor (dword 1).
> 
> On Gen8->10 and without using GuC we didn't notice the issue because
> we only checked the lower 21 bits of the ContextID field in the OA
> reports, which match exactly the hw_id stored in the context descriptor.
> 
> When using GuC submission we end up with a non-matching hw_id
> because GuC uses bit 20 of the hw_id to signal proxy submission. This
> change introduces a mask to compare only the relevant bits.
> 
> On ICL the context descriptor format has changed and we failed to
> address this. On top of using a mask we also need to shift the bits
> properly.
> 
> v2: Reuse lrc_desc rather than recomputing part of it (Chris/Michel)
> 
> Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin at intel.com>
> Fixes: 1de401c08fa805 ("drm/i915/perf: enable perf support on ICL")
> Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=104252
> BSpec: 1237
> Testcase: igt/perf/gen8-unprivileged-single-ctx-counters
> ---
>  drivers/gpu/drm/i915/i915_drv.h  |   1 +
>  drivers/gpu/drm/i915/i915_perf.c | 126 +++++++++++++++++++++++--------
>  drivers/gpu/drm/i915/intel_lrc.c |   5 ++
>  3 files changed, 102 insertions(+), 30 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index 0ae88b671e8d..c568160ea53c 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -1951,6 +1951,7 @@ struct drm_i915_private {
>  
>                         struct intel_context *pinned_ctx;
>                         u32 specific_ctx_id;
> +                       u32 specific_ctx_id_mask;
>  
>                         struct hrtimer poll_check_timer;
>                         wait_queue_head_t poll_wq;
> diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c
> index 4f0eb84b3c00..48124d206c79 100644
> --- a/drivers/gpu/drm/i915/i915_perf.c
> +++ b/drivers/gpu/drm/i915/i915_perf.c
> @@ -737,12 +737,7 @@ static int gen8_append_oa_reports(struct i915_perf_stream *stream,
>                         continue;
>                 }
>  
> -               /*
> -                * XXX: Just keep the lower 21 bits for now since I'm not
> -                * entirely sure if the HW touches any of the higher bits in
> -                * this field
> -                */
> -               ctx_id = report32[2] & 0x1fffff;
> +               ctx_id = report32[2] & dev_priv->perf.oa.specific_ctx_id_mask;
>  
>                 /*
>                  * Squash whatever is in the CTX_ID field if it's marked as
> @@ -1203,6 +1198,33 @@ static int i915_oa_read(struct i915_perf_stream *stream,
>         return dev_priv->perf.oa.ops.read(stream, buf, count, offset);
>  }
>  
> +static struct intel_context *oa_pin_context(struct drm_i915_private *i915,
> +                                           struct i915_gem_context *ctx)
> +{
> +       struct intel_engine_cs *engine = i915->engine[RCS];
> +       struct intel_context *ce;
> +       int ret;
> +
> +       ret = i915_mutex_lock_interruptible(&i915->drm);
> +       if (ret)
> +               return ERR_PTR(ret);
> +
> +       /*
> +        * As the ID is the gtt offset of the context's vma we
> +        * pin the vma to ensure the ID remains fixed.
> +        *
> +        * NB: implied RCS engine...
> +        */
> +       ce = intel_context_pin(ctx, engine);
> +       mutex_unlock(&i915->drm.struct_mutex);
> +       if (IS_ERR(ce))
> +               return ce;
> +
> +       i915->perf.oa.pinned_ctx = ce;
> +
> +       return ce;
> +}
> +
>  /**
>   * oa_get_render_ctx_id - determine and hold ctx hw id
>   * @stream: An i915-perf stream opened for OA metrics
> @@ -1215,40 +1237,83 @@ static int i915_oa_read(struct i915_perf_stream *stream,
>   */
>  static int oa_get_render_ctx_id(struct i915_perf_stream *stream)
>  {
> -       struct drm_i915_private *dev_priv = stream->dev_priv;
> +       struct drm_i915_private *i915 = stream->dev_priv;
>  
> -       if (HAS_LOGICAL_RING_CONTEXTS(dev_priv)) {
> -               dev_priv->perf.oa.specific_ctx_id = stream->ctx->hw_id;
> -       } else {
> -               struct intel_engine_cs *engine = dev_priv->engine[RCS];
> +       switch (INTEL_GEN(i915)) {
> +       case 7: {
>                 struct intel_context *ce;
> -               int ret;
> -
> -               ret = i915_mutex_lock_interruptible(&dev_priv->drm);
> -               if (ret)
> -                       return ret;
>  
> -               /*
> -                * As the ID is the gtt offset of the context's vma we
> -                * pin the vma to ensure the ID remains fixed.
> -                *
> -                * NB: implied RCS engine...
> -                */
> -               ce = intel_context_pin(stream->ctx, engine);
> -               mutex_unlock(&dev_priv->drm.struct_mutex);
> +               ce = oa_pin_context(i915, stream->ctx);
>                 if (IS_ERR(ce))
>                         return PTR_ERR(ce);
>  
> -               dev_priv->perf.oa.pinned_ctx = ce;
> -
>                 /*
> -                * Explicitly track the ID (instead of calling
> -                * i915_ggtt_offset() on the fly) considering the difference
> -                * with gen8+ and execlists
> +                * On Haswell we don't do any post processing of the reports
> +                * and don't need to use the mask.
>                  */
> -               dev_priv->perf.oa.specific_ctx_id = i915_ggtt_offset(ce->state);
> +               i915->perf.oa.specific_ctx_id = i915_ggtt_offset(ce->state);
> +               i915->perf.oa.specific_ctx_id_mask = 0;
> +               break;
>         }
>  
> +       case 8:
> +       case 9:
> +       case 10:
> +               if (USES_GUC_SUBMISSION(i915)) {
> +                       struct intel_context *ce;
> +
> +                       ce = oa_pin_context(i915, stream->ctx);
> +                       if (IS_ERR(ce))
> +                               return PTR_ERR(ce);
> +
> +                       /*
> +                        * When using GuC, the context descriptor we write in
> +                        * i915 is read by GuC and rewritten before it's
> +                        * actually written into the hardware. The LRCA is
> +                        * what is put into the context id field of the
> +                        * context descriptor by GuC. Because it's aligned to
> +                        * a page, the lower 12bits are always at 0 and
> +                        * dropped by GuC. They won't be part of the context
> +                        * ID in the OA reports, so squash those lower bits.
> +                        */
> +                       i915->perf.oa.specific_ctx_id =
> +                               lower_32_bits(ce->lrc_desc) >> 12;
> +
> +                       /*
> +                        * GuC uses the top bit to signal proxy submission, so
> +                        * ignore that bit.
> +                        */
> +                       i915->perf.oa.specific_ctx_id_mask =
> +                               (1U << (GEN8_CTX_ID_WIDTH - 1)) - 1;
> +               } else {
> +                       i915->perf.oa.specific_ctx_id = stream->ctx->hw_id;

Plan for the hw_id not being valid unless pinned. Just pin the
context everywhere and use ce for deriving the specific_ctx_id on all
paths.
-Chris