[Intel-gfx] [PATCH v16 15/17] drm/i915/perf: allow NOA muxes reprogramming before workloads

Fri Jun 9 18:17:46 UTC 2017

On 5 June 2017 at 15:48, Lionel Landwerlin
<lionel.g.landwerlin at intel.com> wrote:
> Dynamic slices/subslices shutdown will effectivelly loose the NOA
> configuration uploaded in the slices/subslices.
>
> Here we introduce a new parameter to configure the i915 perf driver
> when userspace wants to monitor parts of the GPU within
> slices/subslices and it knows that some of the workloads running on
> the system will disable slices/subslices.
>
> This new configuration option will force the i915 driver to reemit NOA
> configurations between context switches.
>
> v2: Make sure we handle configs with more register writes than the max
>     MI_LOAD_REGISTER_IMM can do (Lionel)
>
> v3: Introduce new parameter to the perf driver to make reprogramming
>     optional (Lionel)
>     Fix off by one calculation of number of required
>     MI_LOAD_REGISTER_IMM (Lionel)
>
> Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin at intel.com>
> ---
>  drivers/gpu/drm/i915/i915_drv.h         |  15 +++++
>  drivers/gpu/drm/i915/i915_perf.c        | 100 ++++++++++++++++++++++++++++++++
>  drivers/gpu/drm/i915/intel_ringbuffer.c |   3 +
>  include/uapi/drm/i915_drm.h             |  10 ++++
>  4 files changed, 128 insertions(+)
>
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index d53419e9bfa2..7ef1908e3a98 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -1964,6 +1964,12 @@ struct i915_perf_stream {
>         struct i915_gem_context *ctx;
>
>         /**
> +        * @noa_restore: Whether the user who opened the stream requested NOA
> +        * config to be restored between context switches.
> +        */
> +       bool noa_restore;
> +
> +       /**
>          * @enabled: Whether the stream is currently enabled, considering
>          * whether the stream was opened in a disabled state and based
>          * on `I915_PERF_IOCTL_ENABLE` and `I915_PERF_IOCTL_DISABLE` calls.
> @@ -2401,6 +2407,7 @@ struct drm_i915_private {
>                         const struct i915_oa_reg *mux_regs[6];
>                         int mux_regs_lens[6];
>                         int n_mux_configs;
> +                       int total_n_mux_regs;
>
>                         const struct i915_oa_reg *b_counter_regs;
>                         int b_counter_regs_len;
> @@ -2493,6 +2500,13 @@ struct drm_i915_private {
>                         struct i915_oa_ops ops;
>                         const struct i915_oa_format *oa_formats;
>                         int n_builtin_sets;
> +
> +                       /**
> +                        * Whether the user has requested the NOA
> +                        * configuration to be reprogrammed between context
> +                        * switches.
> +                        */
> +                       atomic_t noa_restore;
>                 } oa;
>         } perf;
>
> @@ -3543,6 +3557,7 @@ int i915_perf_open_ioctl(struct drm_device *dev, void *data,
>  void i915_oa_init_reg_state(struct intel_engine_cs *engine,
>                             struct i915_gem_context *ctx,
>                             uint32_t *reg_state);
> +int i915_oa_emit_noa_config_locked(struct drm_i915_gem_request *req);
>
>  /* i915_gem_evict.c */
>  int __must_check i915_gem_evict_something(struct i915_address_space *vm,
> diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c
> index 5222dac1ee5b..3f49ce69641b 100644
> --- a/drivers/gpu/drm/i915/i915_perf.c
> +++ b/drivers/gpu/drm/i915/i915_perf.c
> @@ -350,6 +350,8 @@ struct perf_open_properties {
>         u64 single_context:1;
>         u64 ctx_handle;
>
> +       bool noa_restore;
We need kernel doc for this, as pointed out by kbuilld.

> +
>         /* OA sampling state */
>         int metrics_set;
>         int oa_format;
> @@ -1451,11 +1453,24 @@ static void config_oa_regs(struct drm_i915_private *dev_priv,
>         }
>  }
>
> +static void count_total_mux_regs(struct drm_i915_private *dev_priv)
> +{
> +       int i;
> +
> +       dev_priv->perf.oa.total_n_mux_regs = 0;
> +       for (i = 0; i < dev_priv->perf.oa.n_mux_configs; i++) {
> +               dev_priv->perf.oa.total_n_mux_regs +=
> +                       dev_priv->perf.oa.mux_regs_lens[i];
> +       }
> +}
> +
>  static int hsw_enable_metric_set(struct drm_i915_private *dev_priv)
>  {
>         int ret = i915_oa_select_metric_set_hsw(dev_priv);
>         int i;
>
> +       count_total_mux_regs(dev_priv);
> +
>         if (ret)
>                 return ret;
>
> @@ -1777,6 +1792,8 @@ static int gen8_enable_metric_set(struct drm_i915_private *dev_priv)
>         int ret = dev_priv->perf.oa.ops.select_metric_set(dev_priv);
>         int i;
>
> +       count_total_mux_regs(dev_priv);
> +
>         if (ret)
>                 return ret;
>
> @@ -2065,6 +2082,11 @@ static int i915_oa_stream_init(struct i915_perf_stream *stream,
>                         return ret;
>         }
>
> +       if (props->noa_restore) {
> +               stream->noa_restore = true;
> +               atomic_inc(&dev_priv->perf.oa.noa_restore);
You need to undo this in the onion teardown.

> +       }
> +
>         ret = alloc_oa_buffer(dev_priv);
>         if (ret)
>                 goto err_oa_buf_alloc;
> @@ -2121,6 +2143,74 @@ void i915_oa_init_reg_state(struct intel_engine_cs *engine,
>         gen8_update_reg_state_unlocked(ctx, reg_state);
>  }
>
> +int i915_oa_emit_noa_config_locked(struct drm_i915_gem_request *req)
> +{
> +       struct drm_i915_private *dev_priv = req->i915;
> +       int max_load = 125;
Why 125?

> +       int n_lri, n_registers, n_loaded_register;
> +       int i, j;
> +       u32 *cs;
> +
> +       lockdep_assert_held(&dev_priv->drm.struct_mutex);
> +
> +       /* Perf not supported. */
> +       if (!dev_priv->perf.initialized)
> +               return 0;
> +
> +       /*
> +        * We do not expect dynamic slice/subslice configuration to change
> +        * across contexts prior to Gen8.
> +        */
> +       if (INTEL_GEN(dev_priv) < 8)
> +               return 0;
> +
> +       /* Has the user requested that NOA configuration be restored? */
> +       if (!atomic_read(&dev_priv->perf.oa.noa_restore))
> +               return 0;
> +
> +       n_registers = dev_priv->perf.oa.total_n_mux_regs;
> +       n_lri = (n_registers / max_load) + (n_registers % max_load) != 0;
> +
> +       cs = intel_ring_begin(req,
> +                             3 * 2 + /* MI_LOAD_REGISTER_IMM for chicken registers */
> +                             n_lri + /* MI_LOAD_REGISTER_IMM for mux registers */
> +                             n_registers * 2 + /* offset & value for mux registers*/
> +                             1 /* NOOP */);
> +       if (IS_ERR(cs))
> +               return PTR_ERR(cs);
Do we wan't to silently fail here, shouldn't we be screaming and shouting?

Also do we have to do re-upload the noa config unconditionally, is it
not feasible to only do so when there is a change in the slice/sublice
configuration between contexts?

> +
> +       *cs++ = MI_LOAD_REGISTER_IMM(1);
> +       *cs++ = i915_mmio_reg_offset(GDT_CHICKEN_BITS);
> +       *cs++ = 0xA0;
> +
> +       n_loaded_register = 0;
> +       for (i = 0; i < dev_priv->perf.oa.n_mux_configs; i++) {
> +               const struct i915_oa_reg *mux_regs =
> +                       dev_priv->perf.oa.mux_regs[i];
> +               const int mux_regs_len = dev_priv->perf.oa.mux_regs_lens[i];
> +
> +               for (j = 0; j < mux_regs_len; j++) {
> +                       if ((n_loaded_register % max_load) == 0) {
> +                               n_lri = min(n_registers - n_loaded_register, max_load);
> +                               *cs++ = MI_LOAD_REGISTER_IMM(n_lri);
> +                       }
> +
> +                       *cs++ = i915_mmio_reg_offset(mux_regs[j].addr);
> +                       *cs++ = mux_regs[j].value;
> +                       n_loaded_register++;
> +               }
> +       }
> +
> +       *cs++ = MI_LOAD_REGISTER_IMM(1);
> +       *cs++ = i915_mmio_reg_offset(GDT_CHICKEN_BITS);
> +       *cs++ = 0x80;
> +
> +       *cs++ = MI_NOOP;
> +       intel_ring_advance(req, cs);
> +
> +       return 0;
> +}
> +
>  /**
>   * i915_perf_read_locked - &i915_perf_stream_ops->read with error normalisation
>   * @stream: An i915 perf stream
> @@ -2433,9 +2523,14 @@ static long i915_perf_ioctl(struct file *file,
>   */
>  static void i915_perf_destroy_locked(struct i915_perf_stream *stream)
>  {
> +       struct drm_i915_private *dev_priv = stream->dev_priv;
> +
>         if (stream->enabled)
>                 i915_perf_disable_locked(stream);
>
> +       if (stream->noa_restore)
> +               atomic_dec(&dev_priv->perf.oa.noa_restore);
> +
>         if (stream->ops->destroy)
>                 stream->ops->destroy(stream);
>
> @@ -2769,6 +2864,9 @@ static int read_properties_unlocked(struct drm_i915_private *dev_priv,
>                         props->oa_periodic = true;
>                         props->oa_period_exponent = value;
>                         break;
> +               case DRM_I915_PERF_PROP_NOA_RESTORE:
> +                       props->noa_restore = true;
> +                       break;
>                 case DRM_I915_PERF_PROP_MAX:
>                         MISSING_CASE(id);
>                         return -EINVAL;
> @@ -3129,6 +3227,8 @@ void i915_perf_init(struct drm_i915_private *dev_priv)
>                 mutex_init(&dev_priv->perf.lock);
>                 spin_lock_init(&dev_priv->perf.oa.oa_buffer.ptr_lock);
>
> +               atomic_set(&dev_priv->perf.oa.noa_restore, 0);
> +
>                 oa_sample_rate_hard_limit =
>                         dev_priv->perf.oa.timestamp_frequency / 2;
>                 dev_priv->perf.sysctl_header = register_sysctl_table(dev_root);
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
> index acd1da9b62a3..67aaaebb194b 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.c
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
> @@ -1874,6 +1874,9 @@ gen8_emit_bb_start(struct drm_i915_gem_request *req,
>                         !(dispatch_flags & I915_DISPATCH_SECURE);
>         u32 *cs;
>
> +       /* Emit NOA config */
> +       i915_oa_emit_noa_config_locked(req);
> +
>         cs = intel_ring_begin(req, 4);
>         if (IS_ERR(cs))
>                 return PTR_ERR(cs);
> diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
> index 15bc9f78ba4d..27cc72a85d07 100644
> --- a/include/uapi/drm/i915_drm.h
> +++ b/include/uapi/drm/i915_drm.h
> @@ -1366,6 +1366,16 @@ enum drm_i915_perf_property_id {
>          */
>         DRM_I915_PERF_PROP_OA_EXPONENT,
>
> +       /**
> +        * Specifying this property will alterate the behavior of the i915
behaviour

> +        * driver, by forcing the driver to restore the NOA configuration of
> +        * the OA unit between context switches. You should only use this flag
> +        * if you think that that at least one application running on the
> +        * system is using a SSEU configurations with disabled
> +        * slices/subslices.
> +        */
> +       DRM_I915_PERF_PROP_NOA_RESTORE,
> +
>         DRM_I915_PERF_PROP_MAX /* non-ABI */
>  };
>
> --
> 2.11.0
>
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx at lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/intel-gfx