[Intel-gfx] [PATCH 07/10] drm/i915: Gate engine stats collection with a static key

Tue Oct 3 10:17:25 UTC 2017

Quoting Tvrtko Ursulin (2017-09-29 13:34:57)
> From: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
> 
> This reduces the cost of the software engine busyness tracking
> to a single no-op instruction when there are no listeners.
> 
> We add a new i915 ordered workqueue to be used only for tasks
> not needing struct mutex.
> 
> v2: Rebase and some comments.
> v3: Rebase.
> v4: Checkpatch fixes.
> v5: Rebase.
> v6: Use system_long_wq to avoid being blocked by struct_mutex
>     users.
> v7: Fix bad conflict resolution from last rebase. (Dmitry Rogozhkin)
> v8: Rebase.
> v9:
>  * Fix race between unordered enable followed by disable.
>    (Chris Wilson)
>  * Prettify order of local variable declarations. (Chris Wilson)

Ok, I can't see a downside to enabling the optimisation even if it will
be global and not per-device/per-engine.

> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
> ---
>  drivers/gpu/drm/i915/i915_drv.c         |  13 +++-
>  drivers/gpu/drm/i915/i915_drv.h         |   5 ++
>  drivers/gpu/drm/i915/i915_pmu.c         |  57 ++++++++++++++++--
>  drivers/gpu/drm/i915/intel_engine_cs.c  |  17 ++++++
>  drivers/gpu/drm/i915/intel_ringbuffer.h | 101 ++++++++++++++++++++------------
>  5 files changed, 150 insertions(+), 43 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
> index 187c0fad1b79..1e43d594b59a 100644
> --- a/drivers/gpu/drm/i915/i915_drv.c
> +++ b/drivers/gpu/drm/i915/i915_drv.c
> @@ -795,12 +795,22 @@ static int i915_workqueues_init(struct drm_i915_private *dev_priv)
>         if (dev_priv->wq == NULL)
>                 goto out_err;
>  
> +       /*
> +        * Ordered workqueue for tasks not needing the global mutex but
> +        * which still need to be serialized.
> +        */
> +       dev_priv->wq_misc = alloc_ordered_workqueue("i915-misc", 0);
> +       if (dev_priv->wq == NULL)

if (!dev_priv->wq_misc) I think you mean ;)

> +               goto out_free_wq;

Ok, Tried a few other names, liked none better.

> diff --git a/drivers/gpu/drm/i915/i915_pmu.c b/drivers/gpu/drm/i915/i915_pmu.c
> index 93c0e7ec7d75..4c9b9ae09b23 100644
> --- a/drivers/gpu/drm/i915/i915_pmu.c
> +++ b/drivers/gpu/drm/i915/i915_pmu.c
> @@ -458,11 +458,20 @@ static void i915_pmu_enable(struct perf_event *event)
>                 GEM_BUG_ON(sample >= I915_PMU_SAMPLE_BITS);
>                 GEM_BUG_ON(engine->pmu.enable_count[sample] == ~0);
>                 if (engine->pmu.enable_count[sample]++ == 0) {
> +                       /*
> +                        * Enable engine busy stats tracking if needed or
> +                        * alternatively cancel the scheduled disabling of the
> +                        * same.
> +                        *
> +                        * If the delayed disable was pending, cancel it and in
> +                        * this case no need to queue a new enable.
> +                        */
>                         if (engine_needs_busy_stats(engine) &&
>                             !engine->pmu.busy_stats) {
> -                               engine->pmu.busy_stats =
> -                                       intel_enable_engine_stats(engine) == 0;
> -                               WARN_ON_ONCE(!engine->pmu.busy_stats);
> +                               engine->pmu.busy_stats = true;
> +                               if (!cancel_delayed_work(&engine->pmu.disable_busy_stats))
> +                                       queue_work(i915->wq_misc,
> +                                                  &engine->pmu.enable_busy_stats);
>                         }
>                 }
>         }
> @@ -505,7 +514,15 @@ static void i915_pmu_disable(struct perf_event *event)
>                         if (!engine_needs_busy_stats(engine) &&
>                             engine->pmu.busy_stats) {
>                                 engine->pmu.busy_stats = false;
> -                               intel_disable_engine_stats(engine);
> +                               /*
> +                                * We request a delayed disable to handle the
> +                                * rapid on/off cycles on events which can
> +                                * happen when tools like perf stat start in a
> +                                * nicer way.
> +                                */

Ah, so this is where the work ordering plays a key role. A comment that
we depend on strict ordering within the workqueue to prevent the delayed
disable overtaking the enable worker (even when the workqueue jumps
between cpus).

> +                               queue_delayed_work(i915->wq_misc,
> +                                                  &engine->pmu.disable_busy_stats,
> +                                                  round_jiffies_up_relative(HZ));
>                         }
>                 }
>         }
> @@ -689,8 +706,26 @@ static int i915_pmu_cpu_offline(unsigned int cpu, struct hlist_node *node)
>         return 0;
>  }
>  
> +static void __enable_busy_stats(struct work_struct *work)
> +{
> +       struct intel_engine_cs *engine =
> +               container_of(work, typeof(*engine), pmu.enable_busy_stats);
> +
> +       WARN_ON_ONCE(intel_enable_engine_stats(engine));
> +}
> +
> +static void __disable_busy_stats(struct work_struct *work)
> +{
> +       struct intel_engine_cs *engine =
> +              container_of(work, typeof(*engine), pmu.disable_busy_stats.work);
> +
> +       intel_disable_engine_stats(engine);
> +}
> +
>  void i915_pmu_register(struct drm_i915_private *i915)
>  {
> +       struct intel_engine_cs *engine;
> +       enum intel_engine_id id;
>         int ret;
>  
>         if (INTEL_GEN(i915) <= 2) {
> @@ -725,6 +760,12 @@ void i915_pmu_register(struct drm_i915_private *i915)
>         i915->pmu.timer.function = i915_sample;
>         i915->pmu.enable = 0;
>  
> +       for_each_engine(engine, i915, id) {
> +               INIT_WORK(&engine->pmu.enable_busy_stats, __enable_busy_stats);
> +               INIT_DELAYED_WORK(&engine->pmu.disable_busy_stats,
> +                                 __disable_busy_stats);
> +       }
> +
>         ret = perf_pmu_register(&i915->pmu.base, "i915", -1);
>         if (ret == 0)
>                 return;
> @@ -743,6 +784,9 @@ void i915_pmu_register(struct drm_i915_private *i915)
>  
>  void i915_pmu_unregister(struct drm_i915_private *i915)
>  {
> +       struct intel_engine_cs *engine;
> +       enum intel_engine_id id;
> +
>         if (!i915->pmu.base.event_init)
>                 return;
>  
> @@ -754,6 +798,11 @@ void i915_pmu_unregister(struct drm_i915_private *i915)
>  
>         hrtimer_cancel(&i915->pmu.timer);
>  
> +       for_each_engine(engine, i915, id) {
GEM_BUG_ON(engine->pmu.busy_stats);

(Or a dose of PMU_BUG_ON?)

> +               flush_work(&engine->pmu.enable_busy_stats);
> +               flush_delayed_work(&engine->pmu.disable_busy_stats);

And then you only need to flush the disable_busy_stats, but no harm in
both.

Reviewed-by: Chris Wilson <chris at chris-wilson.co.uk>
-Chris