[Intel-gfx] [PATCH 07/10] drm/i915: Gate engine stats collection with a static key
Chris Wilson
chris at chris-wilson.co.uk
Tue Oct 3 10:17:25 UTC 2017
Quoting Tvrtko Ursulin (2017-09-29 13:34:57)
> From: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
>
> This reduces the cost of the software engine busyness tracking
> to a single no-op instruction when there are no listeners.
>
> We add a new i915 ordered workqueue to be used only for tasks
> not needing struct mutex.
>
> v2: Rebase and some comments.
> v3: Rebase.
> v4: Checkpatch fixes.
> v5: Rebase.
> v6: Use system_long_wq to avoid being blocked by struct_mutex
> users.
> v7: Fix bad conflict resolution from last rebase. (Dmitry Rogozhkin)
> v8: Rebase.
> v9:
> * Fix race between unordered enable followed by disable.
> (Chris Wilson)
> * Prettify order of local variable declarations. (Chris Wilson)
Ok, I can't see a downside to enabling the optimisation even if it will
be global and not per-device/per-engine.
> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
> ---
> drivers/gpu/drm/i915/i915_drv.c | 13 +++-
> drivers/gpu/drm/i915/i915_drv.h | 5 ++
> drivers/gpu/drm/i915/i915_pmu.c | 57 ++++++++++++++++--
> drivers/gpu/drm/i915/intel_engine_cs.c | 17 ++++++
> drivers/gpu/drm/i915/intel_ringbuffer.h | 101 ++++++++++++++++++++------------
> 5 files changed, 150 insertions(+), 43 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
> index 187c0fad1b79..1e43d594b59a 100644
> --- a/drivers/gpu/drm/i915/i915_drv.c
> +++ b/drivers/gpu/drm/i915/i915_drv.c
> @@ -795,12 +795,22 @@ static int i915_workqueues_init(struct drm_i915_private *dev_priv)
> if (dev_priv->wq == NULL)
> goto out_err;
>
> + /*
> + * Ordered workqueue for tasks not needing the global mutex but
> + * which still need to be serialized.
> + */
> + dev_priv->wq_misc = alloc_ordered_workqueue("i915-misc", 0);
> + if (dev_priv->wq == NULL)
if (!dev_priv->wq_misc) I think you mean ;)
> + goto out_free_wq;
Ok, Tried a few other names, liked none better.
> diff --git a/drivers/gpu/drm/i915/i915_pmu.c b/drivers/gpu/drm/i915/i915_pmu.c
> index 93c0e7ec7d75..4c9b9ae09b23 100644
> --- a/drivers/gpu/drm/i915/i915_pmu.c
> +++ b/drivers/gpu/drm/i915/i915_pmu.c
> @@ -458,11 +458,20 @@ static void i915_pmu_enable(struct perf_event *event)
> GEM_BUG_ON(sample >= I915_PMU_SAMPLE_BITS);
> GEM_BUG_ON(engine->pmu.enable_count[sample] == ~0);
> if (engine->pmu.enable_count[sample]++ == 0) {
> + /*
> + * Enable engine busy stats tracking if needed or
> + * alternatively cancel the scheduled disabling of the
> + * same.
> + *
> + * If the delayed disable was pending, cancel it and in
> + * this case no need to queue a new enable.
> + */
> if (engine_needs_busy_stats(engine) &&
> !engine->pmu.busy_stats) {
> - engine->pmu.busy_stats =
> - intel_enable_engine_stats(engine) == 0;
> - WARN_ON_ONCE(!engine->pmu.busy_stats);
> + engine->pmu.busy_stats = true;
> + if (!cancel_delayed_work(&engine->pmu.disable_busy_stats))
> + queue_work(i915->wq_misc,
> + &engine->pmu.enable_busy_stats);
> }
> }
> }
> @@ -505,7 +514,15 @@ static void i915_pmu_disable(struct perf_event *event)
> if (!engine_needs_busy_stats(engine) &&
> engine->pmu.busy_stats) {
> engine->pmu.busy_stats = false;
> - intel_disable_engine_stats(engine);
> + /*
> + * We request a delayed disable to handle the
> + * rapid on/off cycles on events which can
> + * happen when tools like perf stat start in a
> + * nicer way.
> + */
Ah, so this is where the work ordering plays a key role. A comment that
we depend on strict ordering within the workqueue to prevent the delayed
disable overtaking the enable worker (even when the workqueue jumps
between cpus).
> + queue_delayed_work(i915->wq_misc,
> + &engine->pmu.disable_busy_stats,
> + round_jiffies_up_relative(HZ));
> }
> }
> }
> @@ -689,8 +706,26 @@ static int i915_pmu_cpu_offline(unsigned int cpu, struct hlist_node *node)
> return 0;
> }
>
> +static void __enable_busy_stats(struct work_struct *work)
> +{
> + struct intel_engine_cs *engine =
> + container_of(work, typeof(*engine), pmu.enable_busy_stats);
> +
> + WARN_ON_ONCE(intel_enable_engine_stats(engine));
> +}
> +
> +static void __disable_busy_stats(struct work_struct *work)
> +{
> + struct intel_engine_cs *engine =
> + container_of(work, typeof(*engine), pmu.disable_busy_stats.work);
> +
> + intel_disable_engine_stats(engine);
> +}
> +
> void i915_pmu_register(struct drm_i915_private *i915)
> {
> + struct intel_engine_cs *engine;
> + enum intel_engine_id id;
> int ret;
>
> if (INTEL_GEN(i915) <= 2) {
> @@ -725,6 +760,12 @@ void i915_pmu_register(struct drm_i915_private *i915)
> i915->pmu.timer.function = i915_sample;
> i915->pmu.enable = 0;
>
> + for_each_engine(engine, i915, id) {
> + INIT_WORK(&engine->pmu.enable_busy_stats, __enable_busy_stats);
> + INIT_DELAYED_WORK(&engine->pmu.disable_busy_stats,
> + __disable_busy_stats);
> + }
> +
> ret = perf_pmu_register(&i915->pmu.base, "i915", -1);
> if (ret == 0)
> return;
> @@ -743,6 +784,9 @@ void i915_pmu_register(struct drm_i915_private *i915)
>
> void i915_pmu_unregister(struct drm_i915_private *i915)
> {
> + struct intel_engine_cs *engine;
> + enum intel_engine_id id;
> +
> if (!i915->pmu.base.event_init)
> return;
>
> @@ -754,6 +798,11 @@ void i915_pmu_unregister(struct drm_i915_private *i915)
>
> hrtimer_cancel(&i915->pmu.timer);
>
> + for_each_engine(engine, i915, id) {
GEM_BUG_ON(engine->pmu.busy_stats);
(Or a dose of PMU_BUG_ON?)
> + flush_work(&engine->pmu.enable_busy_stats);
> + flush_delayed_work(&engine->pmu.disable_busy_stats);
And then you only need to flush the disable_busy_stats, but no harm in
both.
Reviewed-by: Chris Wilson <chris at chris-wilson.co.uk>
-Chris
More information about the Intel-gfx
mailing list