[Intel-gfx] [PATCH v10 07/10] drm/i915: Gate engine stats collection with a static key

Tvrtko Ursulin tursulin at ursulin.net
Thu Oct 5 13:05:26 UTC 2017


From: Tvrtko Ursulin <tvrtko.ursulin at intel.com>

This reduces the cost of the software engine busyness tracking
to a single no-op instruction when there are no listeners.

We add a new i915 ordered workqueue to be used only for tasks
not needing struct mutex.

v2: Rebase and some comments.
v3: Rebase.
v4: Checkpatch fixes.
v5: Rebase.
v6: Use system_long_wq to avoid being blocked by struct_mutex
    users.
v7: Fix bad conflict resolution from last rebase. (Dmitry Rogozhkin)
v8: Rebase.
v9:
 * Fix race between an unordered enable and a subsequent disable.
   (Chris Wilson)
 * Prettify order of local variable declarations. (Chris Wilson)
v10:
 Chris Wilson:
  * Fix workqueue allocation failure handling.
  * Expand comment about critical workqueue ordering.
  * Add some GEM_BUG_ONs to document impossible conditions.

Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
Reviewed-by: Chris Wilson <chris at chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_drv.c         |  13 +++-
 drivers/gpu/drm/i915/i915_drv.h         |   5 ++
 drivers/gpu/drm/i915/i915_pmu.c         |  65 ++++++++++++++++++--
 drivers/gpu/drm/i915/intel_engine_cs.c  |  17 ++++++
 drivers/gpu/drm/i915/intel_ringbuffer.h | 101 ++++++++++++++++++++------------
 5 files changed, 158 insertions(+), 43 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
index d373a840566a..3853038c6fdd 100644
--- a/drivers/gpu/drm/i915/i915_drv.c
+++ b/drivers/gpu/drm/i915/i915_drv.c
@@ -804,12 +804,22 @@ static int i915_workqueues_init(struct drm_i915_private *dev_priv)
 	if (dev_priv->wq == NULL)
 		goto out_err;
 
+	/*
+	 * Ordered workqueue for tasks not needing the global mutex but
+	 * which still need to be serialized.
+	 */
+	dev_priv->wq_misc = alloc_ordered_workqueue("i915-misc", 0);
+	if (dev_priv->wq_misc == NULL)
+		goto out_free_wq;
+
 	dev_priv->hotplug.dp_wq = alloc_ordered_workqueue("i915-dp", 0);
 	if (dev_priv->hotplug.dp_wq == NULL)
-		goto out_free_wq;
+		goto out_free_wq_misc;
 
 	return 0;
 
+out_free_wq_misc:
+	destroy_workqueue(dev_priv->wq_misc);
 out_free_wq:
 	destroy_workqueue(dev_priv->wq);
 out_err:
@@ -830,6 +840,7 @@ static void i915_engines_cleanup(struct drm_i915_private *i915)
 static void i915_workqueues_cleanup(struct drm_i915_private *dev_priv)
 {
 	destroy_workqueue(dev_priv->hotplug.dp_wq);
+	destroy_workqueue(dev_priv->wq_misc);
 	destroy_workqueue(dev_priv->wq);
 }
 
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 2dabe671bb50..efa3a8423cca 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -2349,6 +2349,11 @@ struct drm_i915_private {
 	 */
 	struct workqueue_struct *wq;
 
+	/**
+	 * @wq_misc: Workqueue for serialized tasks not needing struct mutex.
+	 */
+	struct workqueue_struct *wq_misc;
+
 	/* Display functions */
 	struct drm_i915_display_funcs display;
 
diff --git a/drivers/gpu/drm/i915/i915_pmu.c b/drivers/gpu/drm/i915/i915_pmu.c
index 2671e3640639..d552e415becd 100644
--- a/drivers/gpu/drm/i915/i915_pmu.c
+++ b/drivers/gpu/drm/i915/i915_pmu.c
@@ -458,11 +458,20 @@ static void i915_pmu_enable(struct perf_event *event)
 		GEM_BUG_ON(sample >= I915_PMU_SAMPLE_BITS);
 		GEM_BUG_ON(engine->pmu.enable_count[sample] == ~0);
 		if (engine->pmu.enable_count[sample]++ == 0) {
+			/*
+			 * Enable engine busy stats tracking if needed or
+			 * alternatively cancel the scheduled disabling of the
+			 * same.
+			 *
+			 * If the delayed disable was pending, cancel it and in
+			 * this case no need to queue a new enable.
+			 */
 			if (engine_needs_busy_stats(engine) &&
 			    !engine->pmu.busy_stats) {
-				engine->pmu.busy_stats =
-					intel_enable_engine_stats(engine) == 0;
-				WARN_ON_ONCE(!engine->pmu.busy_stats);
+				engine->pmu.busy_stats = true;
+				if (!cancel_delayed_work(&engine->pmu.disable_busy_stats))
+					queue_work(i915->wq_misc,
+						   &engine->pmu.enable_busy_stats);
 			}
 		}
 	}
@@ -505,7 +514,22 @@ static void i915_pmu_disable(struct perf_event *event)
 			if (!engine_needs_busy_stats(engine) &&
 			    engine->pmu.busy_stats) {
 				engine->pmu.busy_stats = false;
-				intel_disable_engine_stats(engine);
+				/*
+				 * We request a delayed disable to handle the
+				 * rapid on/off cycles on events, which can
+				 * happen when tools like perf stat start, in a
+				 * nicer way.
+				 *
+				 * PMU enable and disable callbacks are
+				 * serialized with the spinlock, and since we do
+				 * them in decoupled fashion from workers, it is
+				 * critical to use an ordered work queue to
+				 * preserve this ordering between the two
+				 * events.
+				 */
+				queue_delayed_work(i915->wq_misc,
+						   &engine->pmu.disable_busy_stats,
+						   round_jiffies_up_relative(HZ));
 			}
 		}
 	}
@@ -689,8 +713,26 @@ static int i915_pmu_cpu_offline(unsigned int cpu, struct hlist_node *node)
 	return 0;
 }
 
+static void __enable_busy_stats(struct work_struct *work)
+{
+	struct intel_engine_cs *engine =
+		container_of(work, typeof(*engine), pmu.enable_busy_stats);
+
+	WARN_ON_ONCE(intel_enable_engine_stats(engine));
+}
+
+static void __disable_busy_stats(struct work_struct *work)
+{
+	struct intel_engine_cs *engine =
+	       container_of(work, typeof(*engine), pmu.disable_busy_stats.work);
+
+	intel_disable_engine_stats(engine);
+}
+
 void i915_pmu_register(struct drm_i915_private *i915)
 {
+	struct intel_engine_cs *engine;
+	enum intel_engine_id id;
 	int ret;
 
 	if (INTEL_GEN(i915) <= 2) {
@@ -725,6 +767,12 @@ void i915_pmu_register(struct drm_i915_private *i915)
 	i915->pmu.timer.function = i915_sample;
 	i915->pmu.enable = 0;
 
+	for_each_engine(engine, i915, id) {
+		INIT_WORK(&engine->pmu.enable_busy_stats, __enable_busy_stats);
+		INIT_DELAYED_WORK(&engine->pmu.disable_busy_stats,
+				  __disable_busy_stats);
+	}
+
 	ret = perf_pmu_register(&i915->pmu.base, "i915", -1);
 	if (ret == 0)
 		return;
@@ -743,6 +791,9 @@ void i915_pmu_register(struct drm_i915_private *i915)
 
 void i915_pmu_unregister(struct drm_i915_private *i915)
 {
+	struct intel_engine_cs *engine;
+	enum intel_engine_id id;
+
 	if (!i915->pmu.base.event_init)
 		return;
 
@@ -754,6 +805,12 @@ void i915_pmu_unregister(struct drm_i915_private *i915)
 
 	hrtimer_cancel(&i915->pmu.timer);
 
+	for_each_engine(engine, i915, id) {
+		GEM_BUG_ON(engine->pmu.busy_stats);
+		GEM_BUG_ON(flush_work(&engine->pmu.enable_busy_stats));
+		flush_delayed_work(&engine->pmu.disable_busy_stats);
+	}
+
 	perf_pmu_unregister(&i915->pmu.base);
 	i915->pmu.base.event_init = NULL;
 }
diff --git a/drivers/gpu/drm/i915/intel_engine_cs.c b/drivers/gpu/drm/i915/intel_engine_cs.c
index 53d3b8f3c222..cbfde1149822 100644
--- a/drivers/gpu/drm/i915/intel_engine_cs.c
+++ b/drivers/gpu/drm/i915/intel_engine_cs.c
@@ -21,6 +21,7 @@
  * IN THE SOFTWARE.
  *
  */
+#include <linux/static_key.h>
 
 #include "i915_drv.h"
 #include "intel_ringbuffer.h"
@@ -1653,6 +1654,10 @@ intel_engine_lookup_user(struct drm_i915_private *i915, u8 class, u8 instance)
 	return i915->engine_class[class][instance];
 }
 
+DEFINE_STATIC_KEY_FALSE(i915_engine_stats_key);
+static DEFINE_MUTEX(i915_engine_stats_mutex);
+static int i915_engine_stats_ref;
+
 /**
  * intel_enable_engine_stats() - Enable engine busy tracking on engine
  * @engine: engine to enable stats collection
@@ -1668,6 +1673,8 @@ int intel_enable_engine_stats(struct intel_engine_cs *engine)
 	if (!i915_modparams.enable_execlists)
 		return -ENODEV;
 
+	mutex_lock(&i915_engine_stats_mutex);
+
 	spin_lock_irqsave(&engine->stats.lock, flags);
 	if (engine->stats.enabled == ~0)
 		goto busy;
@@ -1675,10 +1682,16 @@ int intel_enable_engine_stats(struct intel_engine_cs *engine)
 		engine->stats.enabled_at = ktime_get();
 	spin_unlock_irqrestore(&engine->stats.lock, flags);
 
+	if (i915_engine_stats_ref++ == 0)
+		static_branch_enable(&i915_engine_stats_key);
+
+	mutex_unlock(&i915_engine_stats_mutex);
+
 	return 0;
 
 busy:
 	spin_unlock_irqrestore(&engine->stats.lock, flags);
+	mutex_unlock(&i915_engine_stats_mutex);
 
 	return -EBUSY;
 }
@@ -1696,6 +1709,7 @@ void intel_disable_engine_stats(struct intel_engine_cs *engine)
 	if (!i915_modparams.enable_execlists)
 		return;
 
+	mutex_lock(&i915_engine_stats_mutex);
 	spin_lock_irqsave(&engine->stats.lock, flags);
 	WARN_ON_ONCE(engine->stats.enabled == 0);
 	if (--engine->stats.enabled == 0) {
@@ -1705,6 +1719,9 @@ void intel_disable_engine_stats(struct intel_engine_cs *engine)
 		engine->stats.total = 0;
 	}
 	spin_unlock_irqrestore(&engine->stats.lock, flags);
+	if (--i915_engine_stats_ref == 0)
+		static_branch_disable(&i915_engine_stats_key);
+	mutex_unlock(&i915_engine_stats_mutex);
 }
 
 /**
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index e61ab4f3dc90..adcc79d2e414 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -361,6 +361,22 @@ struct intel_engine_cs {
 		 * 		requested.
 		 */
 		bool busy_stats;
+		/**
+		 * @enable_busy_stats: Work item for engine busy stats enabling.
+		 *
+		 * Since the action can sleep it needs to be decoupled from the
+		 * perf API callback.
+		 */
+		struct work_struct enable_busy_stats;
+		/**
+		 * @disable_busy_stats: Work item for busy stats disabling.
+		 *
+		 * Same as with @enable_busy_stats action, with the difference
+		 * that we delay it in case there are rapid enable-disable
+		 * actions, which can happen during tool startup (like perf
+		 * stat).
+		 */
+		struct delayed_work disable_busy_stats;
 	} pmu;
 
 	/*
@@ -902,59 +918,68 @@ bool intel_engine_can_store_dword(struct intel_engine_cs *engine);
 struct intel_engine_cs *
 intel_engine_lookup_user(struct drm_i915_private *i915, u8 class, u8 instance);
 
+DECLARE_STATIC_KEY_FALSE(i915_engine_stats_key);
+
 static inline void intel_engine_context_in(struct intel_engine_cs *engine)
 {
 	unsigned long flags;
 
-	if (READ_ONCE(engine->stats.enabled) == 0)
-		return;
+	if (static_branch_unlikely(&i915_engine_stats_key)) {
+		if (READ_ONCE(engine->stats.enabled) == 0)
+			return;
 
-	spin_lock_irqsave(&engine->stats.lock, flags);
+		spin_lock_irqsave(&engine->stats.lock, flags);
 
-	if (engine->stats.enabled > 0) {
-		if (engine->stats.active++ == 0)
-			engine->stats.start = ktime_get();
-		GEM_BUG_ON(engine->stats.active == 0);
-	}
+			if (engine->stats.enabled > 0) {
+				if (engine->stats.active++ == 0)
+					engine->stats.start = ktime_get();
+				GEM_BUG_ON(engine->stats.active == 0);
+			}
 
-	spin_unlock_irqrestore(&engine->stats.lock, flags);
+		spin_unlock_irqrestore(&engine->stats.lock, flags);
+	}
 }
 
 static inline void intel_engine_context_out(struct intel_engine_cs *engine)
 {
 	unsigned long flags;
 
-	if (READ_ONCE(engine->stats.enabled) == 0)
-		return;
-
-	spin_lock_irqsave(&engine->stats.lock, flags);
-
-	if (engine->stats.enabled > 0) {
-		ktime_t last, now = ktime_get();
-
-		if (engine->stats.active && --engine->stats.active == 0) {
-			/*
-			 * Decrement the active context count and in case GPU
-			 * is now idle add up to the running total.
-			 */
-			last = ktime_sub(now, engine->stats.start);
-
-			engine->stats.total = ktime_add(engine->stats.total,
-							last);
-		} else if (engine->stats.active == 0) {
-			/*
-			 * After turning on engine stats, context out might be
-			 * the first event in which case we account from the
-			 * time stats gathering was turned on.
-			 */
-			last = ktime_sub(now, engine->stats.enabled_at);
-
-			engine->stats.total = ktime_add(engine->stats.total,
-							last);
+	if (static_branch_unlikely(&i915_engine_stats_key)) {
+		if (READ_ONCE(engine->stats.enabled) == 0)
+			return;
+
+		spin_lock_irqsave(&engine->stats.lock, flags);
+
+		if (engine->stats.enabled > 0) {
+			ktime_t last, now = ktime_get();
+
+			if (engine->stats.active &&
+			    --engine->stats.active == 0) {
+				/*
+				 * Decrement the active context count and in
+				 * case GPU is now idle add up to the running
+				 * total.
+				 */
+				last = ktime_sub(now, engine->stats.start);
+
+				engine->stats.total =
+					ktime_add(engine->stats.total, last);
+			} else if (engine->stats.active == 0) {
+				/*
+				 * After turning on engine stats, context out
+				 * might be the first event in which case we
+				 * account from the time stats gathering was
+				 * turned on.
+				 */
+				last = ktime_sub(now, engine->stats.enabled_at);
+
+				engine->stats.total =
+					ktime_add(engine->stats.total, last);
+			}
 		}
-	}
 
-	spin_unlock_irqrestore(&engine->stats.lock, flags);
+		spin_unlock_irqrestore(&engine->stats.lock, flags);
+	}
 }
 
 int intel_enable_engine_stats(struct intel_engine_cs *engine);
-- 
2.9.5



More information about the Intel-gfx mailing list