[Intel-gfx] [RFC v4 2/5] drm/i915/pmu: serve global events and support perf stat
Dmitry Rogozhkin
dmitry.v.rogozhkin at intel.com
Fri Sep 1 15:56:29 UTC 2017
This patch should probably be squashed with Tvrtko's PMU enabling
patch...
Make perf-stat work with the i915 PMU. The key point is that the
current i915 PMU implementation exposes global counters rather
than per-task counters. Thus, the required changes (condensed in the
sketch after this list) are:
* Register PMU with .task_ctx_nr=perf_invalid_context
* Expose cpumask for the PMU with the single CPU in the mask
* Properly support pmu->stop(): it should call pmu->read()
* Properly support pmu->del(): it should call stop(event, PERF_EF_UPDATE)
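For reference, the four changes above condense into the following
sketch (function and field names are the ones from the diff below;
everything else is trimmed):

	static void i915_pmu_event_stop(struct perf_event *event, int flags)
	{
		/* Fold in the final counter value before stopping. */
		if (flags & PERF_EF_UPDATE)
			i915_pmu_event_read(event);
		i915_pmu_disable(event);
	}

	static void i915_pmu_event_del(struct perf_event *event, int flags)
	{
		/* del() is stop() plus a mandatory final update. */
		i915_pmu_event_stop(event, PERF_EF_UPDATE);
	}

	void i915_pmu_register(struct drm_i915_private *i915)
	{
		/* Global counters: no per-task contexts ... */
		i915->pmu.base.task_ctx_nr = perf_invalid_context;
		/* ... and attr_groups now includes the cpumask attribute,
		 * so perf-stat binds each event to one designated CPU. */
		i915->pmu.base.attr_groups = i915_pmu_attr_groups;
		/* remaining callbacks and perf_pmu_register() omitted */
	}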
Examples:
perf stat -e i915/rcs0-busy/ workload.sh
This should error out with "failed to read counter" because the
requested counter can't be queried in per-task mode, which is the
mode perf-stat uses when run without -a.
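The failure comes from event_init in the diff below: with
.task_ctx_nr = perf_invalid_context, events must be CPU-bound, so a
per-task event is rejected up front:

	if (event->cpu < 0)
		return -EINVAL;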
perf stat -e i915/rcs0-busy/ -a workload.sh
perf stat -e i915/rcs0-busy/ -a -C0 workload.sh
These two commands should give the same result with the correct counter
values. The counter value will be queried once at the end of the workload.
The example output would be:
Performance counter stats for 'system wide':
649,547,987 i915/rcs0-busy/
1.895530680 seconds time elapsed
perf stat -e i915/rcs0-busy/ -a -I 100 workload.sh
This will query the counter periodically (every 100ms) and dump output:
0.100108369 4,137,438 i915/rcs0-busy/
i915/rcs0-busy/: 37037414 100149071 100149071
0.200249024 37,037,414 i915/rcs0-busy/
i915/rcs0-busy/: 36935429 100145077 100145077
0.300391916 36,935,429 i915/rcs0-busy/
i915/rcs0-busy/: 34262017 100126136 100126136
0.400518037 34,262,017 i915/rcs0-busy/
i915/rcs0-busy/: 34539960 100126217 100126217
v1: Make pmu.busy_stats a reference counter to avoid busy stats going
away when one of the events using them is deleted.
v2: Expose a cpumask for the i915 PMU to avoid creation of multiple
events of the same type followed by counter aggregation by perf-stat.
v3: Track CPUs going online/offline to migrate the perf context. Since
the cpumask will (most likely) initially contain CPU0,
CONFIG_BOOTPARAM_HOTPLUG_CPU0 is needed to see the effect of the CPU
status tracking.
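With the cpumask exposed (v2), perf-stat picks the designated reader
CPU automatically. The mask itself should also be visible through the
standard event_source sysfs node, e.g. (CPU0 shown on the assumption
it is the first CPU to come online):

cat /sys/bus/event_source/devices/i915/cpumask
0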
Change-Id: I7d1abe747a4399196e72253f7b66441a6528dbee
Signed-off-by: Dmitry Rogozhkin <dmitry.v.rogozhkin at intel.com>
Cc: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
Cc: Peter Zijlstra <peterz at infradead.org>
---
drivers/gpu/drm/i915/i915_drv.h | 19 +++---
drivers/gpu/drm/i915/i915_pmu.c | 115 +++++++++++++++++++++++++++-----
drivers/gpu/drm/i915/intel_ringbuffer.h | 2 +-
3 files changed, 110 insertions(+), 26 deletions(-)
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index b59da2c..e629e5e 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -2153,6 +2153,16 @@ enum {
__I915_NUM_PMU_SAMPLERS
};
+struct i915_pmu {
+ struct hlist_node node;
+ struct pmu base;
+ spinlock_t lock;
+ struct hrtimer timer;
+ bool timer_enabled;
+ u64 enable;
+ u64 sample[__I915_NUM_PMU_SAMPLERS];
+};
+
struct drm_i915_private {
struct drm_device drm;
@@ -2660,14 +2670,7 @@ struct drm_i915_private {
int irq;
} lpe_audio;
- struct {
- struct pmu base;
- spinlock_t lock;
- struct hrtimer timer;
- bool timer_enabled;
- u64 enable;
- u64 sample[__I915_NUM_PMU_SAMPLERS];
- } pmu;
+ struct i915_pmu pmu;
/*
* NOTE: This is the dri1/ums dungeon, don't add stuff here. Your patch
diff --git a/drivers/gpu/drm/i915/i915_pmu.c b/drivers/gpu/drm/i915/i915_pmu.c
index bcdf2bc..7bfedb7 100644
--- a/drivers/gpu/drm/i915/i915_pmu.c
+++ b/drivers/gpu/drm/i915/i915_pmu.c
@@ -15,6 +15,8 @@
#define ENGINE_SAMPLE_BITS (16)
+static cpumask_t i915_pmu_cpumask = CPU_MASK_NONE;
+
static u8 engine_config_sample(u64 config)
{
return config & I915_PMU_SAMPLE_MASK;
@@ -285,16 +287,24 @@ static int i915_pmu_event_init(struct perf_event *event)
{
struct drm_i915_private *i915 =
container_of(event->pmu, typeof(*i915), pmu.base);
+ int cpu;
int ret;
- /* XXX ideally only want pid == -1 && cpu == -1 */
-
if (event->attr.type != event->pmu->type)
return -ENOENT;
if (has_branch_stack(event))
return -EOPNOTSUPP;
+ if (event->cpu < 0)
+ return -EINVAL;
+
+ cpu = cpumask_any_and(&i915_pmu_cpumask,
+ topology_sibling_cpumask(event->cpu));
+
+ if (cpu >= nr_cpu_ids)
+ return -ENODEV;
+
ret = 0;
if (is_engine_event(event)) {
ret = engine_event_init(event);
@@ -314,6 +324,7 @@ static int i915_pmu_event_init(struct perf_event *event)
if (ret)
return ret;
+ event->cpu = cpu;
if (!event->parent)
event->destroy = i915_pmu_event_destroy;
@@ -469,11 +480,16 @@ static void i915_pmu_enable(struct perf_event *event)
ns_to_ktime(PERIOD), 0,
HRTIMER_MODE_REL_PINNED);
i915->pmu.timer_enabled = true;
- } else if (is_engine_event(event) && engine_needs_busy_stats(engine) &&
- !engine->pmu.busy_stats) {
- engine->pmu.busy_stats = true;
- if (!cancel_delayed_work(&engine->pmu.disable_busy_stats))
- queue_work(i915->wq, &engine->pmu.enable_busy_stats);
+ } else if (is_engine_event(event) && engine_needs_busy_stats(engine)) {
+		/* Enable busy stats for the first event demanding it, further
+		 * events just take a reference. So if some events go away,
+		 * busy stats stay enabled until the remaining events stop
+		 * using them.
+		 */
+ if (engine->pmu.busy_stats++ == 0) {
+ if (!cancel_delayed_work(&engine->pmu.disable_busy_stats))
+ queue_work(i915->wq, &engine->pmu.enable_busy_stats);
+ }
}
spin_unlock_irqrestore(&i915->pmu.lock, flags);
@@ -495,16 +511,16 @@ static void i915_pmu_disable(struct perf_event *event)
enum intel_engine_id id;
engine = intel_engine_lookup_user(i915,
- engine_event_class(event),
- engine_event_instance(event));
+ engine_event_class(event),
+ engine_event_instance(event));
GEM_BUG_ON(!engine);
engine->pmu.enable &= ~BIT(engine_event_sample(event));
- if (engine->pmu.busy_stats &&
- !engine_needs_busy_stats(engine)) {
- engine->pmu.busy_stats = false;
- queue_delayed_work(i915->wq,
- &engine->pmu.disable_busy_stats,
- round_jiffies_up_relative(2 * HZ));
+ if (!engine_needs_busy_stats(engine)) {
+ if (engine->pmu.busy_stats && --engine->pmu.busy_stats == 0) {
+ queue_delayed_work(i915->wq,
+ &engine->pmu.disable_busy_stats,
+ round_jiffies_up_relative(2 * HZ));
+ }
}
mask = 0;
for_each_engine(engine, i915, id)
@@ -529,6 +545,8 @@ static void i915_pmu_event_start(struct perf_event *event, int flags)
static void i915_pmu_event_stop(struct perf_event *event, int flags)
{
+ if (flags & PERF_EF_UPDATE)
+ i915_pmu_event_read(event);
i915_pmu_disable(event);
}
@@ -546,7 +564,7 @@ static int i915_pmu_event_add(struct perf_event *event, int flags)
static void i915_pmu_event_del(struct perf_event *event, int flags)
{
- i915_pmu_disable(event);
+ i915_pmu_event_stop(event, PERF_EF_UPDATE);
}
static int i915_pmu_event_event_idx(struct perf_event *event)
@@ -656,9 +674,28 @@ static ssize_t i915_pmu_event_show(struct device *dev,
.attrs = i915_pmu_events_attrs,
};
+static ssize_t i915_pmu_get_attr_cpumask(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ return cpumap_print_to_pagebuf(true, buf, &i915_pmu_cpumask);
+}
+
+static DEVICE_ATTR(cpumask, S_IRUGO, i915_pmu_get_attr_cpumask, NULL);
+
+static struct attribute *i915_cpumask_attrs[] = {
+ &dev_attr_cpumask.attr,
+ NULL,
+};
+
+static struct attribute_group i915_pmu_cpumask_attr_group = {
+ .attrs = i915_cpumask_attrs,
+};
+
static const struct attribute_group *i915_pmu_attr_groups[] = {
&i915_pmu_format_attr_group,
&i915_pmu_events_attr_group,
+ &i915_pmu_cpumask_attr_group,
NULL
};
@@ -678,16 +715,57 @@ static void __disable_busy_stats(struct work_struct *work)
intel_disable_engine_stats(engine);
}
+static int i915_pmu_cpu_online(unsigned int cpu, struct hlist_node *node)
+{
+ unsigned int target;
+
+	/* Select the first online CPU as a designated reader. */
+	target = cpumask_any(&i915_pmu_cpumask);
+	if (target >= nr_cpu_ids)
+		cpumask_set_cpu(cpu, &i915_pmu_cpumask);
+ return 0;
+}
+
+static int i915_pmu_cpu_offline(unsigned int cpu, struct hlist_node *node)
+{
+ struct i915_pmu *pmu = hlist_entry_safe(node, typeof(*pmu), node);
+ unsigned int target;
+
+ if (cpumask_test_and_clear_cpu(cpu, &i915_pmu_cpumask)) {
+ target = cpumask_any_but(topology_sibling_cpumask(cpu), cpu);
+ /* Migrate events if there is a valid target */
+ if (target < nr_cpu_ids) {
+ cpumask_set_cpu(target, &i915_pmu_cpumask);
+ perf_pmu_migrate_context(&pmu->base, cpu, target);
+ }
+ }
+ return 0;
+}
+
void i915_pmu_register(struct drm_i915_private *i915)
{
struct intel_engine_cs *engine;
enum intel_engine_id id;
+ int err;
if (INTEL_GEN(i915) <= 2)
return;
+ err = cpuhp_setup_state_multi(CPUHP_AP_PERF_X86_UNCORE_ONLINE,
+ "perf/x86/intel/i915:online",
+ i915_pmu_cpu_online,
+ i915_pmu_cpu_offline);
+ if (err)
+ return;
+
+ err = cpuhp_state_add_instance(CPUHP_AP_PERF_X86_UNCORE_ONLINE,
+ &i915->pmu.node);
+ if (err)
+ return;
+
i915->pmu.base.attr_groups = i915_pmu_attr_groups;
- i915->pmu.base.task_ctx_nr = perf_sw_context;
+ i915->pmu.base.task_ctx_nr = perf_invalid_context;
i915->pmu.base.event_init = i915_pmu_event_init;
i915->pmu.base.add = i915_pmu_event_add;
i915->pmu.base.del = i915_pmu_event_del;
@@ -731,4 +809,7 @@ void i915_pmu_unregister(struct drm_i915_private *i915)
flush_work(&engine->pmu.enable_busy_stats);
flush_delayed_work(&engine->pmu.disable_busy_stats);
}
+
+ cpuhp_state_remove_instance(CPUHP_AP_PERF_X86_UNCORE_ONLINE,
+ &i915->pmu.node);
}
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index fd5d838..0530e4e 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -248,7 +248,7 @@ struct intel_engine_cs {
struct {
u32 enable;
u64 sample[4];
- bool busy_stats;
+ unsigned int busy_stats;
struct work_struct enable_busy_stats;
struct delayed_work disable_busy_stats;
} pmu;
--
1.8.3.1