[Intel-gfx] [PATCH 24/24] RFC drm/i915: Expose a PMU interface for perf queries

Fri May 19 08:01:09 UTC 2017

On Thu, May 18, 2017 at 04:48:47PM -0700, Dmitry Rogozhkin wrote:
> 
> 
> On 5/18/2017 2:46 AM, Chris Wilson wrote:
> >The first goal is to be able to measure GPU (and invidual ring) busyness
> >without having to poll registers from userspace. (Which not only incurs
> >holding the forcewake lock indefinitely, perturbing the system, but also
> >runs the risk of hanging the machine.) As an alternative we can use the
> >perf event counter interface to sample the ring registers periodically
> >and send those results to userspace.
> >
> >To be able to do so, we need to export the two symbols from
> >kernel/events/core.c to register and unregister a PMU device.
> >
> >v2: Use a common timer for the ring sampling.
> >
> >Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
> >---
> >  drivers/gpu/drm/i915/Makefile           |   1 +
> >  drivers/gpu/drm/i915/i915_drv.c         |   2 +
> >  drivers/gpu/drm/i915/i915_drv.h         |  23 ++
> >  drivers/gpu/drm/i915/i915_pmu.c         | 449 ++++++++++++++++++++++++++++++++
> >  drivers/gpu/drm/i915/intel_ringbuffer.h |   2 +
> >  include/uapi/drm/i915_drm.h             |  40 +++
> >  kernel/events/core.c                    |   1 +
> >  7 files changed, 518 insertions(+)
> >  create mode 100644 drivers/gpu/drm/i915/i915_pmu.c
> >
> >diff --git a/drivers/gpu/drm/i915/Makefile b/drivers/gpu/drm/i915/Makefile
> >index 7b05fb802f4c..ca88e6e67910 100644
> >--- a/drivers/gpu/drm/i915/Makefile
> >+++ b/drivers/gpu/drm/i915/Makefile
> >@@ -26,6 +26,7 @@ i915-y := i915_drv.o \
> >  i915-$(CONFIG_COMPAT)   += i915_ioc32.o
> >  i915-$(CONFIG_DEBUG_FS) += i915_debugfs.o intel_pipe_crc.o
> >+i915-$(CONFIG_PERF_EVENTS) += i915_pmu.o
> >  # GEM code
> >  i915-y += i915_cmd_parser.o \
> >diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
> >index 2d2fb4327f97..e3c6d052d1c9 100644
> >--- a/drivers/gpu/drm/i915/i915_drv.c
> >+++ b/drivers/gpu/drm/i915/i915_drv.c
> >@@ -1144,6 +1144,7 @@ static void i915_driver_register(struct drm_i915_private *dev_priv)
> >  	struct drm_device *dev = &dev_priv->drm;
> >  	i915_gem_shrinker_init(dev_priv);
> >+	i915_pmu_register(dev_priv);
> >  	/*
> >  	 * Notify a valid surface after modesetting,
> >@@ -1197,6 +1198,7 @@ static void i915_driver_unregister(struct drm_i915_private *dev_priv)
> >  	intel_opregion_unregister(dev_priv);
> >  	i915_perf_unregister(dev_priv);
> >+	i915_pmu_unregister(dev_priv);
> >  	i915_teardown_sysfs(dev_priv);
> >  	i915_guc_log_unregister(dev_priv);
> >diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> >index 1fa1e7d48f02..10beae1a13c8 100644
> >--- a/drivers/gpu/drm/i915/i915_drv.h
> >+++ b/drivers/gpu/drm/i915/i915_drv.h
> >@@ -40,6 +40,7 @@
> >  #include <linux/hash.h>
> >  #include <linux/intel-iommu.h>
> >  #include <linux/kref.h>
> >+#include <linux/perf_event.h>
> >  #include <linux/pm_qos.h>
> >  #include <linux/reservation.h>
> >  #include <linux/shmem_fs.h>
> >@@ -2075,6 +2076,12 @@ struct intel_cdclk_state {
> >  	unsigned int cdclk, vco, ref;
> >  };
> >+enum {
> >+	__I915_SAMPLE_FREQ_ACT = 0,
> >+	__I915_SAMPLE_FREQ_REQ,
> >+	__I915_NUM_PMU_SAMPLERS
> >+};
> >+
> >  struct drm_i915_private {
> >  	struct drm_device drm;
> >@@ -2564,6 +2571,13 @@ struct drm_i915_private {
> >  		int	irq;
> >  	} lpe_audio;
> >+	struct {
> >+		struct pmu base;
> >+		struct hrtimer timer;
> >+		u64 enable;
> >+		u64 sample[__I915_NUM_PMU_SAMPLERS];
> >+	} pmu;
> >+
> >  	/*
> >  	 * NOTE: This is the dri1/ums dungeon, don't add stuff here. Your patch
> >  	 * will be rejected. Instead look for a better place.
> >@@ -3681,6 +3695,15 @@ extern void i915_perf_fini(struct drm_i915_private *dev_priv);
> >  extern void i915_perf_register(struct drm_i915_private *dev_priv);
> >  extern void i915_perf_unregister(struct drm_i915_private *dev_priv);
> >+/* i915_pmu.c */
> >+#ifdef CONFIG_PERF_EVENTS
> >+extern void i915_pmu_register(struct drm_i915_private *i915);
> >+extern void i915_pmu_unregister(struct drm_i915_private *i915);
> >+#else
> >+static inline void i915_pmu_register(struct drm_i915_private *i915) {}
> >+static inline void i915_pmu_unregister(struct drm_i915_private *i915) {}
> >+#endif
> >+
> >  /* i915_suspend.c */
> >  extern int i915_save_state(struct drm_i915_private *dev_priv);
> >  extern int i915_restore_state(struct drm_i915_private *dev_priv);
> >diff --git a/drivers/gpu/drm/i915/i915_pmu.c b/drivers/gpu/drm/i915/i915_pmu.c
> >new file mode 100644
> >index 000000000000..80e1c07841ac
> >--- /dev/null
> >+++ b/drivers/gpu/drm/i915/i915_pmu.c
> >@@ -0,0 +1,449 @@
> >+#include <linux/perf_event.h>
> >+#include <linux/pm_runtime.h>
> >+
> >+#include "i915_drv.h"
> >+#include "intel_ringbuffer.h"
> >+
> >+#define FREQUENCY 200
> >+#define PERIOD max_t(u64, 10000, NSEC_PER_SEC / FREQUENCY)
> >+
> >+#define RING_MASK 0xffffffff
> >+#define RING_MAX 32
> >+
> >+static void engines_sample(struct drm_i915_private *dev_priv)
> >+{
> >+	struct intel_engine_cs *engine;
> >+	enum intel_engine_id id;
> >+	bool fw = false;
> >+
> >+	if ((dev_priv->pmu.enable & RING_MASK) == 0)
> >+		return;
> >+
> >+	if (!dev_priv->gt.awake)
> >+		return;
> >+
> >+	if (!intel_runtime_pm_get_if_in_use(dev_priv))
> >+		return;
> >+
> >+	for_each_engine(engine, dev_priv, id) {
> >+		u32 val;
> >+
> >+		if ((dev_priv->pmu.enable & (0x7 << (4*id))) == 0)
> >+			continue;
> >+
> >+		if (i915_seqno_passed(intel_engine_get_seqno(engine),
> >+				      intel_engine_last_submit(engine)))
> >+			continue;
> >+
> >+		if (!fw) {
> >+			intel_uncore_forcewake_get(dev_priv, FORCEWAKE_ALL);
> >+			fw = true;
> >+		}
> >+
> >+		val = I915_READ_FW(RING_MI_MODE(engine->mmio_base));
> >+		if (!(val & MODE_IDLE))
> >+			engine->pmu_sample[I915_SAMPLE_BUSY] += PERIOD;
> Could you, please, check that I understand correctly what you
> attempt to do? Each PERIOD=10ms you check the status of the engines.
> If engine is in a busy state you count the PERIOD time as if the
> engine was busy during this period. If engine was in some other
> state you count this toward this state (wait or sema below as I
> understand).
> 
> If that's what you are trying to do, your solution can be somewhat
> used only for the cases where tasks are executed on the engines more
> then 2x PERIOD time, i.e. longer than 20ms.

It's a statistical sampling, same as any other. You have control over
the period, or at least perf does give you control (I may have not
hooked it up correctly). I set a totally arbitrary cap of 20us.

> Which is not the case
> for a lot of tasks submitted by media with the usual execution time
> of just few milliseconds.

If you are only issuing one batch lasting 1ms every second, the load is
more or less immaterial. 

> Considering that i915 currently tracks
> when batches were scheduled and when they are ready, you can easily
> calculate very strong precise metric of busy clocks for each engine
> from the perspective of i915, i.e. that will not be precise time of
> how long engine calculated things because it will include scheduling
> latency, but that is exactly what end-user requires: end-user does
> not care whether there is a latency or there is not, for him engine
> is busy all that time.

Which you can do yourself. I am not that interested in repeating
metrics already available to userspace. Furthermore, coupled in with the
statiscal sampling of perf, perf also provides access to the timestamps
of when each request is constructed, queued, executed, completed and
retired; and waited upon along with a host of other tracepoints.

> This is done in the alternate solution given
> by Tvrtko in "drm/i915: Export engine busy stats in debugfs" patch.
> Why you do not take this as a basis? Why this patch containing few
> arithmetic operations to calculate engines busy clocks is ignored?

Because this approach is independent of platform and plugs into the
existing perf infrastructure. And you need a far more convincing
argument to have an execlist-only code running inside interrupt
handlers. Tvrtko said himself that maintainability is paramount.
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre