[Intel-gfx] [PATCH v2] drm/i915/pmu: Measure sampler intervals
Tvrtko Ursulin
tvrtko.ursulin at linux.intel.com
Wed May 30 14:37:18 UTC 2018
On 30/05/2018 12:55, Chris Wilson wrote:
> hrtimer is not reliable enough to assume fixed intervals, and so even
> coarse accuracy (in the face of kasan and similar heavy debugging) we
> need to measure the actual interval between sample.
>
> While using a single timestamp to compute the interval does not allow
> very fine accuracy (consider the impact of a slow forcewake between
> different samples after the timestamp is read) is much better than
> assuming the interval.
>
> v2: Make use of the sample period for estimating the GPU clock cycles,
> leaving the frequency calculation (the averaging) to the caller.
> Introduce new samplers for reporting cycles instead of Hz.
>
> Testcase: igt/perf_pmu #ivb
> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
> Cc: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
> ---
> drivers/gpu/drm/i915/i915_pmu.c | 44 ++++++++++++++++++++++++++-------
> drivers/gpu/drm/i915/i915_pmu.h | 6 +++++
> include/uapi/drm/i915_drm.h | 2 ++
> 3 files changed, 43 insertions(+), 9 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_pmu.c b/drivers/gpu/drm/i915/i915_pmu.c
> index dc87797db500..12033e47e3b4 100644
> --- a/drivers/gpu/drm/i915/i915_pmu.c
> +++ b/drivers/gpu/drm/i915/i915_pmu.c
> @@ -86,6 +86,8 @@ static bool pmu_needs_timer(struct drm_i915_private *i915, bool gpu_active)
> */
> enable &= config_enabled_mask(I915_PMU_ACTUAL_FREQUENCY) |
> config_enabled_mask(I915_PMU_REQUESTED_FREQUENCY) |
> + config_enabled_mask(I915_PMU_ACTUAL_CLOCK) |
> + config_enabled_mask(I915_PMU_REQUESTED_CLOCK) |
> ENGINE_SAMPLE_MASK;
>
> /*
> @@ -127,6 +129,7 @@ static void __i915_pmu_maybe_start_timer(struct drm_i915_private *i915)
> {
> if (!i915->pmu.timer_enabled && pmu_needs_timer(i915, true)) {
> i915->pmu.timer_enabled = true;
> + i915->pmu.timestamp = ktime_get();
> hrtimer_start_range_ns(&i915->pmu.timer,
> ns_to_ktime(PERIOD), 0,
> HRTIMER_MODE_REL_PINNED);
> @@ -160,7 +163,7 @@ update_sample(struct i915_pmu_sample *sample, u32 unit, u32 val)
> sample->cur += mul_u32_u32(val, unit);
> }
>
> -static void engines_sample(struct drm_i915_private *dev_priv)
> +static void engines_sample(struct drm_i915_private *dev_priv, u64 period)
> {
> struct intel_engine_cs *engine;
> enum intel_engine_id id;
> @@ -183,7 +186,7 @@ static void engines_sample(struct drm_i915_private *dev_priv)
> val = !i915_seqno_passed(current_seqno, last_seqno);
>
> update_sample(&engine->pmu.sample[I915_SAMPLE_BUSY],
> - PERIOD, val);
> + period, val);
>
> if (val && (engine->pmu.enable &
> (BIT(I915_SAMPLE_WAIT) | BIT(I915_SAMPLE_SEMA)))) {
> @@ -195,10 +198,10 @@ static void engines_sample(struct drm_i915_private *dev_priv)
> }
>
> update_sample(&engine->pmu.sample[I915_SAMPLE_WAIT],
> - PERIOD, !!(val & RING_WAIT));
> + period, !!(val & RING_WAIT));
>
> update_sample(&engine->pmu.sample[I915_SAMPLE_SEMA],
> - PERIOD, !!(val & RING_WAIT_SEMAPHORE));
> + period, !!(val & RING_WAIT_SEMAPHORE));
> }
>
> if (fw)
> @@ -207,10 +210,11 @@ static void engines_sample(struct drm_i915_private *dev_priv)
> intel_runtime_pm_put(dev_priv);
> }
>
> -static void frequency_sample(struct drm_i915_private *dev_priv)
> +static void frequency_sample(struct drm_i915_private *dev_priv, u64 period)
> {
> if (dev_priv->pmu.enable &
> - config_enabled_mask(I915_PMU_ACTUAL_FREQUENCY)) {
> + (config_enabled_mask(I915_PMU_ACTUAL_CLOCK) |
> + config_enabled_mask(I915_PMU_ACTUAL_FREQUENCY))) {
> u32 val;
>
> val = dev_priv->gt_pm.rps.cur_freq;
> @@ -223,13 +227,20 @@ static void frequency_sample(struct drm_i915_private *dev_priv)
>
> update_sample(&dev_priv->pmu.sample[__I915_SAMPLE_FREQ_ACT],
> 1, intel_gpu_freq(dev_priv, val));
> + update_sample(&dev_priv->pmu.sample[__I915_SAMPLE_CLOCK_ACT],
> + period, intel_gpu_freq(dev_priv, val));
Cache intel_gpu_freq in a local.
> }
>
> if (dev_priv->pmu.enable &
> - config_enabled_mask(I915_PMU_REQUESTED_FREQUENCY)) {
> + (config_enabled_mask(I915_PMU_REQUESTED_CLOCK) |
> + config_enabled_mask(I915_PMU_REQUESTED_FREQUENCY))) {
> update_sample(&dev_priv->pmu.sample[__I915_SAMPLE_FREQ_REQ], 1,
> intel_gpu_freq(dev_priv,
> dev_priv->gt_pm.rps.cur_freq));
> + update_sample(&dev_priv->pmu.sample[__I915_SAMPLE_CLOCK_REQ],
> + period,
> + intel_gpu_freq(dev_priv,
> + dev_priv->gt_pm.rps.cur_freq));
Same here.
> }
> }
>
> @@ -237,12 +248,17 @@ static enum hrtimer_restart i915_sample(struct hrtimer *hrtimer)
> {
> struct drm_i915_private *i915 =
> container_of(hrtimer, struct drm_i915_private, pmu.timer);
> + ktime_t now, period;
>
> if (!READ_ONCE(i915->pmu.timer_enabled))
> return HRTIMER_NORESTART;
>
> - engines_sample(i915);
> - frequency_sample(i915);
> + now = ktime_get();
> + period = ktime_sub(now, i915->pmu.timestamp);
> + i915->pmu.timestamp = now;
> +
> + engines_sample(i915, period);
> + frequency_sample(i915, period);
>
> hrtimer_forward_now(hrtimer, ns_to_ktime(PERIOD));
> return HRTIMER_RESTART;
> @@ -313,11 +329,13 @@ config_status(struct drm_i915_private *i915, u64 config)
> {
> switch (config) {
> case I915_PMU_ACTUAL_FREQUENCY:
> + case I915_PMU_ACTUAL_CLOCK:
> if (IS_VALLEYVIEW(i915) || IS_CHERRYVIEW(i915))
> /* Requires a mutex for sampling! */
> return -ENODEV;
> /* Fall-through. */
> case I915_PMU_REQUESTED_FREQUENCY:
> + case I915_PMU_REQUESTED_CLOCK:
> if (INTEL_GEN(i915) < 6)
> return -ENODEV;
> break;
> @@ -526,6 +544,12 @@ static u64 __i915_pmu_event_read(struct perf_event *event)
> div_u64(i915->pmu.sample[__I915_SAMPLE_FREQ_REQ].cur,
> FREQUENCY);
> break;
> + case I915_PMU_ACTUAL_CLOCK:
> + val = i915->pmu.sample[__I915_SAMPLE_CLOCK_ACT].cur;
> + break;
> + case I915_PMU_REQUESTED_CLOCK:
> + val = i915->pmu.sample[__I915_SAMPLE_CLOCK_REQ].cur;
> + break;
> case I915_PMU_INTERRUPTS:
> val = count_interrupts(i915);
> break;
> @@ -803,6 +827,8 @@ create_event_attributes(struct drm_i915_private *i915)
> __event(I915_PMU_REQUESTED_FREQUENCY, "requested-frequency", "MHz"),
> __event(I915_PMU_INTERRUPTS, "interrupts", NULL),
> __event(I915_PMU_RC6_RESIDENCY, "rc6-residency", "ns"),
> + __event(I915_PMU_ACTUAL_CLOCK, "actual-clock", "Mcycles"),
> + __event(I915_PMU_REQUESTED_CLOCK, "requested-clock", "Mcycles"),
> };
> static const struct {
> enum drm_i915_pmu_engine_sample sample;
> diff --git a/drivers/gpu/drm/i915/i915_pmu.h b/drivers/gpu/drm/i915/i915_pmu.h
> index 2ba735299f7c..9c4252d85b5e 100644
> --- a/drivers/gpu/drm/i915/i915_pmu.h
> +++ b/drivers/gpu/drm/i915/i915_pmu.h
> @@ -17,6 +17,8 @@ struct drm_i915_private;
> enum {
> __I915_SAMPLE_FREQ_ACT = 0,
> __I915_SAMPLE_FREQ_REQ,
> + __I915_SAMPLE_CLOCK_ACT,
> + __I915_SAMPLE_CLOCK_REQ,
> __I915_SAMPLE_RC6,
> __I915_SAMPLE_RC6_ESTIMATED,
> __I915_NUM_PMU_SAMPLERS
> @@ -52,6 +54,10 @@ struct i915_pmu {
> * @timer: Timer for internal i915 PMU sampling.
> */
> struct hrtimer timer;
> + /**
> + * @timestamp: Timestamp of last internal i915 PMU sampling.
> + */
> + ktime_t timestamp;
> /**
> * @enable: Bitmask of all currently enabled events.
> *
> diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
> index 7f5634ce8e88..61ab71986274 100644
> --- a/include/uapi/drm/i915_drm.h
> +++ b/include/uapi/drm/i915_drm.h
> @@ -139,6 +139,8 @@ enum drm_i915_pmu_engine_sample {
> #define I915_PMU_REQUESTED_FREQUENCY __I915_PMU_OTHER(1)
> #define I915_PMU_INTERRUPTS __I915_PMU_OTHER(2)
> #define I915_PMU_RC6_RESIDENCY __I915_PMU_OTHER(3)
> +#define I915_PMU_ACTUAL_CLOCK __I915_PMU_OTHER(4)
> +#define I915_PMU_REQUESTED_CLOCK __I915_PMU_OTHER(5)
>
> #define I915_PMU_LAST I915_PMU_RC6_RESIDENCY
Bump this one.
I want to know if we could get away without introducing a new pair of
counter. For instance would running average of a period do for frequency
readout? It depends on what kind of error we are facing. Or a moving
average for some period? I would explore that but don't have an
Ivybridge so could you have a look?
Regards,
Tvrtko
More information about the Intel-gfx
mailing list