[Intel-gfx] [PATCH v2] drm/i915/pmu: Measure sampler intervals

Tvrtko Ursulin tvrtko.ursulin at linux.intel.com
Wed May 30 14:37:18 UTC 2018


On 30/05/2018 12:55, Chris Wilson wrote:
> hrtimer is not reliable enough to assume fixed intervals, and so even
> coarse accuracy (in the face of kasan and similar heavy debugging) we
> need to measure the actual interval between sample.
> 
> While using a single timestamp to compute the interval does not allow
> very fine accuracy (consider the impact of a slow forcewake between
> different samples after the timestamp is read) is much better than
> assuming the interval.
> 
> v2: Make use of the sample period for estimating the GPU clock cycles,
> leaving the frequency calculation (the averaging) to the caller.
> Introduce new samplers for reporting cycles instead of Hz.
> 
> Testcase: igt/perf_pmu #ivb
> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
> Cc: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
> ---
>   drivers/gpu/drm/i915/i915_pmu.c | 44 ++++++++++++++++++++++++++-------
>   drivers/gpu/drm/i915/i915_pmu.h |  6 +++++
>   include/uapi/drm/i915_drm.h     |  2 ++
>   3 files changed, 43 insertions(+), 9 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_pmu.c b/drivers/gpu/drm/i915/i915_pmu.c
> index dc87797db500..12033e47e3b4 100644
> --- a/drivers/gpu/drm/i915/i915_pmu.c
> +++ b/drivers/gpu/drm/i915/i915_pmu.c
> @@ -86,6 +86,8 @@ static bool pmu_needs_timer(struct drm_i915_private *i915, bool gpu_active)
>   	 */
>   	enable &= config_enabled_mask(I915_PMU_ACTUAL_FREQUENCY) |
>   		  config_enabled_mask(I915_PMU_REQUESTED_FREQUENCY) |
> +		  config_enabled_mask(I915_PMU_ACTUAL_CLOCK) |
> +		  config_enabled_mask(I915_PMU_REQUESTED_CLOCK) |
>   		  ENGINE_SAMPLE_MASK;
>   
>   	/*
> @@ -127,6 +129,7 @@ static void __i915_pmu_maybe_start_timer(struct drm_i915_private *i915)
>   {
>   	if (!i915->pmu.timer_enabled && pmu_needs_timer(i915, true)) {
>   		i915->pmu.timer_enabled = true;
> +		i915->pmu.timestamp = ktime_get();
>   		hrtimer_start_range_ns(&i915->pmu.timer,
>   				       ns_to_ktime(PERIOD), 0,
>   				       HRTIMER_MODE_REL_PINNED);
> @@ -160,7 +163,7 @@ update_sample(struct i915_pmu_sample *sample, u32 unit, u32 val)
>   	sample->cur += mul_u32_u32(val, unit);
>   }
>   
> -static void engines_sample(struct drm_i915_private *dev_priv)
> +static void engines_sample(struct drm_i915_private *dev_priv, u64 period)
>   {
>   	struct intel_engine_cs *engine;
>   	enum intel_engine_id id;
> @@ -183,7 +186,7 @@ static void engines_sample(struct drm_i915_private *dev_priv)
>   		val = !i915_seqno_passed(current_seqno, last_seqno);
>   
>   		update_sample(&engine->pmu.sample[I915_SAMPLE_BUSY],
> -			      PERIOD, val);
> +			      period, val);
>   
>   		if (val && (engine->pmu.enable &
>   		    (BIT(I915_SAMPLE_WAIT) | BIT(I915_SAMPLE_SEMA)))) {
> @@ -195,10 +198,10 @@ static void engines_sample(struct drm_i915_private *dev_priv)
>   		}
>   
>   		update_sample(&engine->pmu.sample[I915_SAMPLE_WAIT],
> -			      PERIOD, !!(val & RING_WAIT));
> +			      period, !!(val & RING_WAIT));
>   
>   		update_sample(&engine->pmu.sample[I915_SAMPLE_SEMA],
> -			      PERIOD, !!(val & RING_WAIT_SEMAPHORE));
> +			      period, !!(val & RING_WAIT_SEMAPHORE));
>   	}
>   
>   	if (fw)
> @@ -207,10 +210,11 @@ static void engines_sample(struct drm_i915_private *dev_priv)
>   	intel_runtime_pm_put(dev_priv);
>   }
>   
> -static void frequency_sample(struct drm_i915_private *dev_priv)
> +static void frequency_sample(struct drm_i915_private *dev_priv, u64 period)
>   {
>   	if (dev_priv->pmu.enable &
> -	    config_enabled_mask(I915_PMU_ACTUAL_FREQUENCY)) {
> +	    (config_enabled_mask(I915_PMU_ACTUAL_CLOCK) |
> +	     config_enabled_mask(I915_PMU_ACTUAL_FREQUENCY))) {
>   		u32 val;
>   
>   		val = dev_priv->gt_pm.rps.cur_freq;
> @@ -223,13 +227,20 @@ static void frequency_sample(struct drm_i915_private *dev_priv)
>   
>   		update_sample(&dev_priv->pmu.sample[__I915_SAMPLE_FREQ_ACT],
>   			      1, intel_gpu_freq(dev_priv, val));
> +		update_sample(&dev_priv->pmu.sample[__I915_SAMPLE_CLOCK_ACT],
> +			      period, intel_gpu_freq(dev_priv, val));

Cache intel_gpu_freq in a local.

>   	}
>   
>   	if (dev_priv->pmu.enable &
> -	    config_enabled_mask(I915_PMU_REQUESTED_FREQUENCY)) {
> +	    (config_enabled_mask(I915_PMU_REQUESTED_CLOCK) |
> +	     config_enabled_mask(I915_PMU_REQUESTED_FREQUENCY))) {
>   		update_sample(&dev_priv->pmu.sample[__I915_SAMPLE_FREQ_REQ], 1,
>   			      intel_gpu_freq(dev_priv,
>   					     dev_priv->gt_pm.rps.cur_freq));
> +		update_sample(&dev_priv->pmu.sample[__I915_SAMPLE_CLOCK_REQ],
> +			      period,
> +			      intel_gpu_freq(dev_priv,
> +					     dev_priv->gt_pm.rps.cur_freq));

Same here.

>   	}
>   }
>   
> @@ -237,12 +248,17 @@ static enum hrtimer_restart i915_sample(struct hrtimer *hrtimer)
>   {
>   	struct drm_i915_private *i915 =
>   		container_of(hrtimer, struct drm_i915_private, pmu.timer);
> +	ktime_t now, period;
>   
>   	if (!READ_ONCE(i915->pmu.timer_enabled))
>   		return HRTIMER_NORESTART;
>   
> -	engines_sample(i915);
> -	frequency_sample(i915);
> +	now = ktime_get();
> +	period = ktime_sub(now, i915->pmu.timestamp);
> +	i915->pmu.timestamp = now;
> +
> +	engines_sample(i915, period);
> +	frequency_sample(i915, period);
>   
>   	hrtimer_forward_now(hrtimer, ns_to_ktime(PERIOD));
>   	return HRTIMER_RESTART;
> @@ -313,11 +329,13 @@ config_status(struct drm_i915_private *i915, u64 config)
>   {
>   	switch (config) {
>   	case I915_PMU_ACTUAL_FREQUENCY:
> +	case I915_PMU_ACTUAL_CLOCK:
>   		if (IS_VALLEYVIEW(i915) || IS_CHERRYVIEW(i915))
>   			/* Requires a mutex for sampling! */
>   			return -ENODEV;
>   		/* Fall-through. */
>   	case I915_PMU_REQUESTED_FREQUENCY:
> +	case I915_PMU_REQUESTED_CLOCK:
>   		if (INTEL_GEN(i915) < 6)
>   			return -ENODEV;
>   		break;
> @@ -526,6 +544,12 @@ static u64 __i915_pmu_event_read(struct perf_event *event)
>   			   div_u64(i915->pmu.sample[__I915_SAMPLE_FREQ_REQ].cur,
>   				   FREQUENCY);
>   			break;
> +		case I915_PMU_ACTUAL_CLOCK:
> +			val = i915->pmu.sample[__I915_SAMPLE_CLOCK_ACT].cur;
> +			break;
> +		case I915_PMU_REQUESTED_CLOCK:
> +			val = i915->pmu.sample[__I915_SAMPLE_CLOCK_REQ].cur;
> +			break;
>   		case I915_PMU_INTERRUPTS:
>   			val = count_interrupts(i915);
>   			break;
> @@ -803,6 +827,8 @@ create_event_attributes(struct drm_i915_private *i915)
>   		__event(I915_PMU_REQUESTED_FREQUENCY, "requested-frequency", "MHz"),
>   		__event(I915_PMU_INTERRUPTS, "interrupts", NULL),
>   		__event(I915_PMU_RC6_RESIDENCY, "rc6-residency", "ns"),
> +		__event(I915_PMU_ACTUAL_CLOCK, "actual-clock", "Mcycles"),
> +		__event(I915_PMU_REQUESTED_CLOCK, "requested-clock", "Mcycles"),
>   	};
>   	static const struct {
>   		enum drm_i915_pmu_engine_sample sample;
> diff --git a/drivers/gpu/drm/i915/i915_pmu.h b/drivers/gpu/drm/i915/i915_pmu.h
> index 2ba735299f7c..9c4252d85b5e 100644
> --- a/drivers/gpu/drm/i915/i915_pmu.h
> +++ b/drivers/gpu/drm/i915/i915_pmu.h
> @@ -17,6 +17,8 @@ struct drm_i915_private;
>   enum {
>   	__I915_SAMPLE_FREQ_ACT = 0,
>   	__I915_SAMPLE_FREQ_REQ,
> +	__I915_SAMPLE_CLOCK_ACT,
> +	__I915_SAMPLE_CLOCK_REQ,
>   	__I915_SAMPLE_RC6,
>   	__I915_SAMPLE_RC6_ESTIMATED,
>   	__I915_NUM_PMU_SAMPLERS
> @@ -52,6 +54,10 @@ struct i915_pmu {
>   	 * @timer: Timer for internal i915 PMU sampling.
>   	 */
>   	struct hrtimer timer;
> +	/**
> +	 * @timestamp: Timestamp of last internal i915 PMU sampling.
> +	 */
> +	ktime_t timestamp;
>   	/**
>   	 * @enable: Bitmask of all currently enabled events.
>   	 *
> diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
> index 7f5634ce8e88..61ab71986274 100644
> --- a/include/uapi/drm/i915_drm.h
> +++ b/include/uapi/drm/i915_drm.h
> @@ -139,6 +139,8 @@ enum drm_i915_pmu_engine_sample {
>   #define I915_PMU_REQUESTED_FREQUENCY	__I915_PMU_OTHER(1)
>   #define I915_PMU_INTERRUPTS		__I915_PMU_OTHER(2)
>   #define I915_PMU_RC6_RESIDENCY		__I915_PMU_OTHER(3)
> +#define I915_PMU_ACTUAL_CLOCK		__I915_PMU_OTHER(4)
> +#define I915_PMU_REQUESTED_CLOCK	__I915_PMU_OTHER(5)
>   
>   #define I915_PMU_LAST I915_PMU_RC6_RESIDENCY

Bump this one.

I want to know if we could get away without introducing a new pair of 
counter. For instance would running average of a period do for frequency 
readout? It depends on what kind of error we are facing. Or a moving 
average for some period? I would explore that but don't have an 
Ivybridge so could you have a look?

Regards,

Tvrtko


More information about the Intel-gfx mailing list