[Intel-gfx] [PATCH v3] drm/i915/bdw: BDW Software Turbo

Mon Aug 11 23:33:57 CEST 2014

On Mon, Aug 11, 2014 at 11:08:38AM -0700, Daisy Sun wrote:
> BDW supports GT C0 residency reporting in a constant time unit. The driver
> calculates GT utilization based on C0 residency and adjusts the RP
> frequency up/down accordingly. For offscreen workloads specifically,
> set the frequency to RP0.
> 
> An offscreen task is not restricted by frame rate; it can be
> executed as soon as possible. Transcoding and serialized workloads
> between CPU and GPU both need high GT performance, so RP0 is a good
> option in this case. RC6 will kick in to compensate for power
> consumption when GT is not active.
> 
> v2: Rebase on recent drm-intel-nightly
> v3: Add flip timeout monitor; when no flip is detected within
> 100ms, set frequency to RP0.

Ok, let's make this really clear:

If you wire this into the flip handling in any way, I will not merge your
patch. The timer should be fully independent and tie into the gpu
busy/idle handling we already have.

Thanks, Daniel

> 
> Signed-off-by: Daisy Sun <daisy.sun at intel.com>
> [torourke: rebased on latest and resolved conflict]
> Signed-off-by: Tom O'Rourke <Tom.O'Rourke at intel.com>
> ---
>  drivers/gpu/drm/i915/i915_drv.h      |  22 ++++
>  drivers/gpu/drm/i915/i915_irq.c      |  21 ++++
>  drivers/gpu/drm/i915/i915_reg.h      |   4 +
>  drivers/gpu/drm/i915/intel_display.c |   3 +
>  drivers/gpu/drm/i915/intel_pm.c      | 230 +++++++++++++++++++++++++++++------
>  5 files changed, 241 insertions(+), 39 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index ef38c3b..f1c4c5b 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -915,6 +915,23 @@ struct intel_rps_ei {
>  	u32 media_c0;
>  };
>  
> +struct intel_rps_bdw_cal {
> +	u32 it_threshold_pct; /* interrupt, in percentage */
> +	u32 eval_interval; /* evaluation interval, in us */
> +	u32 last_ts;
> +	u32 last_c0;
> +	bool is_up;
> +};
> +
> +struct intel_rps_bdw_turbo {
> +	struct intel_rps_bdw_cal up;
> +	struct intel_rps_bdw_cal down;
> +	struct timer_list flip_timer;
> +	u32 timeout;
> +	atomic_t flip_received;
> +	struct work_struct work_max_freq;
> +};
> +
>  struct intel_gen6_power_mgmt {
>  	/* work and pm_iir are protected by dev_priv->irq_lock */
>  	struct work_struct work;
> @@ -948,6 +965,9 @@ struct intel_gen6_power_mgmt {
>  	bool enabled;
>  	struct delayed_work delayed_resume_work;
>  
> +	bool is_bdw_sw_turbo;	/* Switch of BDW software turbo */
> +	struct intel_rps_bdw_turbo sw_turbo; /* Calculate RP interrupt timing */
> +
>  	/* manual wa residency calculations */
>  	struct intel_rps_ei up_ei, down_ei;
>  
> @@ -2703,6 +2723,8 @@ extern void intel_disable_fbc(struct drm_device *dev);
>  extern bool ironlake_set_drps(struct drm_device *dev, u8 val);
>  extern void intel_init_pch_refclk(struct drm_device *dev);
>  extern void gen6_set_rps(struct drm_device *dev, u8 val);
> +extern void bdw_software_turbo(struct drm_device *dev);
> +extern void gen8_flip_interrupt(struct drm_device *dev);
>  extern void valleyview_set_rps(struct drm_device *dev, u8 val);
>  extern void intel_set_memory_cxsr(struct drm_i915_private *dev_priv,
>  				  bool enable);
> diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
> index 6ef9d6f..367f8e1 100644
> --- a/drivers/gpu/drm/i915/i915_irq.c
> +++ b/drivers/gpu/drm/i915/i915_irq.c
> @@ -1961,6 +1961,27 @@ static void i9xx_pipe_crc_irq_handler(struct drm_device *dev, enum pipe pipe)
>  				     res1, res2);
>  }
>  
> +void gen8_flip_interrupt(struct drm_device *dev)
> +{
> +	struct drm_i915_private *dev_priv = dev->dev_private;
> +
> +	if (!dev_priv->rps.is_bdw_sw_turbo)
> +		return;
> +
> +	if(atomic_read(&dev_priv->rps.sw_turbo.flip_received)) {
> +		mod_timer(&dev_priv->rps.sw_turbo.flip_timer,
> +				usecs_to_jiffies(dev_priv->rps.sw_turbo.timeout) + jiffies);
> +	}
> +	else {
> +		dev_priv->rps.sw_turbo.flip_timer.expires =
> +				usecs_to_jiffies(dev_priv->rps.sw_turbo.timeout) + jiffies;
> +		add_timer(&dev_priv->rps.sw_turbo.flip_timer);
> +		atomic_set(&dev_priv->rps.sw_turbo.flip_received, true);
> +	}
> +
> +	bdw_software_turbo(dev);
> +}
> +
>  /* The RPS events need forcewake, so we add them to a work queue and mask their
>   * IMR bits until the work is done. Other interrupts can be processed without
>   * the work queue. */
> diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h
> index fe5c276..088e0e1 100644
> --- a/drivers/gpu/drm/i915/i915_reg.h
> +++ b/drivers/gpu/drm/i915/i915_reg.h
> @@ -5453,6 +5453,10 @@ enum punit_power_well {
>  #define GEN8_UCGCTL6				0x9430
>  #define   GEN8_SDEUNIT_CLOCK_GATE_DISABLE	(1<<14)
>  
> +#define TIMESTAMP_CTR		0x44070
> +#define FREQ_1_28_US(us)	(((us) * 100) >> 7)
> +#define MCHBAR_PCU_C0		(MCHBAR_MIRROR_BASE_SNB + 0x5960)
> +
>  #define GEN6_GFXPAUSE				0xA000
>  #define GEN6_RPNSWREQ				0xA008
>  #define   GEN6_TURBO_DISABLE			(1<<31)
> diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c
> index 99eb7ca..1dd8a7c 100644
> --- a/drivers/gpu/drm/i915/intel_display.c
> +++ b/drivers/gpu/drm/i915/intel_display.c
> @@ -9661,6 +9661,9 @@ static int intel_crtc_page_flip(struct drm_crtc *crtc,
>  	unsigned long flags;
>  	int ret;
>  
> +	//trigger software GT busyness calculation
> +	gen8_flip_interrupt(dev);
> +
>  	/*
>  	 * drm_mode_page_flip_ioctl() should already catch this, but double
>  	 * check to be safe.  In the future we may enable pageflipping from
> diff --git a/drivers/gpu/drm/i915/intel_pm.c b/drivers/gpu/drm/i915/intel_pm.c
> index 3f88f29..e13d0ff 100644
> --- a/drivers/gpu/drm/i915/intel_pm.c
> +++ b/drivers/gpu/drm/i915/intel_pm.c
> @@ -2122,7 +2122,6 @@ int ilk_wm_max_level(const struct drm_device *dev)
>  	else
>  		return 2;
>  }
> -
>  static void intel_print_wm_latency(struct drm_device *dev,
>  				   const char *name,
>  				   const uint16_t wm[5])
> @@ -3091,6 +3090,9 @@ static void gen6_set_rps_thresholds(struct drm_i915_private *dev_priv, u8 val)
>  {
>  	int new_power;
>  
> +	if (dev_priv->rps.is_bdw_sw_turbo)
> +		return;
> +
>  	new_power = dev_priv->rps.power;
>  	switch (dev_priv->rps.power) {
>  	case LOW_POWER:
> @@ -3298,8 +3300,11 @@ void gen6_rps_idle(struct drm_i915_private *dev_priv)
>  			valleyview_set_rps(dev_priv->dev, dev_priv->rps.min_freq_softlimit);
>  		else if (IS_VALLEYVIEW(dev))
>  			vlv_set_rps_idle(dev_priv);
> -		else
> +		else if (!dev_priv->rps.is_bdw_sw_turbo
> +					|| atomic_read(&dev_priv->rps.sw_turbo.flip_received)){
>  			gen6_set_rps(dev_priv->dev, dev_priv->rps.min_freq_softlimit);
> +		}
> +
>  		dev_priv->rps.last_adj = 0;
>  	}
>  	mutex_unlock(&dev_priv->rps.hw_lock);
> @@ -3313,8 +3318,11 @@ void gen6_rps_boost(struct drm_i915_private *dev_priv)
>  	if (dev_priv->rps.enabled) {
>  		if (IS_VALLEYVIEW(dev))
>  			valleyview_set_rps(dev_priv->dev, dev_priv->rps.max_freq_softlimit);
> -		else
> +		else if (!dev_priv->rps.is_bdw_sw_turbo
> +					|| atomic_read(&dev_priv->rps.sw_turbo.flip_received)){
>  			gen6_set_rps(dev_priv->dev, dev_priv->rps.max_freq_softlimit);
> +		}
> +
>  		dev_priv->rps.last_adj = 0;
>  	}
>  	mutex_unlock(&dev_priv->rps.hw_lock);
> @@ -3345,21 +3353,26 @@ void valleyview_set_rps(struct drm_device *dev, u8 val)
>  static void gen8_disable_rps_interrupts(struct drm_device *dev)
>  {
>  	struct drm_i915_private *dev_priv = dev->dev_private;
> +	if (IS_BROADWELL(dev) && dev_priv->rps.is_bdw_sw_turbo){
> +		if (atomic_read(&dev_priv->rps.sw_turbo.flip_received))
> +			del_timer(&dev_priv->rps.sw_turbo.flip_timer);
> +		dev_priv-> rps.is_bdw_sw_turbo = false;
> +	} else {
> +		I915_WRITE(GEN6_PMINTRMSK, ~GEN8_PMINTR_REDIRECT_TO_NON_DISP);
> +		I915_WRITE(GEN8_GT_IER(2), I915_READ(GEN8_GT_IER(2)) &
> +					   ~dev_priv->pm_rps_events);
> +		/* Complete PM interrupt masking here doesn't race with the rps work
> +		 * item again unmasking PM interrupts because that is using a different
> +		 * register (GEN8_GT_IMR(2)) to mask PM interrupts. The only risk is in
> +		 * leaving stale bits in GEN8_GT_IIR(2) and GEN8_GT_IMR(2) which
> +		 * gen8_enable_rps will clean up. */
>  
> -	I915_WRITE(GEN6_PMINTRMSK, ~GEN8_PMINTR_REDIRECT_TO_NON_DISP);
> -	I915_WRITE(GEN8_GT_IER(2), I915_READ(GEN8_GT_IER(2)) &
> -				   ~dev_priv->pm_rps_events);
> -	/* Complete PM interrupt masking here doesn't race with the rps work
> -	 * item again unmasking PM interrupts because that is using a different
> -	 * register (GEN8_GT_IMR(2)) to mask PM interrupts. The only risk is in
> -	 * leaving stale bits in GEN8_GT_IIR(2) and GEN8_GT_IMR(2) which
> -	 * gen8_enable_rps will clean up. */
> -
> -	spin_lock_irq(&dev_priv->irq_lock);
> -	dev_priv->rps.pm_iir = 0;
> -	spin_unlock_irq(&dev_priv->irq_lock);
> +		spin_lock_irq(&dev_priv->irq_lock);
> +		dev_priv->rps.pm_iir = 0;
> +		spin_unlock_irq(&dev_priv->irq_lock);
>  
> -	I915_WRITE(GEN8_GT_IIR(2), dev_priv->pm_rps_events);
> +		I915_WRITE(GEN8_GT_IIR(2), dev_priv->pm_rps_events);
> +	}
>  }
>  
>  static void gen6_disable_rps_interrupts(struct drm_device *dev)
> @@ -3511,13 +3524,111 @@ static void parse_rp_state_cap(struct drm_i915_private *dev_priv, u32 rp_state_c
>  		dev_priv->rps.min_freq_softlimit = dev_priv->rps.min_freq;
>  }
>  
> +static void bdw_sw_calculate_freq(struct drm_device *dev,
> +		struct intel_rps_bdw_cal *c, u32 *cur_time, u32 *c0)
> +{
> +	struct drm_i915_private *dev_priv = dev->dev_private;
> +	u64 busy = 0;
> +	u32 busyness_pct = 0;
> +	u32 elapsed_time = 0;
> +	u16 new_freq = 0;
> +
> +	if (!c || !cur_time || !c0)
> +		return;
> +
> +	if (0 == c->last_c0)
> +		goto out;
> +
> +	/* Check Evaluation interval */
> +	elapsed_time = *cur_time - c->last_ts;
> +	if (elapsed_time < c->eval_interval)
> +		return;
> +
> +	mutex_lock(&dev_priv->rps.hw_lock);
> +
> +	/*
> +	 * c0 unit in 32*1.28 usec, elapsed_time unit in 1 usec.
> +	 * Whole busyness_pct calculation should be
> +	 *     busy = ((u64)(*c0 - c->last_c0) << 5 << 7) / 100;
> +	 *     busyness_pct = (u32)(busy * 100 / elapsed_time);
> +	 * The final formula is to simplify CPU calculation
> +	 */
> +	busy = (u64)(*c0 - c->last_c0) << 12;
> +	do_div(busy, elapsed_time);
> +	busyness_pct = (u32)busy;
> +
> +	if (c->is_up && busyness_pct >= c->it_threshold_pct)
> +		new_freq = (u16)dev_priv->rps.cur_freq + 3;
> +	if (!c->is_up && busyness_pct <= c->it_threshold_pct)
> +		new_freq = (u16)dev_priv->rps.cur_freq - 1;
> +
> +	/* Adjust to new frequency busyness and compare with threshold */
> +	if (0 != new_freq) {
> +		if (new_freq > dev_priv->rps.max_freq_softlimit)
> +			new_freq = dev_priv->rps.max_freq_softlimit;
> +		else if (new_freq < dev_priv->rps.min_freq_softlimit)
> +			new_freq = dev_priv->rps.min_freq_softlimit;
> +
> +		gen6_set_rps(dev, new_freq);
> +	}
> +
> +	mutex_unlock(&dev_priv->rps.hw_lock);
> +
> +out:
> +	c->last_c0 = *c0;
> +	c->last_ts = *cur_time;
> +}
> +
> +static void gen8_set_frequency_RP0(struct work_struct *work)
> +{
> +	struct intel_rps_bdw_turbo *p_bdw_turbo =
> +			container_of(work, struct intel_rps_bdw_turbo, work_max_freq);
> +	struct intel_gen6_power_mgmt *p_power_mgmt =
> +			container_of(p_bdw_turbo, struct intel_gen6_power_mgmt, sw_turbo);
> +	struct drm_i915_private *dev_priv =
> +			container_of(p_power_mgmt, struct drm_i915_private, rps);
> +
> +	mutex_lock(&dev_priv->rps.hw_lock);
> +	gen6_set_rps(dev_priv->dev, dev_priv->rps.rp0_freq);
> +	mutex_unlock(&dev_priv->rps.hw_lock);
> +}
> +
> +static void flip_active_timeout_handler(unsigned long var)
> +{
> +	struct drm_i915_private *dev_priv = (struct drm_i915_private *) var;
> +
> +	del_timer(&dev_priv->rps.sw_turbo.flip_timer);
> +	atomic_set(&dev_priv->rps.sw_turbo.flip_received, false);
> +
> +	queue_work(dev_priv->wq, &dev_priv->rps.sw_turbo.work_max_freq);
> +}
> +
> +void bdw_software_turbo(struct drm_device *dev)
> +{
> +	struct drm_i915_private *dev_priv = dev->dev_private;
> +
> +	u32 current_time = I915_READ(TIMESTAMP_CTR); /* unit in usec */
> +	u32 current_c0 = I915_READ(MCHBAR_PCU_C0); /* unit in 32*1.28 usec */
> +
> +	bdw_sw_calculate_freq(dev, &dev_priv->rps.sw_turbo.up,
> +			&current_time, &current_c0);
> +	bdw_sw_calculate_freq(dev, &dev_priv->rps.sw_turbo.down,
> +			&current_time, &current_c0);
> +}
> +
>  static void gen8_enable_rps(struct drm_device *dev)
>  {
>  	struct drm_i915_private *dev_priv = dev->dev_private;
>  	struct intel_engine_cs *ring;
>  	uint32_t rc6_mask = 0, rp_state_cap;
> +	uint32_t threshold_up_pct, threshold_down_pct;
> +	uint32_t ei_up, ei_down; /* up and down evaluation interval */
> +	u32 rp_ctl_flag;
>  	int unused;
>  
> +	/* Use software Turbo for BDW */
> +	dev_priv->rps.is_bdw_sw_turbo = IS_BROADWELL(dev);
> +
>  	/* 1a: Software RC state - RC0 */
>  	I915_WRITE(GEN6_RC_STATE, 0);
>  
> @@ -3561,35 +3672,74 @@ static void gen8_enable_rps(struct drm_device *dev)
>  		   HSW_FREQUENCY(dev_priv->rps.rp1_freq));
>  	I915_WRITE(GEN6_RC_VIDEO_FREQ,
>  		   HSW_FREQUENCY(dev_priv->rps.rp1_freq));
> -	/* NB: Docs say 1s, and 1000000 - which aren't equivalent */
> -	I915_WRITE(GEN6_RP_DOWN_TIMEOUT, 100000000 / 128); /* 1 second timeout */
> +	ei_up = 84480; /* 84.48ms */
> +	ei_down = 448000;
> +	threshold_up_pct = 90; /* x percent busy */
> +	threshold_down_pct = 70;
> +
> +	if (dev_priv->rps.is_bdw_sw_turbo) {
> +		dev_priv->rps.sw_turbo.up.it_threshold_pct = threshold_up_pct;
> +		dev_priv->rps.sw_turbo.up.eval_interval = ei_up;
> +		dev_priv->rps.sw_turbo.up.is_up = true;
> +		dev_priv->rps.sw_turbo.up.last_ts = 0;
> +		dev_priv->rps.sw_turbo.up.last_c0 = 0;
> +
> +		dev_priv->rps.sw_turbo.down.it_threshold_pct = threshold_down_pct;
> +		dev_priv->rps.sw_turbo.down.eval_interval = ei_down;
> +		dev_priv->rps.sw_turbo.down.is_up = false;
> +		dev_priv->rps.sw_turbo.down.last_ts = 0;
> +		dev_priv->rps.sw_turbo.down.last_c0 = 0;
> +
> +		/* Start the timer to track if flip comes*/
> +		dev_priv->rps.sw_turbo.timeout = 200*1000; /* in us */
> +
> +		init_timer(&dev_priv->rps.sw_turbo.flip_timer);
> +		dev_priv->rps.sw_turbo.flip_timer.function = flip_active_timeout_handler;
> +		dev_priv->rps.sw_turbo.flip_timer.data  = (unsigned long) dev_priv;
> +		dev_priv->rps.sw_turbo.flip_timer.expires =
> +			usecs_to_jiffies(dev_priv->rps.sw_turbo.timeout) + jiffies;
> +		add_timer(&dev_priv->rps.sw_turbo.flip_timer);
> +		INIT_WORK(&dev_priv->rps.sw_turbo.work_max_freq, gen8_set_frequency_RP0);
> +
> +		atomic_set(&dev_priv->rps.sw_turbo.flip_received, true);
> +	} else {
> +		/* NB: Docs say 1s, and 1000000 - which aren't equivalent
> +		 * 1 second timeout*/
> +		I915_WRITE(GEN6_RP_DOWN_TIMEOUT, FREQ_1_28_US(1000000));
>  
> -	/* Docs recommend 900MHz, and 300 MHz respectively */
> -	I915_WRITE(GEN6_RP_INTERRUPT_LIMITS,
> -		   dev_priv->rps.max_freq_softlimit << 24 |
> -		   dev_priv->rps.min_freq_softlimit << 16);
> +		/* Docs recommend 900MHz, and 300 MHz respectively */
> +		I915_WRITE(GEN6_RP_INTERRUPT_LIMITS,
> +			   dev_priv->rps.max_freq_softlimit << 24 |
> +			   dev_priv->rps.min_freq_softlimit << 16);
>  
> -	I915_WRITE(GEN6_RP_UP_THRESHOLD, 7600000 / 128); /* 76ms busyness per EI, 90% */
> -	I915_WRITE(GEN6_RP_DOWN_THRESHOLD, 31300000 / 128); /* 313ms busyness per EI, 70%*/
> -	I915_WRITE(GEN6_RP_UP_EI, 66000); /* 84.48ms, XXX: random? */
> -	I915_WRITE(GEN6_RP_DOWN_EI, 350000); /* 448ms, XXX: random? */
> +		I915_WRITE(GEN6_RP_UP_THRESHOLD,
> +			FREQ_1_28_US(ei_up * threshold_up_pct / 100));
> +		I915_WRITE(GEN6_RP_DOWN_THRESHOLD,
> +			FREQ_1_28_US(ei_down * threshold_down_pct / 100));
> +		I915_WRITE(GEN6_RP_UP_EI,
> +			FREQ_1_28_US(ei_up));
> +		I915_WRITE(GEN6_RP_DOWN_EI,
> +			FREQ_1_28_US(ei_down));
>  
> -	I915_WRITE(GEN6_RP_IDLE_HYSTERSIS, 10);
> +		I915_WRITE(GEN6_RP_IDLE_HYSTERSIS, 10);
> +	}
>  
>  	/* 5: Enable RPS */
> -	I915_WRITE(GEN6_RP_CONTROL,
> -		   GEN6_RP_MEDIA_TURBO |
> -		   GEN6_RP_MEDIA_HW_NORMAL_MODE |
> -		   GEN6_RP_MEDIA_IS_GFX |
> -		   GEN6_RP_ENABLE |
> -		   GEN6_RP_UP_BUSY_AVG |
> -		   GEN6_RP_DOWN_IDLE_AVG);
> -
> -	/* 6: Ring frequency + overclocking (our driver does this later */
> -
> +	rp_ctl_flag = GEN6_RP_MEDIA_TURBO |
> +					GEN6_RP_MEDIA_HW_NORMAL_MODE |
> +					GEN6_RP_MEDIA_IS_GFX |
> +					GEN6_RP_UP_BUSY_AVG |
> +					GEN6_RP_DOWN_IDLE_AVG;
> +	if (!dev_priv->rps.is_bdw_sw_turbo)
> +		rp_ctl_flag |= GEN6_RP_ENABLE;
> +
> +	I915_WRITE(GEN6_RP_CONTROL, rp_ctl_flag);
> +
> +	/* 6: Ring frequency + overclocking
> +	 * (our driver does this later */
>  	gen6_set_rps(dev, (I915_READ(GEN6_GT_PERF_STATUS) & 0xff00) >> 8);
> -
> -	gen8_enable_rps_interrupts(dev);
> +	if (!dev_priv->rps.is_bdw_sw_turbo)
> +		gen8_enable_rps_interrupts(dev);
>  
>  	gen6_gt_force_wake_put(dev_priv, FORCEWAKE_ALL);
>  }
> @@ -5018,6 +5168,8 @@ static void intel_gen6_powersave_work(struct work_struct *work)
>  			     rps.delayed_resume_work.work);
>  	struct drm_device *dev = dev_priv->dev;
>  
> +	dev_priv->rps.is_bdw_sw_turbo = false;
> +
>  	mutex_lock(&dev_priv->rps.hw_lock);
>  
>  	if (IS_CHERRYVIEW(dev)) {
> -- 
> 1.9.1
> 
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/intel-gfx

-- 
Daniel Vetter
Software Engineer, Intel Corporation
+41 (0) 79 365 57 48 - http://blog.ffwll.ch