[Intel-gfx] [PATCH v2 3/3] drm/i915/vlv: WA for Turbo and RC6 to work together.

deepak.s at intel.com deepak.s at intel.com
Mon Jan 20 14:10:26 CET 2014


From: Deepak S <deepak.s at intel.com>

With RC6 enabled, BYT has an HW issue in determining the right
Gfx busyness.
WA for Turbo + RC6: Use SW based Gfx busy-ness detection to decide
on increasing/decreasing the freq. This logic will monitor C0
counters of render/media power-wells over EI period and takes
necessary action based on these values

v2: resolved conflict in i915_reg.h

Signed-off-by: Deepak S <deepak.s at intel.com>
---
 drivers/gpu/drm/i915/i915_drv.h |  13 ++++
 drivers/gpu/drm/i915/i915_irq.c | 151 ++++++++++++++++++++++++++++++++++++++--
 drivers/gpu/drm/i915/i915_reg.h |  19 +++++
 drivers/gpu/drm/i915/intel_pm.c |  54 ++++++++++----
 4 files changed, 220 insertions(+), 17 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index e89b9f4..1d76461 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -942,10 +942,23 @@ struct intel_gen6_power_mgmt {
 	u8 rp1_delay;
 	u8 rp0_delay;
 	u8 hw_max;
+	u8 hw_min;
 
 	bool rp_up_masked;
 	bool rp_down_masked;
 
+	u32 cz_freq;
+	u32 ei_interrupt_count;
+
+	u32 cz_ts_up_ei;
+	u32 render_up_EI_C0;
+	u32 media_up_EI_C0;
+	u32 cz_ts_down_ei;
+	u32 render_down_EI_C0;
+	u32 media_down_EI_C0;
+
+	bool use_RC0_residency_for_turbo;
+
 	int last_adj;
 	enum { LOW_POWER, BETWEEN, HIGH_POWER } power;
 
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index d0d87ed..f4a3660 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -965,6 +965,123 @@ static void notify_ring(struct drm_device *dev,
 	i915_queue_hangcheck(dev);
 }
 
+/**
+ * vlv_calc_delay_from_C0_counters - Increase/Decrease freq based on GPU
+ * busy-ness calculated from C0 counters of render & media power wells
+ * @dev_priv: DRM device private
+ *
+ */
+static u32 vlv_calc_delay_from_C0_counters(struct drm_i915_private *dev_priv)
+{
+	u32 cz_ts = 0;
+	u32 render_count = 0, media_count = 0;
+	u32 elapsed_render = 0, elapsed_media = 0;
+	u32 elapsed_time = 0;
+	u32 residency_C0_up = 0, residency_C0_down = 0;
+	u8 new_delay;
+
+	dev_priv->rps.ei_interrupt_count++;
+
+	WARN_ON(!mutex_is_locked(&dev_priv->rps.hw_lock));
+
+	cz_ts = vlv_punit_read(dev_priv, PUNIT_REG_CZ_TIMESTAMP);
+
+	render_count = I915_READ(VLV_RENDER_C0_COUNT_REG);
+	media_count = I915_READ(VLV_MEDIA_C0_COUNT_REG);
+
+	if (0 == dev_priv->rps.cz_ts_up_ei) {
+
+		dev_priv->rps.cz_ts_up_ei = dev_priv->rps.cz_ts_down_ei = cz_ts;
+		dev_priv->rps.render_up_EI_C0 = dev_priv->rps.render_down_EI_C0
+								= render_count;
+		dev_priv->rps.media_up_EI_C0 = dev_priv->rps.media_down_EI_C0
+								= media_count;
+
+		return dev_priv->rps.cur_delay;
+	}
+
+	elapsed_time = cz_ts - dev_priv->rps.cz_ts_up_ei;
+	dev_priv->rps.cz_ts_up_ei = cz_ts;
+
+	elapsed_render = render_count - dev_priv->rps.render_up_EI_C0;
+	dev_priv->rps.render_up_EI_C0 = render_count;
+
+	elapsed_media = media_count - dev_priv->rps.media_up_EI_C0;
+	dev_priv->rps.media_up_EI_C0 = media_count;
+
+	/* Convert all the counters into common unit of milli sec */
+	elapsed_time /= VLV_CZ_CLOCK_TO_MILLI_SEC;
+	elapsed_render /= (dev_priv->rps.cz_freq / 1000);
+	elapsed_media /= (dev_priv->rps.cz_freq / 1000);
+
+	/* Calculate overall C0 residency percentage only
+	* if elapsed time is non zero
+	*/
+	if (elapsed_time) {
+		residency_C0_up = ((max(elapsed_render, elapsed_media)
+							* 100) / elapsed_time);
+	}
+
+	/* To down throttle, C0 residency should be less than down threshold
+	* for continous EI intervals. So calculate down EI counters
+	* once in VLV_INT_COUNT_FOR_DOWN_EI
+	*/
+	if (VLV_INT_COUNT_FOR_DOWN_EI == dev_priv->rps.ei_interrupt_count) {
+
+		dev_priv->rps.ei_interrupt_count = 0;
+
+		elapsed_time = cz_ts - dev_priv->rps.cz_ts_down_ei;
+		dev_priv->rps.cz_ts_down_ei = cz_ts;
+
+		elapsed_render = render_count - dev_priv->rps.render_down_EI_C0;
+		dev_priv->rps.render_down_EI_C0 = render_count;
+
+		elapsed_media = media_count - dev_priv->rps.media_down_EI_C0;
+		dev_priv->rps.media_down_EI_C0 = media_count;
+
+		/* Convert all the counters into common unit of milli sec */
+		elapsed_time /= 100000;
+		elapsed_render /= (dev_priv->rps.cz_freq / 1000);
+		elapsed_media /= (dev_priv->rps.cz_freq / 1000);
+
+		/* Calculate overall C0 residency percentage only
+		* if elapsed time is non zero
+		*/
+		if (elapsed_time) {
+			residency_C0_down =
+				((max(elapsed_render, elapsed_media) * 100)
+					/ elapsed_time);
+		}
+
+	}
+
+	new_delay = dev_priv->rps.cur_delay;
+
+	/* C0 residency is greater than UP threshold. Increase Frequency */
+	if (residency_C0_up >= VLV_RP_UP_EI_THRESHOLD) {
+
+		if (dev_priv->rps.cur_delay < dev_priv->rps.max_delay)
+			new_delay = dev_priv->rps.cur_delay + 1;
+
+		/*
+		 * For better performance, jump directly
+		 * to RPe if we're below it.
+		 */
+		if (new_delay < dev_priv->rps.rpe_delay)
+			new_delay = dev_priv->rps.rpe_delay;
+
+	} else if (!dev_priv->rps.ei_interrupt_count &&
+			(residency_C0_down < VLV_RP_DOWN_EI_THRESHOLD)) {
+		/* This means, C0 residency is less than down threshold over
+		* a period of VLV_INT_COUNT_FOR_DOWN_EI. So, reduce the freq
+		*/
+		if (dev_priv->rps.cur_delay > dev_priv->rps.min_delay)
+			new_delay = dev_priv->rps.cur_delay - 1;
+	}
+
+	return new_delay;
+}
+
 static void gen6_pm_rps_work(struct work_struct *work)
 {
 	drm_i915_private_t *dev_priv = container_of(work, drm_i915_private_t,
@@ -976,15 +1093,18 @@ static void gen6_pm_rps_work(struct work_struct *work)
 	pm_iir = dev_priv->rps.pm_iir;
 	dev_priv->rps.pm_iir = 0;
 	/* Make sure not to corrupt PMIMR state used by ringbuffer code */
-	snb_enable_pm_irq(dev_priv, GEN6_PM_RPS_EVENTS);
+	if (dev_priv->rps.use_RC0_residency_for_turbo)
+		snb_enable_pm_irq(dev_priv, GEN6_PM_RP_UP_EI_EXPIRED);
+	else
+		snb_enable_pm_irq(dev_priv, GEN6_PM_RPS_EVENTS);
+
 	spin_unlock_irq(&dev_priv->irq_lock);
 
 	/* Make sure we didn't queue anything we're not going to process. */
-	WARN_ON(pm_iir & ~GEN6_PM_RPS_EVENTS);
+	WARN_ON(pm_iir & ~(GEN6_PM_RPS_EVENTS | GEN6_PM_RP_UP_EI_EXPIRED));
 
-	if ((pm_iir & GEN6_PM_RPS_EVENTS) == 0)
+	if ((pm_iir & (GEN6_PM_RPS_EVENTS | GEN6_PM_RP_UP_EI_EXPIRED)) == 0)
 		return;
-
 	mutex_lock(&dev_priv->rps.hw_lock);
 
 	adj = dev_priv->rps.last_adj;
@@ -1020,6 +1140,8 @@ static void gen6_pm_rps_work(struct work_struct *work)
 		else
 			new_delay = dev_priv->rps.min_delay;
 		adj = 0;
+	} else if (pm_iir & GEN6_PM_RP_UP_EI_EXPIRED) {
+		new_delay = vlv_calc_delay_from_C0_counters(dev_priv);
 	} else if (pm_iir & GEN6_PM_RP_DOWN_THRESHOLD) {
 		if (adj < 0)
 			adj *= 2;
@@ -1433,6 +1555,16 @@ static void gen6_rps_irq_handler(struct drm_i915_private *dev_priv, u32 pm_iir)
 		queue_work(dev_priv->wq, &dev_priv->rps.work);
 	}
 
+	if (pm_iir & GEN6_PM_RP_UP_EI_EXPIRED) {
+		spin_lock(&dev_priv->irq_lock);
+		dev_priv->rps.pm_iir |= pm_iir & GEN6_PM_RP_UP_EI_EXPIRED;
+		snb_disable_pm_irq(dev_priv, pm_iir & GEN6_PM_RP_UP_EI_EXPIRED);
+		spin_unlock(&dev_priv->irq_lock);
+		DRM_DEBUG_DRIVER("\nQueueing RPS Work - RC6 WA Turbo");
+
+		queue_work(dev_priv->wq, &dev_priv->rps.work);
+	}
+
 	if (HAS_VEBOX(dev_priv->dev)) {
 		if (pm_iir & PM_VEBOX_USER_INTERRUPT)
 			notify_ring(dev_priv->dev, &dev_priv->ring[VECS]);
@@ -1513,7 +1645,7 @@ static irqreturn_t valleyview_irq_handler(int irq, void *arg)
 		if (pipe_stats[0] & PIPE_GMBUS_INTERRUPT_STATUS)
 			gmbus_irq_handler(dev);
 
-		if (pm_iir)
+		if (pm_iir & (GEN6_PM_RPS_EVENTS | GEN6_PM_RP_UP_EI_EXPIRED))
 			gen6_rps_irq_handler(dev_priv, pm_iir);
 
 		I915_WRITE(GTIIR, gt_iir);
@@ -2829,6 +2961,15 @@ static void gen5_gt_irq_postinstall(struct drm_device *dev)
 			pm_irqs |= PM_VEBOX_USER_INTERRUPT;
 
 		dev_priv->pm_irq_mask = 0xffffffff;
+
+		if (dev_priv->rps.use_RC0_residency_for_turbo) {
+			dev_priv->pm_irq_mask &= ~GEN6_PM_RP_UP_EI_EXPIRED;
+			pm_irqs |= GEN6_PM_RP_UP_EI_EXPIRED;
+		} else {
+			dev_priv->pm_irq_mask &= ~GEN6_PM_RPS_EVENTS;
+			pm_irqs |= GEN6_PM_RPS_EVENTS;
+		}
+
 		I915_WRITE(GEN6_PMIIR, I915_READ(GEN6_PMIIR));
 		I915_WRITE(GEN6_PMIMR, dev_priv->pm_irq_mask);
 		I915_WRITE(GEN6_PMIER, pm_irqs);
diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h
index e1d5f31..fa083d4 100644
--- a/drivers/gpu/drm/i915/i915_reg.h
+++ b/drivers/gpu/drm/i915/i915_reg.h
@@ -392,6 +392,7 @@
 #define PUNIT_REG_GPU_FREQ_STS			0xd8
 #define   GENFREQSTATUS				(1<<0)
 #define PUNIT_REG_MEDIA_TURBO_FREQ_REQ		0xdc
+#define PUNIT_REG_CZ_TIMESTAMP			0xce
 
 #define PUNIT_FUSE_BUS2				0xf6 /* bits 47:40 */
 #define PUNIT_FUSE_BUS1				0xf5 /* bits 55:48 */
@@ -407,6 +408,15 @@
 #define   FB_FMAX_VMIN_FREQ_LO_SHIFT		27
 #define   FB_FMAX_VMIN_FREQ_LO_MASK		0xf8000000
 
+#define VLV_CZ_CLOCK_FREQ_DDR_MODE_800		200000000
+#define VLV_CZ_CLOCK_FREQ_DDR_MODE_1066	266666666
+#define VLV_CZ_CLOCK_FREQ_DDR_MODE_1333	333333333
+
+#define VLV_CZ_CLOCK_TO_MILLI_SEC		100000
+#define VLV_RP_UP_EI_THRESHOLD			90
+#define VLV_RP_DOWN_EI_THRESHOLD		70
+#define VLV_INT_COUNT_FOR_DOWN_EI		5
+
 /* vlv2 north clock has */
 #define CCK_FUSE_REG				0x8
 #define  CCK_FUSE_HPLL_FREQ_MASK		0x3
@@ -4824,6 +4834,7 @@
 #define  FORCEWAKE_ACK				0x130090
 #define  VLV_GTLC_WAKE_CTRL			0x130090
 #define  VLV_GTLC_PW_STATUS			0x130094
+#define VLV_GTLC_SURVIVABILITY_REG		0x130098
 #define VLV_GTLC_PW_RENDER_STATUS_MASK		0x80
 #define VLV_GTLC_PW_MEDIA_STATUS_MASK		0x20
 #define  FORCEWAKE_MT				0xa188 /* multi-threaded */
@@ -4833,6 +4844,11 @@
 #define  ECOBUS					0xa180
 #define    FORCEWAKE_MT_ENABLE			(1<<5)
 
+#define VLV_GFX_CLK_FORCE_ON_BIT                (1<<2)
+#define VLV_GFX_CLK_STATUS_BIT                  (1<<3)
+
+#define VLV_RC_COUNTER_CONTROL                  0xFFFF00FF
+
 #define  GTFIFODBG				0x120000
 #define    GT_FIFO_SBDROPERR			(1<<6)
 #define    GT_FIFO_BLOBDROPERR			(1<<5)
@@ -4948,6 +4964,9 @@
 #define VLV_GFX_CLK_STATUS_BIT			(1<<3)
 #define VLV_GFX_CLK_FORCE_ON_BIT		(1<<2)
 
+#define VLV_RENDER_C0_COUNT_REG		0x138118
+#define VLV_MEDIA_C0_COUNT_REG			0x13811C
+
 #define GEN6_GT_GFX_RC6_LOCKED			0x138104
 #define VLV_COUNTER_CONTROL			0x138104
 #define   VLV_COUNT_RANGE_HIGH			(1<<15)
diff --git a/drivers/gpu/drm/i915/intel_pm.c b/drivers/gpu/drm/i915/intel_pm.c
index 86d87e5..1fd1a00 100644
--- a/drivers/gpu/drm/i915/intel_pm.c
+++ b/drivers/gpu/drm/i915/intel_pm.c
@@ -3075,7 +3075,11 @@ static void vlv_set_rps_idle(struct drm_i915_private *dev_priv)
 				~VLV_GFX_CLK_FORCE_ON_BIT);
 
 	/* Unmask Turbo interrupts */
-	I915_WRITE(GEN6_PMINTRMSK, ~GEN6_PM_RPS_EVENTS);
+	if (dev_priv->rps.use_RC0_residency_for_turbo)
+		I915_WRITE(GEN6_PMINTRMSK, ~GEN6_PM_RP_UP_EI_EXPIRED);
+	else
+		I915_WRITE(GEN6_PMINTRMSK, ~GEN6_PM_RPS_EVENTS);
+
 }
 
 
@@ -3138,7 +3142,13 @@ static void gen6_disable_rps_interrupts(struct drm_device *dev)
 	struct drm_i915_private *dev_priv = dev->dev_private;
 
 	I915_WRITE(GEN6_PMINTRMSK, 0xffffffff);
-	I915_WRITE(GEN6_PMIER, I915_READ(GEN6_PMIER) & ~GEN6_PM_RPS_EVENTS);
+	if (dev_priv->rps.use_RC0_residency_for_turbo) {
+		I915_WRITE(GEN6_PMIER, I915_READ(GEN6_PMIER) &
+						~GEN6_PM_RP_UP_EI_EXPIRED);
+	} else {
+		I915_WRITE(GEN6_PMIER, I915_READ(GEN6_PMIER) &
+						~GEN6_PM_RPS_EVENTS);
+	}
 	/* Complete PM interrupt masking here doesn't race with the rps work
 	 * item again unmasking PM interrupts because that is using a different
 	 * register (PMIMR) to mask PM interrupts. The only risk is in leaving
@@ -3148,7 +3158,10 @@ static void gen6_disable_rps_interrupts(struct drm_device *dev)
 	dev_priv->rps.pm_iir = 0;
 	spin_unlock_irq(&dev_priv->irq_lock);
 
-	I915_WRITE(GEN6_PMIIR, GEN6_PM_RPS_EVENTS);
+	if (dev_priv->rps.use_RC0_residency_for_turbo)
+		I915_WRITE(GEN6_PMIIR, GEN6_PM_RP_UP_EI_EXPIRED);
+	else
+		I915_WRITE(GEN6_PMIIR, GEN6_PM_RPS_EVENTS);
 }
 
 static void gen6_disable_rps(struct drm_device *dev)
@@ -3218,19 +3231,29 @@ static void gen6_enable_rps_interrupts(struct drm_device *dev)
 	struct drm_i915_private *dev_priv = dev->dev_private;
 	u32 enabled_intrs;
 
+	/* Clear out any stale interrupts first */
 	spin_lock_irq(&dev_priv->irq_lock);
 	WARN_ON(dev_priv->rps.pm_iir);
-	snb_enable_pm_irq(dev_priv, GEN6_PM_RPS_EVENTS);
-	I915_WRITE(GEN6_PMIIR, GEN6_PM_RPS_EVENTS);
+	if (dev_priv->rps.use_RC0_residency_for_turbo) {
+		snb_enable_pm_irq(dev_priv, GEN6_PM_RP_UP_EI_EXPIRED);
+		I915_WRITE(GEN6_PMIIR, GEN6_PM_RP_UP_EI_EXPIRED);
+	} else {
+		snb_enable_pm_irq(dev_priv, GEN6_PM_RPS_EVENTS);
+		I915_WRITE(GEN6_PMIIR, GEN6_PM_RPS_EVENTS);
+	}
 	spin_unlock_irq(&dev_priv->irq_lock);
 
 	/* only unmask PM interrupts we need. Mask all others. */
-	enabled_intrs = GEN6_PM_RPS_EVENTS;
+	if (dev_priv->rps.use_RC0_residency_for_turbo)
+		enabled_intrs = GEN6_PM_RP_UP_EI_EXPIRED;
+	else
+		enabled_intrs = GEN6_PM_RPS_EVENTS;
 
 	/* IVB and SNB hard hangs on looping batchbuffer
 	 * if GEN6_PM_UP_EI_EXPIRED is masked.
 	 */
-	if (INTEL_INFO(dev)->gen <= 7 && !IS_HASWELL(dev))
+	if (INTEL_INFO(dev)->gen <= 7 && !IS_HASWELL(dev) &&
+			!dev_priv->rps.use_RC0_residency_for_turbo)
 		enabled_intrs |= GEN6_PM_RP_UP_EI_EXPIRED;
 
 	I915_WRITE(GEN6_PMINTRMSK, ~enabled_intrs);
@@ -3598,6 +3621,7 @@ static void valleyview_enable_rps(struct drm_device *dev)
 	I915_WRITE(GEN6_RP_DOWN_EI, 350000);
 
 	I915_WRITE(GEN6_RP_IDLE_HYSTERSIS, 10);
+	I915_WRITE(GEN6_RP_DOWN_TIMEOUT, 0xf4240);
 
 	I915_WRITE(GEN6_RP_CONTROL,
 		   GEN6_RP_MEDIA_TURBO |
@@ -3617,10 +3641,7 @@ static void valleyview_enable_rps(struct drm_device *dev)
 	I915_WRITE(GEN6_RC6_THRESHOLD, 0x557);
 
 	/* allows RC6 residency counter to work */
-	I915_WRITE(VLV_COUNTER_CONTROL,
-		   _MASKED_BIT_ENABLE(VLV_COUNT_RANGE_HIGH |
-				      VLV_MEDIA_RC6_COUNT_EN |
-				      VLV_RENDER_RC6_COUNT_EN));
+	I915_WRITE(VLV_COUNTER_CONTROL, VLV_RC_COUNTER_CONTROL);
 	if (intel_enable_rc6(dev) & INTEL_RC6_ENABLE)
 		rc6_mode = GEN7_RC_CTL_TO_MODE | VLV_RC_CTL_CTX_RST_PARALLEL;
 
@@ -3649,7 +3670,9 @@ static void valleyview_enable_rps(struct drm_device *dev)
 			 vlv_gpu_freq(dev_priv, dev_priv->rps.rpe_delay),
 			 dev_priv->rps.rpe_delay);
 
-	dev_priv->rps.min_delay = valleyview_rps_min_freq(dev_priv);
+	dev_priv->rps.hw_min = valleyview_rps_min_freq(dev_priv);
+
+	dev_priv->rps.min_delay = dev_priv->rps.hw_min;
 	DRM_DEBUG_DRIVER("min GPU freq: %d MHz (%u)\n",
 			 vlv_gpu_freq(dev_priv, dev_priv->rps.min_delay),
 			 dev_priv->rps.min_delay);
@@ -3663,6 +3686,9 @@ static void valleyview_enable_rps(struct drm_device *dev)
 
 	valleyview_set_rps(dev_priv->dev, dev_priv->rps.rpe_delay);
 
+	/* enable WA for RC6+turbo to work together */
+	dev_priv->rps.use_RC0_residency_for_turbo = true;
+
 	gen6_enable_rps_interrupts(dev);
 
 	gen6_gt_force_wake_put(dev_priv, FORCEWAKE_ALL);
@@ -4954,15 +4980,19 @@ static void valleyview_init_clock_gating(struct drm_device *dev)
 	switch ((val >> 6) & 3) {
 	case 0:
 		dev_priv->mem_freq = 800;
+		dev_priv->rps.cz_freq = VLV_CZ_CLOCK_FREQ_DDR_MODE_800;
 		break;
 	case 1:
 		dev_priv->mem_freq = 1066;
+		dev_priv->rps.cz_freq = VLV_CZ_CLOCK_FREQ_DDR_MODE_1066;
 		break;
 	case 2:
 		dev_priv->mem_freq = 1333;
+		dev_priv->rps.cz_freq = VLV_CZ_CLOCK_FREQ_DDR_MODE_1333;
 		break;
 	case 3:
 		dev_priv->mem_freq = 1333;
+		dev_priv->rps.cz_freq = VLV_CZ_CLOCK_FREQ_DDR_MODE_1333;
 		break;
 	}
 	DRM_DEBUG_DRIVER("DDR speed: %d MHz", dev_priv->mem_freq);
-- 
1.8.4.2




More information about the Intel-gfx mailing list