[Intel-gfx] [PATCH 2/2] drm/i915: Make sure we have enough memory bandwidth on ICL

Maarten Lankhorst maarten.lankhorst at linux.intel.com
Wed Mar 27 14:12:21 UTC 2019


Op 20-03-2019 om 22:46 schreef Ville Syrjala:
> From: Ville Syrjälä <ville.syrjala at linux.intel.com>
>
> ICL has so many planes that it can easily exceed the maximum
> effective memory bandwidth of the system. We must therefore check
> that we don't exceed that limit.
>
> The algorithm is very magic number heavy and lacks sufficient
> explanation for now. We also have no sane way to query the
> memory clock and timings, so we must rely on a combination of
> raw readout from the memory controller and hardcoded assumptions.
> The memory controller values obviously change as the system
> jumps between the different SAGV points, so we try to stabilize
> it first by disabling SAGV for the duration of the readout.
>
> The utilized bandwidth is tracked via a device wide atomic
> private object. That is actually not robust because we can't
> afford to enforce strict global ordering between the pipes.
> Thus I think I'll need to change this to simply chop up the
> available bandwidth between all the active pipes. Each pipe
> can then do whatever it wants as long as it doesn't exceed
> its budget. That scheme will also require that we assume that
> any number of planes could be active at any time.
>
> TODO: make it robust and deal with all the open questions
>
> Signed-off-by: Ville Syrjälä <ville.syrjala at linux.intel.com>
> ---
>  drivers/gpu/drm/i915/Makefile             |   1 +
>  drivers/gpu/drm/i915/i915_drv.c           | 346 ++++++++++++++++++++++
>  drivers/gpu/drm/i915/i915_drv.h           |  10 +
>  drivers/gpu/drm/i915/intel_atomic_plane.c |  20 ++
>  drivers/gpu/drm/i915/intel_bw.c           | 190 ++++++++++++
>  drivers/gpu/drm/i915/intel_display.c      |  39 ++-
>  drivers/gpu/drm/i915/intel_drv.h          |  32 ++
>  7 files changed, 637 insertions(+), 1 deletion(-)
>  create mode 100644 drivers/gpu/drm/i915/intel_bw.c
>
> diff --git a/drivers/gpu/drm/i915/Makefile b/drivers/gpu/drm/i915/Makefile
> index 68fecf355471..2d24bdd501c4 100644
> --- a/drivers/gpu/drm/i915/Makefile
> +++ b/drivers/gpu/drm/i915/Makefile
> @@ -126,6 +126,7 @@ i915-y += intel_audio.o \
>  	  intel_atomic.o \
>  	  intel_atomic_plane.o \
>  	  intel_bios.o \
> +	  intel_bw.o \
>  	  intel_cdclk.o \
>  	  intel_color.o \
>  	  intel_combo_phy.o \
> diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
> index 8b37ec0e0676..134d1b1a93f1 100644
> --- a/drivers/gpu/drm/i915/i915_drv.c
> +++ b/drivers/gpu/drm/i915/i915_drv.c
> @@ -1451,6 +1451,348 @@ bxt_get_dram_info(struct drm_i915_private *dev_priv)
>  	return 0;
>  }
>  
> +#define SA_PERF_STATUS_0_0_0_MCHBAR_PC _MMIO(MCHBAR_MIRROR_BASE_SNB + 0x5918)
> +#define  SKL_QCLK_RATIO_MASK (0x7f << 0)
> +#define  SKL_QCLK_RATIO_SHIT 0
> +#define  SKL_QCLK_REFERENCE (1 << 7)
> +#define  CNL_QCLK_RATIO_MASK (0x7f << 2)
> +#define  CNL_QCLK_RATIO_SHIT 2
> +#define  CNL_QCLK_REFERENCE (1 << 9)
> +#define  ICL_QCLK_RATIO_MASK (0xff << 2)
> +#define  ICL_QCLK_RATIO_SHIT 2
> +#define  ICL_QCLK_REFERENCE (1 << 10)
> +
> +#define MCHBAR_CH0_CR_TC_PRE_0_0_0_MCHBAR _MMIO(MCHBAR_MIRROR_BASE_SNB + 0x4000)
> +#define MCHBAR_CH1_CR_TC_PRE_0_0_0_MCHBAR _MMIO(MCHBAR_MIRROR_BASE_SNB + 0x4400)
> +#define  SKL_DRAM_T_WRPRE_MASK (0x7f << 24)
> +#define  SKL_DRAM_T_WRPRE_SHIFT 24
> +#define  SKL_DRAM_T_RDPRE_MASK (0xf << 16)
> +#define  SKL_DRAM_T_RDPRE_SHIFT 16
> +#define  SKL_DRAM_T_RAS_MASK (0x7f << 8)
> +#define  SKL_DRAM_T_RAS_SHIFT 8
> +#define  SKL_DRAM_T_RPAB_EXT_MASK (0x3 << 6)
> +#define  SKL_DRAM_T_RPAB_EXT_SHIFT 6
> +#define  SKL_DRAM_T_RP_MASK (0x3f << 0)
> +#define  SKL_DRAM_T_RP_SHIFT 0
> +#define  CNL_DRAM_T_WRPRE_MASK (0xff << 24)
> +#define  CNL_DRAM_T_WRPRE_SHIFT 24
> +#define  CNL_DRAM_T_PPD_MASK (0x7 << 21)
> +#define  CNL_DRAM_T_PPD_SHIFT 21
> +#define  CNL_DRAM_T_RDPRE_MASK (0x1f << 16)
> +#define  CNL_DRAM_T_RDPRE_SHIFT 16
> +#define  CNL_DRAM_T_RAS_MASK (0x7f << 9)
> +#define  CNL_DRAM_T_RAS_SHIFT 9
> +#define  CNL_DRAM_T_RPAB_EXT_MASK (0x7 << 6)
> +#define  CNL_DRAM_T_RPAB_EXT_SHIFT 6
> +#define  CNL_DRAM_T_RP_MASK (0x3f << 0)
> +#define  CNL_DRAM_T_RP_SHIFT 0
> +
> +struct intel_dram_timings {
> +	u8 t_rp, t_rdpre, t_ras, t_bl;
> +};
> +
> +static int icl_get_dclk(struct drm_i915_private *dev_priv)
> +{
> +	int ratio, ref;
> +	u32 val;
> +
> +	val = I915_READ(SA_PERF_STATUS_0_0_0_MCHBAR_PC);
> +
> +	DRM_DEBUG_KMS("SA_PERF = 0x%x\n", val);
> +	DRM_DEBUG_KMS("BIOS_DATA = 0x%x\n",
> +		      I915_READ(SKL_MC_BIOS_DATA_0_0_0_MCHBAR_PCU));
> +
> +	ratio = (val & ICL_QCLK_RATIO_MASK) >> ICL_QCLK_RATIO_SHIT;
> +
> +	if (val & ICL_QCLK_REFERENCE)
> +		ref = 6; /* 6 * 16.666 MHz = 100 MHz */
> +	else
> +		ref = 8; /* 8 * 16.666 MHz = 133 MHz */
> +
> +	return ratio * ref;
> +}
> +
> +#if 0
> +static void skl_get_dram_ch_timings(struct intel_dram_timings *t,
> +				    int channel, enum intel_dram_type type,
> +				    u32 val)
> +{
> +	t->t_rp = (val & SKL_DRAM_T_RP_MASK) >> SKL_DRAM_T_RP_SHIFT;
> +	t->t_rdpre = (val & SKL_DRAM_T_RDPRE_MASK) >> SKL_DRAM_T_RDPRE_SHIFT;
> +	t->t_ras = (val & SKL_DRAM_T_RAS_MASK) >> SKL_DRAM_T_RAS_SHIFT;
> +	t->t_bl = type == INTEL_DRAM_DDR4 ? 4 : 8;
> +
> +	DRM_DEBUG_KMS("CH%d tRP=%d tRDPRE=%d tRAS=%d tBL=%d\n",
> +		      channel, t->t_rp, t->t_rdpre, t->t_ras, t->t_bl);
> +}
> +
> +static void skl_get_dram_timings(struct drm_i915_private *dev_priv,
> +				 const struct dram_info *dram,
> +				 struct intel_dram_timings *t)
> +{
> +	if (dram->channels & BIT(0)) {
> +		u32 val = I915_READ(MCHBAR_CH0_CR_TC_PRE_0_0_0_MCHBAR);
> +
> +		skl_get_dram_ch_timings(t, 0, dram->type, val);
> +	} else if (dram->channels & BIT(1)) {
> +		u32 val = I915_READ(MCHBAR_CH1_CR_TC_PRE_0_0_0_MCHBAR);
> +
> +		skl_get_dram_ch_timings(t, 1, dram->type, val);
> +	}
> +}
> +#endif
> +
> +static void cnl_get_dram_ch_timings(struct intel_dram_timings *t,
> +				    int channel, enum intel_dram_type type,
> +				    u32 val)
> +{
> +	t->t_rp = (val & CNL_DRAM_T_RP_MASK) >> CNL_DRAM_T_RP_SHIFT;
> +	t->t_rdpre = (val & CNL_DRAM_T_RDPRE_MASK) >> CNL_DRAM_T_RDPRE_SHIFT;
> +	t->t_ras = (val & CNL_DRAM_T_RAS_MASK) >> CNL_DRAM_T_RAS_SHIFT;
> +	t->t_bl = type == INTEL_DRAM_DDR4 ? 4 : 8;
> +
> +	DRM_DEBUG_KMS("CH%d tRP=%d tRDPRE=%d tRAS=%d tBL=%d\n",
> +		      channel, t->t_rp, t->t_rdpre, t->t_ras, t->t_bl);
> +}
> +
> +static void cnl_get_dram_timings(struct drm_i915_private *dev_priv,
> +				 const struct dram_info *dram,
> +				 struct intel_dram_timings *t)
> +{
> +	u32 val;
> +
> +	if (dram->channels & BIT(0)) {
> +		val = I915_READ(MCHBAR_CH0_CR_TC_PRE_0_0_0_MCHBAR);
> +		cnl_get_dram_ch_timings(t, 0, dram->type, val);
> +	} else if (dram->channels & BIT(1)) {
> +		val = I915_READ(MCHBAR_CH1_CR_TC_PRE_0_0_0_MCHBAR);
> +		cnl_get_dram_ch_timings(t, 1, dram->type, val);
> +	}
> +}
> +
> +struct intel_sagv_point {
> +	u16 dclk, t_rp, t_rdpre, t_rc, t_ras, t_rcd;
> +};
> +
> +struct intel_sagv_info {
> +	struct intel_sagv_point points[3];
> +	int num_points;
> +};
> +
> +static void icl_get_sagv_points(struct drm_i915_private *dev_priv,
> +				struct intel_sagv_info *si,
> +				const struct intel_dram_timings *t)
> +{
> +	int dclk, i;
> +
> +	dclk = icl_get_dclk(dev_priv);
> +
> +	si->num_points = 3;
> +
> +	/*
> +	 * ICL Hardcoded
> +	 * Name  Description         MC clock(MHz)     DDR data rate(MT / s)  Gear
> +	 * Low   Min voltage point   1066              2133                   2
> +	 * Med   Max DDR rate point  Max DDR freq / 2  Max DDR freq           2
> +	 * High  Min latency point   2667              Same as MC clock       1
> +	 */
> +	si->points[0].dclk = min(64, dclk);
> +	si->points[1].dclk = dclk;
> +	si->points[2].dclk = min(80, dclk);
> +
> +	for (i = 0; i < si->num_points; i++) {
> +		struct intel_sagv_point *sp = &si->points[i];
> +
> +		/*
> +		 * We assume these scale linearly.
> +		 * Seems to match observed behaviour.
> +		 */
> +		sp->t_rp = DIV_ROUND_UP(t->t_rp * sp->dclk, dclk);
> +		sp->t_rdpre = DIV_ROUND_UP(t->t_rdpre * sp->dclk, dclk);
> +		sp->t_ras = DIV_ROUND_UP(t->t_ras * sp->dclk, dclk);
> +
> +		sp->t_rcd = sp->t_rp;
> +		sp->t_rc = sp->t_rp + sp->t_ras;
> +
> +		DRM_DEBUG_KMS("SAGV %d DCLK=%d tRP=%d tRDPRE=%d tRAS=%d tRCD=%d tRC=%d\n",
> +			      i, sp->dclk, sp->t_rp, sp->t_rdpre, sp->t_ras,
> +			      sp->t_rcd, sp->t_rc);
> +	}
> +}
> +
> +static int icl_calc_bw(int dclk, int num, int den)
> +{
> +	/* multiples of 2 x 16.666MHz (100/6) */
> +	return DIV_ROUND_CLOSEST(num * dclk * 2 * 100, den * 6);
> +}
> +
> +static int icl_sagv_max_dclk(const struct intel_sagv_info *si)
> +{
> +	u16 dclk = 0;
> +	int i;
> +
> +	for (i = 0; i < si->num_points; i++)
> +		dclk = max(dclk, si->points[i].dclk);
> +
> +	return dclk;
> +}
> +
> +struct intel_sa_info {
> +	u8 deburst, mpagesize, deprogbwlimit, displayrtids;
> +};
> +
> +static const struct intel_sa_info icl_sa_info = {
> +	.deburst = 8,
> +	.mpagesize = 16,
> +	.deprogbwlimit = 25, /* GB/s */
> +	.displayrtids = 128,
> +};
> +
> +static void icl_get_bw_info(struct drm_i915_private *dev_priv)
> +{
> +	const struct dram_info *dram = &dev_priv->dram_info;
> +	struct intel_sagv_info si = {};
> +	struct intel_dram_timings t = {};
> +	const struct intel_sa_info *sa = &icl_sa_info;
> +	bool is_y_tile = true; /* assume y tile may be used */
> +	int num_channels = hweight8(dram->channels);
> +	int deinterleave;
> +#if 0
> +	int clpchpblock;
> +	int pagelimit;
> +#endif
> +	int ipqdepth, ipqdepthpch;
> +	int dclk_max;
> +	int maxdebw;
> +	int i;
> +
> +	/*
> +	 * Try to muzzle SAGV to prevent it from
> +	 * messing up the memory controller readout.
> +	 */
> +	intel_disable_sagv(dev_priv);
> +
> +	/*
> +	 * Magic sleep to avoid observing very high DDR clock.
> +	 * Not sure what's going on but on a system with DDR4-3200
> +	 * clock of 4800 MT/s is often observed here. A short
> +	 * sleep manages to hide that.. Is that actually
> +	 * the "min latency" SAGV point?. Maybe the SA clocks
> +	 * things way up when there is no memory traffic?
> +	 * But polling the register never seems to show this
> +	 * except during i915 unload/load. Sleeping before the
> +	 * SAGV disable usually returns 2133 MT/s.
> +	 *
> +	 * FIXME what is going on?
> +	 */
> +	msleep(5);
> +
> +	cnl_get_dram_timings(dev_priv, dram, &t);
> +
> +	icl_get_sagv_points(dev_priv, &si, &t);
> +
> +	deinterleave = DIV_ROUND_UP(num_channels, is_y_tile ? 4 : 2);
> +	dclk_max = icl_sagv_max_dclk(&si);
> +
> +	ipqdepthpch = 16;
> +
> +	maxdebw = min(sa->deprogbwlimit * 1000,
> +		      icl_calc_bw(dclk_max, 16, 1) * 6 / 10); /* 60% */
> +#if 0
> +	clpchpblock = deinterleave * 8 / num_channels;
> +	pagelimit = sa->mpagesize * deinterleave * 2 / num_channels;
> +#endif
> +	ipqdepth = min(ipqdepthpch, sa->displayrtids / num_channels);
> +
> +	DRM_DEBUG_KMS("maxdebw = %d\n", maxdebw);
> +	DRM_DEBUG_KMS("ipqdepth = %d\n", ipqdepth);
> +	DRM_DEBUG_KMS("deinterleave = %d\n", deinterleave);
> +	DRM_DEBUG_KMS("dclk_max = %d\n", dclk_max);
> +
> +	for (i = 0; i < ARRAY_SIZE(dev_priv->max_bw); i++) {
> +		struct intel_bw_info *bi = &dev_priv->max_bw[i];
> +		int clpchgroup;
> +		int j;
> +
> +		clpchgroup = (sa->deburst * deinterleave / num_channels) << i;
> +		bi->num_planes = (ipqdepth - clpchgroup) / clpchgroup + 1;
> +
> +		DRM_DEBUG_KMS("clpchgroup = %d\n", clpchgroup);
> +		DRM_DEBUG_KMS("num_planes = %d\n", bi->num_planes);
> +
> +		for (j = 0; j < si.num_points; j++) {
> +			const struct intel_sagv_point *sp = &si.points[j];
> +			int ct, bw;
> +
> +			/*
> +			 * Max row cycle time
> +			 *
> +			 * FIXME what is the logic behind the
> +			 * assumed burst length?
> +			 */
> +			ct = max_t(int, sp->t_rc, sp->t_rp + sp->t_rcd +
> +				   (clpchgroup - 1) * t.t_bl + sp->t_rdpre);
> +			bw = icl_calc_bw(sp->dclk, clpchgroup * 32 * num_channels, ct);
> +
> +			DRM_DEBUG_KMS("ct = %d\n", ct);
> +			DRM_DEBUG_KMS("bw = %d\n", bw);
> +
> +			bi->deratedbw[j] = min(maxdebw,
> +					       bw * 9 / 10); /* 90% */
> +		}
> +
> +		if (bi->num_planes == 1)
> +			break;
> +	}
> +}
> +
> +static unsigned int icl_max_bw(struct drm_i915_private *dev_priv,
> +			       int num_planes, int sagv)
> +{
> +	int i;
> +
> +	for (i = 0; i < ARRAY_SIZE(dev_priv->max_bw); i++) {
> +		const struct intel_bw_info *bi =
> +			&dev_priv->max_bw[i];
> +
> +		if (num_planes >= bi->num_planes)
> +			return bi->deratedbw[sagv];
> +	}
> +
> +	return 0;
> +}
> +
> +unsigned int intel_max_data_rate(struct drm_i915_private *dev_priv,
> +				 int num_planes)
> +{
> +	if (IS_ICELAKE(dev_priv))
> +		/*
> +		 * FIXME with SAGV disabled maybe we can assume
> +		 * point 1 will always be used? Seems to match
> +		 * the behaviour observed in the wild.
> +		 */
> +		return min3(icl_max_bw(dev_priv, num_planes, 0),
> +			    icl_max_bw(dev_priv, num_planes, 1),
> +			    icl_max_bw(dev_priv, num_planes, 2));
> +	else
> +		return UINT_MAX;
> +}
> +
> +static void icl_dump_max_bw(struct drm_i915_private *dev_priv)
> +{
> +	int i;
> +
> +	for (i = 0; i < ARRAY_SIZE(dev_priv->max_bw); i++) {
> +		const struct intel_bw_info *bi = &dev_priv->max_bw[i];
> +		int j;
> +
> +		for (j = 0; j < ARRAY_SIZE(bi->deratedbw); j++) {
> +			DRM_DEBUG_KMS("BW%d SAGV%d: num_planes=%d deratedbw=%d\n",
> +				      i, j, bi->num_planes, bi->deratedbw[j]);
> +		}
> +	}
> +}
> +
>  static void
>  intel_get_dram_info(struct drm_i915_private *dev_priv)
>  {
> @@ -1629,6 +1971,10 @@ static int i915_driver_init_hw(struct drm_i915_private *dev_priv)
>  	 */
>  	intel_get_dram_info(dev_priv);
>  
> +	if (INTEL_GEN(dev_priv) >= 11) {
> +		icl_get_bw_info(dev_priv);
> +		icl_dump_max_bw(dev_priv);
> +	}
>  
>  	return 0;
>  
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index f638c0c74955..825bea3176fc 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -54,6 +54,7 @@
>  #include <drm/drm_cache.h>
>  #include <drm/drm_util.h>
>  #include <drm/drm_dsc.h>
> +#include <drm/drm_atomic.h>
>  #include <drm/drm_connector.h>
>  #include <drm/i915_mei_hdcp_interface.h>
>  
> @@ -1845,6 +1846,13 @@ struct drm_i915_private {
>  		} type;
>  	} dram_info;
>  
> +	struct intel_bw_info {
> +		int num_planes;
> +		int deratedbw[3];
> +	} max_bw[6];
> +
> +	struct drm_private_obj bw_obj;
> +
>  	struct i915_runtime_pm runtime_pm;
>  
>  	struct {
> @@ -2634,6 +2642,8 @@ extern unsigned long i915_mch_val(struct drm_i915_private *dev_priv);
>  extern unsigned long i915_gfx_val(struct drm_i915_private *dev_priv);
>  extern void i915_update_gfx_val(struct drm_i915_private *dev_priv);
>  int vlv_force_gfx_clock(struct drm_i915_private *dev_priv, bool on);
> +unsigned int intel_max_data_rate(struct drm_i915_private *dev_priv,
> +				 int num_planes);
>  
>  int intel_engines_init_mmio(struct drm_i915_private *dev_priv);
>  int intel_engines_init(struct drm_i915_private *dev_priv);
> diff --git a/drivers/gpu/drm/i915/intel_atomic_plane.c b/drivers/gpu/drm/i915/intel_atomic_plane.c
> index 9d32a6fcf840..de6b23ee6306 100644
> --- a/drivers/gpu/drm/i915/intel_atomic_plane.c
> +++ b/drivers/gpu/drm/i915/intel_atomic_plane.c
> @@ -111,6 +111,22 @@ intel_plane_destroy_state(struct drm_plane *plane,
>  	drm_atomic_helper_plane_destroy_state(plane, state);
>  }
>  
> +unsigned int intel_plane_data_rate(const struct intel_crtc_state *crtc_state,
> +				   const struct intel_plane_state *plane_state)
> +{
> +	const struct drm_framebuffer *fb = plane_state->base.fb;
> +	unsigned int cpp = 0;
> +	int i;
> +
> +	if (!plane_state->base.visible)
> +		return 0;
> +
> +	for (i = 0; i < fb->format->num_planes; i++)
> +		cpp += fb->format->cpp[i];
> +
> +	return cpp * crtc_state->pixel_rate;

This doesn't take plane width/height into account. Surely that affects bandwidth as well?

This breaks kms_atomic_transition, which tries to decrease plane size to reduce load when max plane width/height is not supported.

According to this calculation, six 256x256 overlay planes will consume the same bandwidth as six planes filled with 4k framebuffers.

~Maarten



More information about the Intel-gfx mailing list