[Intel-gfx] [PATCH v3 2/2] drm/i915: Make sure we have enough memory bandwidth on ICL

Sripada, Radhakrishna radhakrishna.sripada at intel.com
Tue May 14 00:59:26 UTC 2019


On Mon, 2019-05-13 at 17:16 +0300, Ville Syrjälä wrote:
> On Wed, May 08, 2019 at 09:05:06PM +0000, Sripada, Radhakrishna
> wrote:
> > On Fri, 2019-05-03 at 22:08 +0300, Ville Syrjala wrote:
> > > From: Ville Syrjälä <ville.syrjala at linux.intel.com>
> > > 
> > > ICL has so many planes that it can easily exceed the maximum
> > > effective memory bandwidth of the system. We must therefore check
> > > that we don't exceed that limit.
> > > 
> > > The algorithm is very magic number heavy and lacks sufficient
> > > explanation for now. We also have no sane way to query the
> > > memory clock and timings, so we must rely on a combination of
> > > raw readout from the memory controller and hardcoded assumptions.
> > > The memory controller values obviously change as the system
> > > jumps between the different SAGV points, so we try to stabilize
> > > it first by disabling SAGV for the duration of the readout.
> > > 
> > > The utilized bandwidth is tracked via a device wide atomic
> > > private object. That is actually not robust because we can't
> > > afford to enforce strict global ordering between the pipes.
> > > Thus I think I'll need to change this to simply chop up the
> > > available bandwidth between all the active pipes. Each pipe
> > > can then do whatever it wants as long as it doesn't exceed
> > > its budget. That scheme will also require that we assume that
> > > any number of planes could be active at any time.
> > > 
> > > TODO: make it robust and deal with all the open questions
> > > 
> > > v2: Sleep longer after disabling SAGV
> > > v3: Poll for the dclk to get raised (seen it take 250ms!)
> > >     If the system has 2133MT/s memory then we pointlessly
> > >     wait one full second :(
> > > v4: Use the new pcode interface to get the qgv points rather
> > >     that using hardcoded numbers
> > > 
> > > Signed-off-by: Ville Syrjälä <ville.syrjala at linux.intel.com>
> > > ---
> > >  drivers/gpu/drm/i915/Makefile             |   1 +
> > >  drivers/gpu/drm/i915/i915_drv.c           | 229
> > > ++++++++++++++++++++++
> > >  drivers/gpu/drm/i915/i915_drv.h           |  10 +
> > >  drivers/gpu/drm/i915/i915_reg.h           |   3 +
> > >  drivers/gpu/drm/i915/intel_atomic_plane.c |  20 ++
> > >  drivers/gpu/drm/i915/intel_atomic_plane.h |   2 +
> > >  drivers/gpu/drm/i915/intel_bw.c           | 181
> > > +++++++++++++++++
> > >  drivers/gpu/drm/i915/intel_bw.h           |  46 +++++
> > >  drivers/gpu/drm/i915/intel_display.c      |  40 +++-
> > >  drivers/gpu/drm/i915/intel_drv.h          |   2 +
> > >  10 files changed, 533 insertions(+), 1 deletion(-)
> > >  create mode 100644 drivers/gpu/drm/i915/intel_bw.c
> > >  create mode 100644 drivers/gpu/drm/i915/intel_bw.h
> > > 
> > > diff --git a/drivers/gpu/drm/i915/Makefile
> > > b/drivers/gpu/drm/i915/Makefile
> > > index 68106fe35a04..139a0fc19390 100644
> > > --- a/drivers/gpu/drm/i915/Makefile
> > > +++ b/drivers/gpu/drm/i915/Makefile
> > > @@ -138,6 +138,7 @@ i915-y += intel_audio.o \
> > >  	  intel_atomic.o \
> > >  	  intel_atomic_plane.o \
> > >  	  intel_bios.o \
> > > +	  intel_bw.o \
> > >  	  intel_cdclk.o \
> > >  	  intel_color.o \
> > >  	  intel_combo_phy.o \
> > > diff --git a/drivers/gpu/drm/i915/i915_drv.c
> > > b/drivers/gpu/drm/i915/i915_drv.c
> > > index 5ed864752c7b..b7fa7b51c2e2 100644
> > > --- a/drivers/gpu/drm/i915/i915_drv.c
> > > +++ b/drivers/gpu/drm/i915/i915_drv.c
> > > @@ -70,6 +70,7 @@
> > >  #include "intel_overlay.h"
> > >  #include "intel_pipe_crc.h"
> > >  #include "intel_pm.h"
> > > +#include "intel_sideband.h"
> > >  #include "intel_sprite.h"
> > >  #include "intel_uc.h"
> > >  
> > > @@ -1435,6 +1436,232 @@ bxt_get_dram_info(struct drm_i915_private
> > > *dev_priv)
> > >  	return 0;
> > >  }
> > >  
> > > +struct intel_qgv_point {
> > > +	u16 dclk, t_rp, t_rdpre, t_rc, t_ras, t_rcd;
> > > +};
> > > +
> > > +struct intel_sagv_info {
> > > +	struct intel_qgv_point points[3];
> > > +	u8 num_points;
> > > +	u8 num_channels;
> > > +	u8 t_bl;
> > > +	enum intel_dram_type dram_type;
> > > +};
> > > +
> > > +static int icl_pcode_read_mem_global_info(struct
> > > drm_i915_private
> > > *dev_priv,
> > > +					  struct intel_sagv_info *si)
> > > +{
> > > +	u32 val = 0;
> > > +	int ret;
> > > +
> > > +	ret = sandybridge_pcode_read(dev_priv,
> > > +				     ICL_PCODE_MEM_SUBSYSYSTEM_INFO |
> > > +				     ICL_PCODE_MEM_SS_READ_GLOBAL_INFO,
> > > +				     &val, NULL);
> > > +	if (ret)
> > > +		return ret;
> > > +
> > > +	switch (val & 0xf) {
> > > +	case 0:
> > > +		si->dram_type = INTEL_DRAM_DDR4;
> > > +		break;
> > > +	case 1:
> > > +		si->dram_type = INTEL_DRAM_DDR3;
> > > +		break;
> > > +	case 2:
> > > +		si->dram_type = INTEL_DRAM_LPDDR3;
> > > +		break;
> > > +	case 3:
> > > +		si->dram_type = INTEL_DRAM_LPDDR4;
> > > +		break;
> > > +	default:
> > > +		MISSING_CASE(val & 0xf);
> > > +		break;
> > > +	}
> > > +
> > > +	si->num_channels = (val & 0xf0) >> 4;
> > > +	si->num_points = (val & 0xf00) >> 8;
> > > +
> > > +	si->t_bl = si->dram_type == INTEL_DRAM_DDR4 ? 4 : 8;
> > > +
> > > +	return 0;
> > > +}
> > > +
> > > +static int icl_pcode_read_qgv_point_info(struct drm_i915_private
> > > *dev_priv,
> > > +					 struct intel_qgv_point *sp,
> > > +					 int point)
> > Are we trying to retrieve the dram timing parameters to calculate
> > the
> > latency? If so can that be separated as latency calculation instead
> > of
> > using it under bw info below?
> > > +{
> > > +	u32 val = 0, val2;
> > > +	int ret;
> > > +
> > > +	ret = sandybridge_pcode_read(dev_priv,
> > > +				     ICL_PCODE_MEM_SUBSYSYSTEM_INFO |
> > > +				     ICL_PCODE_MEM_SS_READ_QGV_POINT_IN
> > > FO(point),
> > > +				     &val, &val2);
> > > +	if (ret)
> > > +		return ret;
> > > +
> > > +	sp->dclk = val & 0xffff;
> > > +	sp->t_rp = (val & 0xff0000) >> 16;
> > > +	sp->t_rcd = (val & 0xff000000) >> 24;
> > > +
> > > +	sp->t_rdpre = val2 & 0xff;
> > > +	sp->t_ras = (val2 & 0xff00) >> 8;
> > > +
> > > +	sp->t_rc = sp->t_rp + sp->t_ras;
> > > +
> > > +	return 0;
> > > +}
> > > +
> > > +static int icl_get_qgv_points(struct drm_i915_private *dev_priv,
> > > +			      struct intel_sagv_info *si)
> > > +{
> > > +	int i, ret;
> > > +
> > > +	ret = icl_pcode_read_mem_global_info(dev_priv, si);
> > > +	if (ret)
> > > +		return ret;
> > > +
> > > +	if (WARN_ON(si->num_points > ARRAY_SIZE(si->points)))
> > > +		si->num_points = ARRAY_SIZE(si->points);
> > > +
> > > +	for (i = 0; i < si->num_points; i++) {
> > > +		struct intel_qgv_point *sp = &si->points[i];
> > > +
> > > +		ret = icl_pcode_read_qgv_point_info(dev_priv, sp, i);
> > > +		if (ret)
> > > +			return ret;
> > > +
> > > +		DRM_DEBUG_KMS("QGV %d: DCLK=%d tRP=%d tRDPRE=%d tRAS=%d
> > > tRCD=%d tRC=%d\n",
> > > +			      i, sp->dclk, sp->t_rp, sp->t_rdpre, sp-
> > > > t_ras,
> > > +			      sp->t_rcd, sp->t_rc);
> > > +	}
> > > +
> > > +	return 0;
> > > +}
> > > +
> > > +static int icl_calc_bw(int dclk, int num, int den)
> > > +{
> > > +	/* multiples of 16.666MHz (100/6) */
> > > +	return DIV_ROUND_CLOSEST(num * dclk * 100, den * 6);
> > > +}
> > > +
> > > +static int icl_sagv_max_dclk(const struct intel_sagv_info *si)
> > > +{
> > > +	u16 dclk = 0;
> > > +	int i;
> > > +
> > > +	for (i = 0; i < si->num_points; i++)
> > > +		dclk = max(dclk, si->points[i].dclk);
> > > +
> > > +	return dclk;
> > > +}
> > > +
> > > +struct intel_sa_info {
> > > +	u8 deburst, mpagesize, deprogbwlimit, displayrtids;
> > > +};
> > > +
> > > +static const struct intel_sa_info icl_sa_info = {
> > > +	.deburst = 8,
> > > +	.mpagesize = 16,
> > > +	.deprogbwlimit = 25, /* GB/s */
> > > +	.displayrtids = 128,
> > > +};
> > > +
> > > +static int icl_get_bw_info(struct drm_i915_private *dev_priv)
> > > +{
> > > +	struct intel_sagv_info si = {};
> > > +	const struct intel_sa_info *sa = &icl_sa_info;
> > > +	bool is_y_tile = true; /* assume y tile may be used */
> > > +	int num_channels;
> > > +	int deinterleave;
> > > +	int ipqdepth, ipqdepthpch;
> > > +	int dclk_max;
> > > +	int maxdebw;
> > > +	int i, ret;
> > > +
> > > +	ret = icl_get_qgv_points(dev_priv, &si);
> > > +	if (ret)
> > > +		return ret;
> > > +	num_channels = si.num_channels;
> > > +
> > > +	deinterleave = DIV_ROUND_UP(num_channels, is_y_tile ? 4 : 2);
> > > +	dclk_max = icl_sagv_max_dclk(&si);
> > > +
> > > +	ipqdepthpch = 16;
> > > +
> > > +	maxdebw = min(sa->deprogbwlimit * 1000,
> > > +		      icl_calc_bw(dclk_max, 16, 1) * 6 / 10); /* 60% */
> > > +	ipqdepth = min(ipqdepthpch, sa->displayrtids / num_channels);
> > > +
> > > +	for (i = 0; i < ARRAY_SIZE(dev_priv->max_bw); i++) {
> > > +		struct intel_bw_info *bi = &dev_priv->max_bw[i];
> > > +		int clpchgroup;
> > > +		int j;
> > > +
> > > +		clpchgroup = (sa->deburst * deinterleave /
> > > num_channels) << i;
> > > +		bi->num_planes = (ipqdepth - clpchgroup) / clpchgroup +
> > > 1;
> > > +
> > > +		for (j = 0; j < si.num_points; j++) {
> > > +			const struct intel_qgv_point *sp =
> > > &si.points[j];
> > > +			int ct, bw;
> > > +
> > > +			/*
> > > +			 * Max row cycle time
> > > +			 *
> > > +			 * FIXME what is the logic behind the
> > > +			 * assumed burst length?
> > > +			 */
> > > +			ct = max_t(int, sp->t_rc, sp->t_rp + sp->t_rcd
> > > +
> > > +				   (clpchgroup - 1) * si.t_bl + sp-
> > > > t_rdpre);
> > For logical flow can we move the above timing related calculations
> > to a
> > separate function along with the FIXME to delink bandwidth and latency
> > calculations?
> 
> I don't see what you would want to delink. This is all just
> calculating the effective memory bandwidth limit.
I am assuming the variables t_rc, t_rp, t_rcd are related to DRAM
latency/timing information. I was wondering if this latency calculation
could be moved to a separate inline function for readability purposes.

- Radhakrishna(RK) Sripada 
> 


More information about the Intel-gfx mailing list