[Intel-gfx] [PATCH 2/2] drm/i915: Make sure we have enough memory bandwidth on ICL
Maarten Lankhorst
maarten.lankhorst at linux.intel.com
Wed Mar 27 14:12:21 UTC 2019
On 20-03-2019 at 22:46, Ville Syrjala wrote:
> From: Ville Syrjälä <ville.syrjala at linux.intel.com>
>
> ICL has so many planes that it can easily exceed the maximum
> effective memory bandwidth of the system. We must therefore check
> that we don't exceed that limit.
>
> The algorithm is very magic number heavy and lacks sufficient
> explanation for now. We also have no sane way to query the
> memory clock and timings, so we must rely on a combination of
> raw readout from the memory controller and hardcoded assumptions.
> The memory controller values obviously change as the system
> jumps between the different SAGV points, so we try to stabilize
> it first by disabling SAGV for the duration of the readout.
>
> The utilized bandwidth is tracked via a device wide atomic
> private object. That is actually not robust because we can't
> afford to enforce strict global ordering between the pipes.
> Thus I think I'll need to change this to simply chop up the
> available bandwidth between all the active pipes. Each pipe
> can then do whatever it wants as long as it doesn't exceed
> its budget. That scheme will also require that we assume that
> any number of planes could be active at any time.
>
> TODO: make it robust and deal with all the open questions
>
> Signed-off-by: Ville Syrjälä <ville.syrjala at linux.intel.com>
> ---
> drivers/gpu/drm/i915/Makefile | 1 +
> drivers/gpu/drm/i915/i915_drv.c | 346 ++++++++++++++++++++++
> drivers/gpu/drm/i915/i915_drv.h | 10 +
> drivers/gpu/drm/i915/intel_atomic_plane.c | 20 ++
> drivers/gpu/drm/i915/intel_bw.c | 190 ++++++++++++
> drivers/gpu/drm/i915/intel_display.c | 39 ++-
> drivers/gpu/drm/i915/intel_drv.h | 32 ++
> 7 files changed, 637 insertions(+), 1 deletion(-)
> create mode 100644 drivers/gpu/drm/i915/intel_bw.c
>
> diff --git a/drivers/gpu/drm/i915/Makefile b/drivers/gpu/drm/i915/Makefile
> index 68fecf355471..2d24bdd501c4 100644
> --- a/drivers/gpu/drm/i915/Makefile
> +++ b/drivers/gpu/drm/i915/Makefile
> @@ -126,6 +126,7 @@ i915-y += intel_audio.o \
> intel_atomic.o \
> intel_atomic_plane.o \
> intel_bios.o \
> + intel_bw.o \
> intel_cdclk.o \
> intel_color.o \
> intel_combo_phy.o \
> diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
> index 8b37ec0e0676..134d1b1a93f1 100644
> --- a/drivers/gpu/drm/i915/i915_drv.c
> +++ b/drivers/gpu/drm/i915/i915_drv.c
> @@ -1451,6 +1451,348 @@ bxt_get_dram_info(struct drm_i915_private *dev_priv)
> return 0;
> }
>
> +#define SA_PERF_STATUS_0_0_0_MCHBAR_PC _MMIO(MCHBAR_MIRROR_BASE_SNB + 0x5918)
> +#define SKL_QCLK_RATIO_MASK (0x7f << 0)
> +#define SKL_QCLK_RATIO_SHIFT 0
> +#define SKL_QCLK_REFERENCE (1 << 7)
> +#define CNL_QCLK_RATIO_MASK (0x7f << 2)
> +#define CNL_QCLK_RATIO_SHIFT 2
> +#define CNL_QCLK_REFERENCE (1 << 9)
> +#define ICL_QCLK_RATIO_MASK (0xff << 2)
> +#define ICL_QCLK_RATIO_SHIFT 2
> +#define ICL_QCLK_REFERENCE (1 << 10)
> +
> +#define MCHBAR_CH0_CR_TC_PRE_0_0_0_MCHBAR _MMIO(MCHBAR_MIRROR_BASE_SNB + 0x4000)
> +#define MCHBAR_CH1_CR_TC_PRE_0_0_0_MCHBAR _MMIO(MCHBAR_MIRROR_BASE_SNB + 0x4400)
> +#define SKL_DRAM_T_WRPRE_MASK (0x7f << 24)
> +#define SKL_DRAM_T_WRPRE_SHIFT 24
> +#define SKL_DRAM_T_RDPRE_MASK (0xf << 16)
> +#define SKL_DRAM_T_RDPRE_SHIFT 16
> +#define SKL_DRAM_T_RAS_MASK (0x7f << 8)
> +#define SKL_DRAM_T_RAS_SHIFT 8
> +#define SKL_DRAM_T_RPAB_EXT_MASK (0x3 << 6)
> +#define SKL_DRAM_T_RPAB_EXT_SHIFT 6
> +#define SKL_DRAM_T_RP_MASK (0x3f << 0)
> +#define SKL_DRAM_T_RP_SHIFT 0
> +#define CNL_DRAM_T_WRPRE_MASK (0xff << 24)
> +#define CNL_DRAM_T_WRPRE_SHIFT 24
> +#define CNL_DRAM_T_PPD_MASK (0x7 << 21)
> +#define CNL_DRAM_T_PPD_SHIFT 21
> +#define CNL_DRAM_T_RDPRE_MASK (0x1f << 16)
> +#define CNL_DRAM_T_RDPRE_SHIFT 16
> +#define CNL_DRAM_T_RAS_MASK (0x7f << 9)
> +#define CNL_DRAM_T_RAS_SHIFT 9
> +#define CNL_DRAM_T_RPAB_EXT_MASK (0x7 << 6)
> +#define CNL_DRAM_T_RPAB_EXT_SHIFT 6
> +#define CNL_DRAM_T_RP_MASK (0x3f << 0)
> +#define CNL_DRAM_T_RP_SHIFT 0
> +
> +struct intel_dram_timings {
> + u8 t_rp, t_rdpre, t_ras, t_bl;
> +};
> +
> +static int icl_get_dclk(struct drm_i915_private *dev_priv)
> +{
> + int ratio, ref;
> + u32 val;
> +
> + val = I915_READ(SA_PERF_STATUS_0_0_0_MCHBAR_PC);
> +
> + DRM_DEBUG_KMS("SA_PERF = 0x%x\n", val);
> + DRM_DEBUG_KMS("BIOS_DATA = 0x%x\n",
> + I915_READ(SKL_MC_BIOS_DATA_0_0_0_MCHBAR_PCU));
> +
> + ratio = (val & ICL_QCLK_RATIO_MASK) >> ICL_QCLK_RATIO_SHIFT;
> +
> + if (val & ICL_QCLK_REFERENCE)
> + ref = 6; /* 6 * 16.666 MHz = 100 MHz */
> + else
> + ref = 8; /* 8 * 16.666 MHz = 133 MHz */
> +
> + return ratio * ref;
> +}
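To make the units concrete (assuming I follow them correctly): dclk is returned in multiples of 16.666 MHz. E.g. a ratio of 8 with the 133 MHz reference (ref = 8) gives 8 * 8 = 64, i.e. a ~1066 MHz memory controller clock, which at double data rate is DDR4-2133.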
> +
> +#if 0
> +static void skl_get_dram_ch_timings(struct intel_dram_timings *t,
> + int channel, enum intel_dram_type type,
> + u32 val)
> +{
> + t->t_rp = (val & SKL_DRAM_T_RP_MASK) >> SKL_DRAM_T_RP_SHIFT;
> + t->t_rdpre = (val & SKL_DRAM_T_RDPRE_MASK) >> SKL_DRAM_T_RDPRE_SHIFT;
> + t->t_ras = (val & SKL_DRAM_T_RAS_MASK) >> SKL_DRAM_T_RAS_SHIFT;
> + t->t_bl = type == INTEL_DRAM_DDR4 ? 4 : 8;
> +
> + DRM_DEBUG_KMS("CH%d tRP=%d tRDPRE=%d tRAS=%d tBL=%d\n",
> + channel, t->t_rp, t->t_rdpre, t->t_ras, t->t_bl);
> +}
> +
> +static void skl_get_dram_timings(struct drm_i915_private *dev_priv,
> + const struct dram_info *dram,
> + struct intel_dram_timings *t)
> +{
> + if (dram->channels & BIT(0)) {
> + u32 val = I915_READ(MCHBAR_CH0_CR_TC_PRE_0_0_0_MCHBAR);
> +
> + skl_get_dram_ch_timings(t, 0, dram->type, val);
> + } else if (dram->channels & BIT(1)) {
> + u32 val = I915_READ(MCHBAR_CH1_CR_TC_PRE_0_0_0_MCHBAR);
> +
> + skl_get_dram_ch_timings(t, 1, dram->type, val);
> + }
> +}
> +#endif
> +
> +static void cnl_get_dram_ch_timings(struct intel_dram_timings *t,
> + int channel, enum intel_dram_type type,
> + u32 val)
> +{
> + t->t_rp = (val & CNL_DRAM_T_RP_MASK) >> CNL_DRAM_T_RP_SHIFT;
> + t->t_rdpre = (val & CNL_DRAM_T_RDPRE_MASK) >> CNL_DRAM_T_RDPRE_SHIFT;
> + t->t_ras = (val & CNL_DRAM_T_RAS_MASK) >> CNL_DRAM_T_RAS_SHIFT;
> + t->t_bl = type == INTEL_DRAM_DDR4 ? 4 : 8;
> +
> + DRM_DEBUG_KMS("CH%d tRP=%d tRDPRE=%d tRAS=%d tBL=%d\n",
> + channel, t->t_rp, t->t_rdpre, t->t_ras, t->t_bl);
> +}
> +
> +static void cnl_get_dram_timings(struct drm_i915_private *dev_priv,
> + const struct dram_info *dram,
> + struct intel_dram_timings *t)
> +{
> + u32 val;
> +
> + if (dram->channels & BIT(0)) {
> + val = I915_READ(MCHBAR_CH0_CR_TC_PRE_0_0_0_MCHBAR);
> + cnl_get_dram_ch_timings(t, 0, dram->type, val);
> + } else if (dram->channels & BIT(1)) {
> + val = I915_READ(MCHBAR_CH1_CR_TC_PRE_0_0_0_MCHBAR);
> + cnl_get_dram_ch_timings(t, 1, dram->type, val);
> + }
> +}
> +
> +struct intel_sagv_point {
> + u16 dclk, t_rp, t_rdpre, t_rc, t_ras, t_rcd;
> +};
> +
> +struct intel_sagv_info {
> + struct intel_sagv_point points[3];
> + int num_points;
> +};
> +
> +static void icl_get_sagv_points(struct drm_i915_private *dev_priv,
> + struct intel_sagv_info *si,
> + const struct intel_dram_timings *t)
> +{
> + int dclk, i;
> +
> + dclk = icl_get_dclk(dev_priv);
> +
> + si->num_points = 3;
> +
> + /*
> + * ICL Hardcoded
> + * Name  Description         MC clock (MHz)    DDR data rate (MT/s)  Gear
> + * Low   Min voltage point   1066              2133                  2
> + * Med   Max DDR rate point  Max DDR freq / 2  Max DDR freq          2
> + * High  Min latency point   2667              Same as MC clock      1
> + */
> + si->points[0].dclk = min(64, dclk);
> + si->points[1].dclk = dclk;
> + si->points[2].dclk = min(80, dclk);
> +
> + for (i = 0; i < si->num_points; i++) {
> + struct intel_sagv_point *sp = &si->points[i];
> +
> + /*
> + * We assume these scale linearly.
> + * Seems to match observed behaviour.
> + */
> + sp->t_rp = DIV_ROUND_UP(t->t_rp * sp->dclk, dclk);
> + sp->t_rdpre = DIV_ROUND_UP(t->t_rdpre * sp->dclk, dclk);
> + sp->t_ras = DIV_ROUND_UP(t->t_ras * sp->dclk, dclk);
> +
> + sp->t_rcd = sp->t_rp;
> + sp->t_rc = sp->t_rp + sp->t_ras;
> +
> + DRM_DEBUG_KMS("SAGV %d DCLK=%d tRP=%d tRDPRE=%d tRAS=%d tRCD=%d tRC=%d\n",
> + i, sp->dclk, sp->t_rp, sp->t_rdpre, sp->t_ras,
> + sp->t_rcd, sp->t_rc);
> + }
> +}
> +
> +static int icl_calc_bw(int dclk, int num, int den)
> +{
> + /* multiples of 2 x 16.666MHz (100/6) */
> + return DIV_ROUND_CLOSEST(num * dclk * 2 * 100, den * 6);
> +}
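Worked example, again assuming I read the units right (MB/s throughout): icl_calc_bw(64, 16, 1) = DIV_ROUND_CLOSEST(16 * 64 * 2 * 100, 6) = 34133, i.e. ~34.1 GB/s for a 1066 MHz / 2133 MT/s point fetching 16 bytes per transfer.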
> +
> +static int icl_sagv_max_dclk(const struct intel_sagv_info *si)
> +{
> + u16 dclk = 0;
> + int i;
> +
> + for (i = 0; i < si->num_points; i++)
> + dclk = max(dclk, si->points[i].dclk);
> +
> + return dclk;
> +}
> +
> +struct intel_sa_info {
> + u8 deburst, mpagesize, deprogbwlimit, displayrtids;
> +};
> +
> +static const struct intel_sa_info icl_sa_info = {
> + .deburst = 8,
> + .mpagesize = 16,
> + .deprogbwlimit = 25, /* GB/s */
> + .displayrtids = 128,
> +};
> +
> +static void icl_get_bw_info(struct drm_i915_private *dev_priv)
> +{
> + const struct dram_info *dram = &dev_priv->dram_info;
> + struct intel_sagv_info si = {};
> + struct intel_dram_timings t = {};
> + const struct intel_sa_info *sa = &icl_sa_info;
> + bool is_y_tile = true; /* assume y tile may be used */
> + int num_channels = hweight8(dram->channels);
> + int deinterleave;
> +#if 0
> + int clpchpblock;
> + int pagelimit;
> +#endif
> + int ipqdepth, ipqdepthpch;
> + int dclk_max;
> + int maxdebw;
> + int i;
> +
> + /*
> + * Try to muzzle SAGV to prevent it from
> + * messing up the memory controller readout.
> + */
> + intel_disable_sagv(dev_priv);
> +
> + /*
> + * Magic sleep to avoid observing a very high DDR clock.
> + * Not sure what's going on, but on a system with DDR4-3200
> + * a clock of 4800 MT/s is often observed here. A short
> + * sleep manages to hide that... Is that actually
> + * the "min latency" SAGV point? Maybe the SA clocks
> + * things way up when there is no memory traffic?
> + * But polling the register never seems to show this
> + * except during i915 unload/load. Sleeping before the
> + * SAGV disable usually returns 2133 MT/s.
> + *
> + * FIXME what is going on?
> + */
> + msleep(5);
> +
> + cnl_get_dram_timings(dev_priv, dram, &t);
> +
> + icl_get_sagv_points(dev_priv, &si, &t);
> +
> + deinterleave = DIV_ROUND_UP(num_channels, is_y_tile ? 4 : 2);
> + dclk_max = icl_sagv_max_dclk(&si);
> +
> + ipqdepthpch = 16;
> +
> + maxdebw = min(sa->deprogbwlimit * 1000,
> + icl_calc_bw(dclk_max, 16, 1) * 6 / 10); /* 60% */
> +#if 0
> + clpchpblock = deinterleave * 8 / num_channels;
> + pagelimit = sa->mpagesize * deinterleave * 2 / num_channels;
> +#endif
> + ipqdepth = min(ipqdepthpch, sa->displayrtids / num_channels);
> +
> + DRM_DEBUG_KMS("maxdebw = %d\n", maxdebw);
> + DRM_DEBUG_KMS("ipqdepth = %d\n", ipqdepth);
> + DRM_DEBUG_KMS("deinterleave = %d\n", deinterleave);
> + DRM_DEBUG_KMS("dclk_max = %d\n", dclk_max);
> +
> + for (i = 0; i < ARRAY_SIZE(dev_priv->max_bw); i++) {
> + struct intel_bw_info *bi = &dev_priv->max_bw[i];
> + int clpchgroup;
> + int j;
> +
> + clpchgroup = (sa->deburst * deinterleave / num_channels) << i;
> + bi->num_planes = (ipqdepth - clpchgroup) / clpchgroup + 1;
> +
> + DRM_DEBUG_KMS("clpchgroup = %d\n", clpchgroup);
> + DRM_DEBUG_KMS("num_planes = %d\n", bi->num_planes);
> +
> + for (j = 0; j < si.num_points; j++) {
> + const struct intel_sagv_point *sp = &si.points[j];
> + int ct, bw;
> +
> + /*
> + * Max row cycle time
> + *
> + * FIXME what is the logic behind the
> + * assumed burst length?
> + */
> + ct = max_t(int, sp->t_rc, sp->t_rp + sp->t_rcd +
> + (clpchgroup - 1) * t.t_bl + sp->t_rdpre);
> + bw = icl_calc_bw(sp->dclk, clpchgroup * 32 * num_channels, ct);
> +
> + DRM_DEBUG_KMS("ct = %d\n", ct);
> + DRM_DEBUG_KMS("bw = %d\n", bw);
> +
> + bi->deratedbw[j] = min(maxdebw,
> + bw * 9 / 10); /* 90% */
> + }
> +
> + if (bi->num_planes == 1)
> + break;
> + }
> +}
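Tracing this with made-up but plausible numbers — 2 channels, Y-tile: deinterleave = DIV_ROUND_UP(2, 4) = 1 and ipqdepth = min(16, 128 / 2) = 16, so for i = 0 we get clpchgroup = (8 * 1 / 2) << 0 = 4 and num_planes = (16 - 4) / 4 + 1 = 4. clpchgroup doubles per entry, giving num_planes = 2 at i = 1 and num_planes = 1 at i = 2, where the loop breaks.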
> +
> +static unsigned int icl_max_bw(struct drm_i915_private *dev_priv,
> + int num_planes, int sagv)
> +{
> + int i;
> +
> + for (i = 0; i < ARRAY_SIZE(dev_priv->max_bw); i++) {
> + const struct intel_bw_info *bi =
> + &dev_priv->max_bw[i];
> +
> + if (num_planes >= bi->num_planes)
> + return bi->deratedbw[sagv];
> + }
> +
> + return 0;
> +}
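Note this depends on max_bw[] being ordered by decreasing num_planes, which the clpchgroup doubling above guarantees. Continuing the example: with entries of num_planes 4/2/1, a query for 3 active planes skips the first entry (3 >= 4 is false) and returns the derated bandwidth of the num_planes == 2 entry.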
> +
> +unsigned int intel_max_data_rate(struct drm_i915_private *dev_priv,
> + int num_planes)
> +{
> + if (IS_ICELAKE(dev_priv))
> + /*
> + * FIXME with SAGV disabled maybe we can assume
> + * point 1 will always be used? Seems to match
> + * the behaviour observed in the wild.
> + */
> + return min3(icl_max_bw(dev_priv, num_planes, 0),
> + icl_max_bw(dev_priv, num_planes, 1),
> + icl_max_bw(dev_priv, num_planes, 2));
> + else
> + return UINT_MAX;
> +}
> +
> +static void icl_dump_max_bw(struct drm_i915_private *dev_priv)
> +{
> + int i;
> +
> + for (i = 0; i < ARRAY_SIZE(dev_priv->max_bw); i++) {
> + const struct intel_bw_info *bi = &dev_priv->max_bw[i];
> + int j;
> +
> + for (j = 0; j < ARRAY_SIZE(bi->deratedbw); j++) {
> + DRM_DEBUG_KMS("BW%d SAGV%d: num_planes=%d deratedbw=%d\n",
> + i, j, bi->num_planes, bi->deratedbw[j]);
> + }
> + }
> +}
> +
> static void
> intel_get_dram_info(struct drm_i915_private *dev_priv)
> {
> @@ -1629,6 +1971,10 @@ static int i915_driver_init_hw(struct drm_i915_private *dev_priv)
> */
> intel_get_dram_info(dev_priv);
>
> + if (INTEL_GEN(dev_priv) >= 11) {
> + icl_get_bw_info(dev_priv);
> + icl_dump_max_bw(dev_priv);
> + }
>
> return 0;
>
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index f638c0c74955..825bea3176fc 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -54,6 +54,7 @@
> #include <drm/drm_cache.h>
> #include <drm/drm_util.h>
> #include <drm/drm_dsc.h>
> +#include <drm/drm_atomic.h>
> #include <drm/drm_connector.h>
> #include <drm/i915_mei_hdcp_interface.h>
>
> @@ -1845,6 +1846,13 @@ struct drm_i915_private {
> } type;
> } dram_info;
>
> + struct intel_bw_info {
> + int num_planes;
> + int deratedbw[3];
> + } max_bw[6];
> +
> + struct drm_private_obj bw_obj;
> +
> struct i915_runtime_pm runtime_pm;
>
> struct {
> @@ -2634,6 +2642,8 @@ extern unsigned long i915_mch_val(struct drm_i915_private *dev_priv);
> extern unsigned long i915_gfx_val(struct drm_i915_private *dev_priv);
> extern void i915_update_gfx_val(struct drm_i915_private *dev_priv);
> int vlv_force_gfx_clock(struct drm_i915_private *dev_priv, bool on);
> +unsigned int intel_max_data_rate(struct drm_i915_private *dev_priv,
> + int num_planes);
>
> int intel_engines_init_mmio(struct drm_i915_private *dev_priv);
> int intel_engines_init(struct drm_i915_private *dev_priv);
> diff --git a/drivers/gpu/drm/i915/intel_atomic_plane.c b/drivers/gpu/drm/i915/intel_atomic_plane.c
> index 9d32a6fcf840..de6b23ee6306 100644
> --- a/drivers/gpu/drm/i915/intel_atomic_plane.c
> +++ b/drivers/gpu/drm/i915/intel_atomic_plane.c
> @@ -111,6 +111,22 @@ intel_plane_destroy_state(struct drm_plane *plane,
> drm_atomic_helper_plane_destroy_state(plane, state);
> }
>
> +unsigned int intel_plane_data_rate(const struct intel_crtc_state *crtc_state,
> + const struct intel_plane_state *plane_state)
> +{
> + const struct drm_framebuffer *fb = plane_state->base.fb;
> + unsigned int cpp = 0;
> + int i;
> +
> + if (!plane_state->base.visible)
> + return 0;
> +
> + for (i = 0; i < fb->format->num_planes; i++)
> + cpp += fb->format->cpp[i];
> +
> + return cpp * crtc_state->pixel_rate;
This doesn't take plane width/height into account? Surely that affects bandwidth as well?
This breaks kms_atomic_transition, which tries to decrease plane size to reduce load when the maximum plane width/height is not supported.
According to this calculation, 6 256x256 overlay planes will take the same bandwidth as 6 planes filled with 4k fbs.
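Something that scales with the plane's actual source/destination size would behave better there. A rough sketch (untested, just to illustrate the kind of scaling I mean):

unsigned int intel_plane_data_rate(const struct intel_crtc_state *crtc_state,
				   const struct intel_plane_state *plane_state)
{
	const struct drm_framebuffer *fb = plane_state->base.fb;
	unsigned int src_w, src_h, dst_w, dst_h;
	unsigned int cpp = 0;
	int i;

	if (!plane_state->base.visible)
		return 0;

	/* src is 16.16 fixed point, dst is integer pixels */
	src_w = drm_rect_width(&plane_state->base.src) >> 16;
	src_h = drm_rect_height(&plane_state->base.src) >> 16;
	dst_w = drm_rect_width(&plane_state->base.dst);
	dst_h = drm_rect_height(&plane_state->base.dst);

	for (i = 0; i < fb->format->num_planes; i++)
		cpp += fb->format->cpp[i];

	/*
	 * Scale the pipe pixel rate by the fraction of the pipe the
	 * plane covers; downscaling fetches more pixels than it
	 * scans out, so use the larger of the src/dst areas.
	 */
	return DIV_ROUND_UP_ULL(mul_u32_u32(crtc_state->pixel_rate * cpp,
					    max(src_w * src_h, dst_w * dst_h)),
				crtc_state->pipe_src_w * crtc_state->pipe_src_h);
}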
~Maarten