[Intel-gfx] [PATCH 3/3] drm/i915: Streamline the artihmetic

Thu Apr 30 11:57:44 UTC 2020

On Wed, Apr 29, 2020 at 08:11:04PM +0100, Chris Wilson wrote:
> Quoting Ville Syrjala (2020-04-29 19:54:57)
> > From: Ville Syrjälä <ville.syrjala at linux.intel.com>
> > 
> > All these ROUNDIND_FACTORs and whatnot are making this thing hard to
> > read. Get rid of them. And let's massage some of the fractions to
> > give us less questionable intermediate results and perhaps less
> > divisions.
> > 
> > Also looks like a good helping of 64bit math stuff is needed to
> > avoid some of overflows present in the current code. There
> > might still be a few overflows, namely when calculating
> > link_clks_available/samples_room (would require a huge hblank
> > though), and potentially when calculating hblank_rise (not sure
> > how large link_clks_active can get).
> > 
> > It looks like we're still not calculating exactly what the spec says
> > since we truncate tu_data and tu_line early. But I'm too lazy to
> > figure out if we could avoid that.
> > 
> > Cc: Anshuman Gupta <anshuman.gupta at intel.com>
> > Cc: Uma Shankar <uma.shankar at intel.com>
> > Signed-off-by: Ville Syrjälä <ville.syrjala at linux.intel.com>
> > ---
> >  drivers/gpu/drm/i915/display/intel_audio.c | 54 ++++++++--------------
> >  1 file changed, 19 insertions(+), 35 deletions(-)
> > 
> > diff --git a/drivers/gpu/drm/i915/display/intel_audio.c b/drivers/gpu/drm/i915/display/intel_audio.c
> > index 00f7a3cf9a04..05cab508c626 100644
> > --- a/drivers/gpu/drm/i915/display/intel_audio.c
> > +++ b/drivers/gpu/drm/i915/display/intel_audio.c
> > @@ -517,16 +517,16 @@ static void hsw_audio_codec_disable(struct intel_encoder *encoder,
> >  /* Add a factor to take care of rounding and truncations */
> >  #define ROUNDING_FACTOR 10000
> >  
> > -static unsigned int get_hblank_early_enable_config(struct intel_encoder *encoder,
> > -                                                  const struct intel_crtc_state *crtc_state)
> > +static unsigned int calc_hblank_early_prog(struct intel_encoder *encoder,
> > +                                          const struct intel_crtc_state *crtc_state)
> >  {
> >         struct drm_i915_private *i915 = to_i915(encoder->base.dev);
> >         unsigned int link_clks_available, link_clks_required;
> >         unsigned int tu_data, tu_line, link_clks_active;
> > -       unsigned int hblank_rise, hblank_early_prog;
> >         unsigned int h_active, h_total, hblank_delta, pixel_clk;
> >         unsigned int fec_coeff, cdclk, vdsc_bpp;
> >         unsigned int link_clk, lanes;
> > +       unsigned int hblank_rise;
> >  
> >         h_active = crtc_state->hw.adjusted_mode.crtc_hdisplay;
> >         h_total = crtc_state->hw.adjusted_mode.crtc_htotal;
> > @@ -542,44 +542,33 @@ static unsigned int get_hblank_early_enable_config(struct intel_encoder *encoder
> >                     "lanes = %u vdsc_bpp = %u cdclk = %u\n",
> >                     h_active, link_clk, lanes, vdsc_bpp, cdclk);
> >  
> > -       if (WARN_ON(!link_clk || !lanes || !vdsc_bpp || !cdclk))
> > +       if (WARN_ON(!link_clk || !pixel_clk || !lanes || !vdsc_bpp || !cdclk))
> >                 return 0;
> >  
> > -       link_clks_available = ((((h_total - h_active) *
> > -                              ((link_clk * ROUNDING_FACTOR) /
> > -                               pixel_clk)) / ROUNDING_FACTOR) - 28);
> > -       link_clks_required = DIV_ROUND_UP(192000, (1000 * pixel_clk / h_total)) * ((48 /
> > -                                         lanes) + 2);
> > +       link_clks_available = (h_total - h_active) * link_clk / pixel_clk - 28;
> > +       link_clks_required = DIV_ROUND_UP(192000 * h_total, 1000 * pixel_clk) * (48 / lanes + 2);
> 
> That's a relief.
> 
> >  
> >         if (link_clks_available > link_clks_required)
> >                 hblank_delta = 32;
> >         else
> > -               hblank_delta = DIV_ROUND_UP(((((5 * ROUNDING_FACTOR) /
> > -                                           link_clk) + ((5 *
> > -                                           ROUNDING_FACTOR) /
> > -                                           cdclk)) * pixel_clk),
> > -                                           ROUNDING_FACTOR);
> > +               hblank_delta = DIV64_U64_ROUND_UP(mul_u32_u32(5 * link_clk + 5 * cdclk, pixel_clk),
> > +                                                 mul_u32_u32(link_clk, cdclk));
> 
> 5 * mul_u32_u32(link_clk, cdclk, pixel_clk)

I presume you meant
5 * mul_u32_u32(link_clk + cdclk, pixel_clk)

mul_u32_u32(5 * (link_clk + cdclk), pixel_clk) 
would seem to be the cheaper option by avoiding
the 64bit multiply.

> 
> >  
> > -       tu_data = (pixel_clk * vdsc_bpp * 8) / ((link_clk *
> > -                  lanes * fec_coeff) / 1000000);
> > -       tu_line = (((h_active * link_clk * fec_coeff) /
> > -                  1000000) / (64 * pixel_clk));
> > +       tu_data = div64_u64(mul_u32_u32(pixel_clk * vdsc_bpp * 8, 1000000),
> > +                           mul_u32_u32(link_clk * lanes, fec_coeff));
> 
> That 1000000 is fec_scale.
> 
> > +       tu_line = div64_u64(h_active * mul_u32_u32(link_clk, fec_coeff),
> > +                           mul_u32_u32(64 * pixel_clk, 1000000));
> >         link_clks_active  = (tu_line - 1) * 64 + tu_data;
> 
> Transformations look correct.
> 
> > -       hblank_rise = ((link_clks_active + 6 * DIV_ROUND_UP(link_clks_active,
> > -                       250) + 4) * ((pixel_clk * ROUNDING_FACTOR) /
> > -                       link_clk)) / ROUNDING_FACTOR;
> > +       hblank_rise = (link_clks_active + 6 * DIV_ROUND_UP(link_clks_active, 250) + 4) * pixel_clk / link_clk;
> >  
> > -       hblank_early_prog = h_active - hblank_rise + hblank_delta;
> > -
> > -       return hblank_early_prog;
> > +       return h_active - hblank_rise + hblank_delta;
> 
> ROUNDING_FACTOR doesn't seem to do any rounding.
> 
> >  }
> >  
> > -static unsigned int get_sample_room_req_config(const struct intel_crtc_state *crtc_state)
> > +static unsigned int calc_samples_room(const struct intel_crtc_state *crtc_state)
> >  {
> >         unsigned int h_active, h_total, pixel_clk;
> >         unsigned int link_clk, lanes;
> > -       unsigned int samples_room;
> >  
> >         h_active = crtc_state->hw.adjusted_mode.hdisplay;
> >         h_total = crtc_state->hw.adjusted_mode.htotal;
> > @@ -587,12 +576,8 @@ static unsigned int get_sample_room_req_config(const struct intel_crtc_state *cr
> >         link_clk = crtc_state->port_clock;
> >         lanes = crtc_state->lane_count;
> >  
> > -       samples_room = ((((h_total - h_active) * ((link_clk *
> > -                       ROUNDING_FACTOR) / pixel_clk)) /
> > -                       ROUNDING_FACTOR) - 12) / ((48 /
> > -                       lanes) + 2);
> > -
> > -       return samples_room;
> > +       return ((h_total - h_active) * link_clk - 12 * pixel_clk) /
> > +               (pixel_clk * (48 / lanes + 2));
> 
> Expansion of fractions looks fine.
> 
> The maths looks to be the same, so
> Reviewed-by: Chris Wilson <chris at chris-wilson.co.uk>
> 
> Don't ask me about the meaning of it though.
> 
> One question might be if there are known inputs/outputs we can build
> into a unit test.

Yeah, that might be a good idea.

Also it looks like we could precompute all this and stuff
the result into the crtc state to get the state checker 
involved. Would also get rid of that ugly cdcdlk.hw usage.

-- 
Ville Syrjälä
Intel