[Intel-gfx] [PATCH v5 5/8] drm/i915/skl+: change WM calc to fixed point 16.16
Mahesh Kumar
mahesh1.kumar at intel.com
Tue Nov 22 14:04:40 UTC 2016
On Tuesday 22 November 2016 06:12 PM, Paulo Zanoni wrote:
> Em Sex, 2016-11-18 às 20:39 +0530, Mahesh Kumar escreveu:
>> This patch changes Watermak calculation to fixed point calculation.
>> Problem with current calculation is during plane_blocks_per_line
>> calculation we divide intermediate blocks with min_scanlines and
>> takes floor of the result because of integer operation.
>> hence we end-up assigning less blocks than required. Which leads to
>> flickers.
>>
>> Changes since V1:
>> - Add fixed point data type as per Paulo's review
>>
>> Signed-off-by: Mahesh Kumar <mahesh1.kumar at intel.com>
>> ---
>> drivers/gpu/drm/i915/i915_drv.h | 84
>> +++++++++++++++++++++++++++++++++++++++++
>> drivers/gpu/drm/i915/intel_pm.c | 70 ++++++++++++++++++++-----------
>> ---
>> 2 files changed, 125 insertions(+), 29 deletions(-)
> First of all, there's non-standard indentation all over the code. When
> a statement is broken into multiple lines, sometimes there's way too
> many tabs, sometimes tabs/spaces are missing.
looks like some editor setting went wrong, will fix it.
>
>> diff --git a/drivers/gpu/drm/i915/i915_drv.h
>> b/drivers/gpu/drm/i915/i915_drv.h
>> index 394d7ea..20691e9 100644
>> --- a/drivers/gpu/drm/i915/i915_drv.h
>> +++ b/drivers/gpu/drm/i915/i915_drv.h
>> @@ -119,6 +119,90 @@ bool __i915_inject_load_failure(const char
>> *func, int line);
>> #define i915_inject_load_failure() \
>> __i915_inject_load_failure(__func__, __LINE__)
>>
>> +typedef struct {
>> + uint32_t val;
>> +} uint_fixed_16_16_t;
>> +
>> +#define FP_16_16_MAX ({ \
>> + uint_fixed_16_16_t fp; \
>> + fp.val = UINT_MAX; \
>> + fp; \
>> +})
>> +
>> +static inline uint_fixed_16_16_t u32_to_fp_16_16(uint32_t val)
> In the type name the patch uses "fixed", but in some functions it uses
> "fp". I know that fp is supposed to help function names shorter, but
> IMHO we should stick with the same word all the time. My suggestion is
> to keep "fixed" everywhere, because the definition of "fp" is not
> obvious to everybody (fixed point? floating point? frame pointer?
> functional programming?).
>
>
>> +{
>> + uint_fixed_16_16_t fp;
>> +
>> + WARN_ON(val >> 16);
>> +
>> + fp.val = val << 16;
>> + return fp;
>> +}
>> +
>> +static inline uint32_t fp_16_16_to_u32_round_up(uint_fixed_16_16_t
>> fp)
>> +{
>> + return DIV_ROUND_UP(fp.val, 1 << 16);
>> +}
>> +
>> +static inline uint32_t fp_16_16_to_u32(uint_fixed_16_16_t fp)
>> +{
>> + return (fp.val / 1 << 16);
> This is dividing by 1 and then shifting. Doesn't look correct.
It should have been (fp.val / (1 << 16))
>
>> +}
>> +
>> +static inline uint_fixed_16_16_t min_fp_16_16(uint_fixed_16_16_t
>> min1,
>> + uint_fixed_16_16_t
>> min2)
>> +{
>> + uint_fixed_16_16_t min;
>> +
>> + min.val = min(min1.val, min2.val);
>> + return min;
>> +}
>> +
>> +static inline uint_fixed_16_16_t max_fp_16_16(uint_fixed_16_16_t
>> max1,
>> + uint_fixed_16_16_t
>> max2)
>> +{
>> + uint_fixed_16_16_t max;
>> +
>> + max.val = max(max1.val, max2.val);
>> + return max;
>> +}
>> +
>> +static inline uint_fixed_16_16_t fp_16_16_div_round_up(uint32_t val,
>> uint32_t d)
>> +{
>> + uint_fixed_16_16_t fp, res;
>> +
>> + fp = u32_to_fp_16_16(val);
>> + res.val = DIV_ROUND_UP(fp.val, d);
>> + return res;
>> +}
>> +
>> +static inline uint_fixed_16_16_t fp_16_16_div_round_up_ull(uint64_t
>> val,
>> + uint
>> 32_t d)
> I'd end the name with "u64" instead of "ull" since it's an uint64_t
> type, not unsigned long long.
>
>
>> +{
>> + uint_fixed_16_16_t res;
>> + uint64_t interm_val;
>> +
>> + WARN_ON(val >> 48);
>> + val <<= 16;
>> + interm_val = DIV_ROUND_UP_ULL(val, d);
>> + WARN_ON(interm_val >> 32);
>> + res.val = (uint32_t) interm_val;
>> +
>> + return res;
>> +}
>> +
>> +static inline uint_fixed_16_16_t mul_fp_16_16_u32(uint32_t val,
>> + uint_fixed_16_16_t
>> mul)
>> +{
>> + uint64_t intermediate_val;
>> + uint_fixed_16_16_t fp;
>> +
>> + intermediate_val = (uint64_t) val * mul.val;
>> + WARN_ON(intermediate_val >> 32);
>> + fp.val = (uint32_t) intermediate_val;
>> + return fp;
>> +}
>> +
>> static inline const char *yesno(bool v)
>> {
>> return v ? "yes" : "no";
>> diff --git a/drivers/gpu/drm/i915/intel_pm.c
>> b/drivers/gpu/drm/i915/intel_pm.c
>> index d8090aa..7c788ac 100644
>> --- a/drivers/gpu/drm/i915/intel_pm.c
>> +++ b/drivers/gpu/drm/i915/intel_pm.c
>> @@ -3515,33 +3515,37 @@ skl_allocate_pipe_ddb(struct intel_crtc_state
>> *cstate,
>> * for the read latency) and cpp should always be <= 8, so that
>> * should allow pixel_rate up to ~2 GHz which seems sufficient since
>> max
>> * 2xcdclk is 1350 MHz and the pixel rate should never exceed that.
>> + * Both Method1 & Method2 returns fixedpoint 16.16 output
> This line is not needed anymore: function signatures are saying it now.
>
>
>> */
>> -static uint32_t skl_wm_method1(uint32_t pixel_rate, uint8_t cpp,
>> uint32_t latency)
>> +static uint_fixed_16_16_t skl_wm_method1(uint32_t pixel_rate,
>> uint8_t cpp,
>> + uint32_t latency)
>> {
>> - uint32_t wm_intermediate_val, ret;
>> + uint64_t wm_intermediate_val;
> Why are we using uint64_t now? If we weren't overflowing before, we
> shouldn't be overflowing now, should we? Is this a separate bug fix?
no, it should still not overflow, it was just a precaution, will address
this
thanks,
Regards,
-Mahesh
>
> Everything else looks good. Thanks for the improvements!
>
>
>> + uint_fixed_16_16_t ret;
>>
>> if (latency == 0)
>> - return UINT_MAX;
>> -
>> - wm_intermediate_val = latency * pixel_rate * cpp / 512;
>> - ret = DIV_ROUND_UP(wm_intermediate_val, 1000);
>> + return FP_16_16_MAX;
>>
>> + wm_intermediate_val = latency * pixel_rate * cpp;
>> + ret = fp_16_16_div_round_up_ull(wm_intermediate_val, 1000 *
>> 512);
>> return ret;
>> }
>>
>> -static uint32_t skl_wm_method2(uint32_t pixel_rate, uint32_t
>> pipe_htotal,
>> - uint32_t latency, uint32_t
>> plane_blocks_per_line)
>> +static uint_fixed_16_16_t skl_wm_method2(uint32_t pixel_rate,
>> + uint32_t pipe_htotal,
>> + uint32_t latency,
>> + uint_fixed_16_16_t plane_blocks_per_line)
>> {
>> - uint32_t ret;
>> uint32_t wm_intermediate_val;
>> + uint_fixed_16_16_t ret;
>>
>> if (latency == 0)
>> - return UINT_MAX;
>> + return FP_16_16_MAX;
>>
>> wm_intermediate_val = latency * pixel_rate;
>> - ret = DIV_ROUND_UP(wm_intermediate_val, pipe_htotal * 1000)
>> *
>> - plane_blocks_per_line;
>> -
>> + wm_intermediate_val = DIV_ROUND_UP(wm_intermediate_val,
>> + pipe_htotal
>> * 1000);
>> + ret = mul_fp_16_16_u32(wm_intermediate_val,
>> plane_blocks_per_line);
>> return ret;
>> }
>>
>> @@ -3581,14 +3585,17 @@ static int skl_compute_plane_wm(const struct
>> drm_i915_private *dev_priv,
>> struct drm_plane_state *pstate = &intel_pstate->base;
>> struct drm_framebuffer *fb = pstate->fb;
>> uint32_t latency = dev_priv->wm.skl_latency[level];
>> - uint32_t method1, method2;
>> - uint32_t plane_bytes_per_line, plane_blocks_per_line;
>> + uint_fixed_16_16_t method1, method2;
>> + uint_fixed_16_16_t plane_blocks_per_line;
>> + uint_fixed_16_16_t selected_result;
>> + uint32_t interm_pbpl;
>> + uint32_t plane_bytes_per_line;
>> uint32_t res_blocks, res_lines;
>> - uint32_t selected_result;
>> uint8_t cpp;
>> uint32_t width = 0, height = 0;
>> uint32_t plane_pixel_rate;
>> - uint32_t y_tile_minimum, y_min_scanlines;
>> + uint_fixed_16_16_t y_tile_minimum;
>> + uint32_t y_min_scanlines;
>> struct intel_atomic_state *state =
>> to_intel_atomic_state(cstate->base.state);
>> bool apply_memory_bw_wa = skl_needs_memory_bw_wa(state);
>> @@ -3647,14 +3654,16 @@ static int skl_compute_plane_wm(const struct
>> drm_i915_private *dev_priv,
>>
>> plane_bytes_per_line = width * cpp;
>> if (y_tiled) {
>> + interm_pbpl = DIV_ROUND_UP(plane_bytes_per_line *
>> + y_min_scanlines,
>> 512);
>> plane_blocks_per_line =
>> - DIV_ROUND_UP(plane_bytes_per_line *
>> y_min_scanlines, 512);
>> - plane_blocks_per_line /= y_min_scanlines;
>> + fp_16_16_div_round_up(interm_pbpl,
>> y_min_scanlines);
>> } else if (x_tiled) {
>> - plane_blocks_per_line =
>> DIV_ROUND_UP(plane_bytes_per_line, 512);
>> + interm_pbpl = DIV_ROUND_UP(plane_bytes_per_line,
>> 512);
>> + plane_blocks_per_line =
>> u32_to_fp_16_16(interm_pbpl);
>> } else {
>> - plane_blocks_per_line =
>> DIV_ROUND_UP(plane_bytes_per_line, 512)
>> - + 1;
>> + interm_pbpl = DIV_ROUND_UP(plane_bytes_per_line,
>> 512) + 1;
>> + plane_blocks_per_line =
>> u32_to_fp_16_16(interm_pbpl);
>> }
>>
>> method1 = skl_wm_method1(plane_pixel_rate, cpp, latency);
>> @@ -3663,26 +3672,29 @@ static int skl_compute_plane_wm(const struct
>> drm_i915_private *dev_priv,
>> latency,
>> plane_blocks_per_line);
>>
>> - y_tile_minimum = plane_blocks_per_line * y_min_scanlines;
>> + y_tile_minimum = mul_fp_16_16_u32(y_min_scanlines,
>> + plane_blocks_per_lin
>> e);
>>
>> if (y_tiled) {
>> - selected_result = max(method2, y_tile_minimum);
>> + selected_result = max_fp_16_16(method2,
>> y_tile_minimum);
>> } else {
>> if ((cpp * cstate->base.adjusted_mode.crtc_htotal /
>> 512 < 1) &&
>> (plane_bytes_per_line / 512 < 1))
>> selected_result = method2;
>> - else if ((ddb_allocation / plane_blocks_per_line) >=
>> 1)
>> - selected_result = min(method1, method2);
>> + else if ((ddb_allocation /
>> + fp_16_16_to_u32_round_up(plane_blocks_per_li
>> ne)) >= 1)
>> + selected_result = min_fp_16_16(method1,
>> method2);
>> else
>> selected_result = method1;
>> }
>>
>> - res_blocks = selected_result + 1;
>> - res_lines = DIV_ROUND_UP(selected_result,
>> plane_blocks_per_line);
>> + res_blocks = fp_16_16_to_u32_round_up(selected_result) + 1;
>> + res_lines = DIV_ROUND_UP(selected_result.val,
>> + plane_blocks_per_line.val);
>>
>> if (level >= 1 && level <= 7) {
>> if (y_tiled) {
>> - res_blocks += y_tile_minimum;
>> + res_blocks +=
>> fp_16_16_to_u32_round_up(y_tile_minimum);
>> res_lines += y_min_scanlines;
>> } else {
>> res_blocks++;
More information about the Intel-gfx
mailing list