[Intel-gfx] [PATCH v5 5/8] drm/i915/skl+: change WM calc to fixed point 16.16

Tue Nov 22 14:04:40 UTC 2016


On Tuesday 22 November 2016 06:12 PM, Paulo Zanoni wrote:
> Em Sex, 2016-11-18 às 20:39 +0530, Mahesh Kumar escreveu:
>> This patch changes Watermak calculation to fixed point calculation.
>> Problem with current calculation is during plane_blocks_per_line
>> calculation we divide intermediate blocks with min_scanlines and
>> takes floor of the result because of integer operation.
>> hence we end-up assigning less blocks than required. Which leads to
>> flickers.
>>
>> Changes since V1:
>>   - Add fixed point data type as per Paulo's review
>>
>> Signed-off-by: Mahesh Kumar <mahesh1.kumar at intel.com>
>> ---
>>   drivers/gpu/drm/i915/i915_drv.h | 84
>> +++++++++++++++++++++++++++++++++++++++++
>>   drivers/gpu/drm/i915/intel_pm.c | 70 ++++++++++++++++++++-----------
>> ---
>>   2 files changed, 125 insertions(+), 29 deletions(-)
> First of all, there's non-standard indentation all over the code. When
> a statement is broken into multiple lines, sometimes there's way too
> many tabs, sometimes tabs/spaces are missing.
looks like some editor setting went wrong, will fix it.
>
>> diff --git a/drivers/gpu/drm/i915/i915_drv.h
>> b/drivers/gpu/drm/i915/i915_drv.h
>> index 394d7ea..20691e9 100644
>> --- a/drivers/gpu/drm/i915/i915_drv.h
>> +++ b/drivers/gpu/drm/i915/i915_drv.h
>> @@ -119,6 +119,90 @@ bool __i915_inject_load_failure(const char
>> *func, int line);
>>   #define i915_inject_load_failure() \
>>   	__i915_inject_load_failure(__func__, __LINE__)
>>   
>> +typedef struct {
>> +	uint32_t val;
>> +} uint_fixed_16_16_t;
>> +
>> +#define FP_16_16_MAX ({ \
>> +		uint_fixed_16_16_t fp; \
>> +		fp.val = UINT_MAX; \
>> +		fp; \
>> +})
>> +
>> +static inline uint_fixed_16_16_t u32_to_fp_16_16(uint32_t val)
> In the type name the patch uses "fixed", but in some functions it uses
> "fp". I know that fp is supposed to help function names shorter, but
> IMHO we should stick with the same word all the time. My suggestion is
> to keep "fixed" everywhere, because the definition of "fp" is not
> obvious to everybody (fixed point? floating point? frame pointer?
> functional programming?).
>
>
>> +{
>> +	uint_fixed_16_16_t fp;
>> +
>> +	WARN_ON(val >> 16);
>> +
>> +	fp.val = val << 16;
>> +	return fp;
>> +}
>> +
>> +static inline uint32_t fp_16_16_to_u32_round_up(uint_fixed_16_16_t
>> fp)
>> +{
>> +	return DIV_ROUND_UP(fp.val, 1 << 16);
>> +}
>> +
>> +static inline uint32_t fp_16_16_to_u32(uint_fixed_16_16_t fp)
>> +{
>> +	return (fp.val / 1 << 16);
> This is dividing by 1 and then shifting. Doesn't look correct.

	It should have been (fp.val / (1 << 16))

>
>> +}
>> +
>> +static inline uint_fixed_16_16_t min_fp_16_16(uint_fixed_16_16_t
>> min1,
>> +						uint_fixed_16_16_t
>> min2)
>> +{
>> +	uint_fixed_16_16_t min;
>> +
>> +	min.val = min(min1.val, min2.val);
>> +	return min;
>> +}
>> +
>> +static inline uint_fixed_16_16_t max_fp_16_16(uint_fixed_16_16_t
>> max1,
>> +						uint_fixed_16_16_t
>> max2)
>> +{
>> +	uint_fixed_16_16_t max;
>> +
>> +	max.val = max(max1.val, max2.val);
>> +	return max;
>> +}
>> +
>> +static inline uint_fixed_16_16_t fp_16_16_div_round_up(uint32_t val,
>> uint32_t d)
>> +{
>> +	uint_fixed_16_16_t fp, res;
>> +
>> +	fp = u32_to_fp_16_16(val);
>> +	res.val = DIV_ROUND_UP(fp.val, d);
>> +	return res;
>> +}
>> +
>> +static inline uint_fixed_16_16_t fp_16_16_div_round_up_ull(uint64_t
>> val,
>> +								uint
>> 32_t d)
> I'd end the name with "u64" instead of "ull" since it's an uint64_t
> type, not unsigned long long.
>
>
>> +{
>> +	uint_fixed_16_16_t res;
>> +	uint64_t interm_val;
>> +
>> +	WARN_ON(val >> 48);
>> +	val <<= 16;
>> +	interm_val = DIV_ROUND_UP_ULL(val, d);
>> +	WARN_ON(interm_val >> 32);
>> +	res.val = (uint32_t) interm_val;
>> +
>> +	return res;
>> +}
>> +
>> +static inline uint_fixed_16_16_t mul_fp_16_16_u32(uint32_t val,
>> +						uint_fixed_16_16_t
>> mul)
>> +{
>> +	uint64_t intermediate_val;
>> +	uint_fixed_16_16_t fp;
>> +
>> +	intermediate_val = (uint64_t) val * mul.val;
>> +	WARN_ON(intermediate_val >> 32);
>> +	fp.val = (uint32_t) intermediate_val;
>> +	return fp;
>> +}
>> +
>>   static inline const char *yesno(bool v)
>>   {
>>   	return v ? "yes" : "no";
>> diff --git a/drivers/gpu/drm/i915/intel_pm.c
>> b/drivers/gpu/drm/i915/intel_pm.c
>> index d8090aa..7c788ac 100644
>> --- a/drivers/gpu/drm/i915/intel_pm.c
>> +++ b/drivers/gpu/drm/i915/intel_pm.c
>> @@ -3515,33 +3515,37 @@ skl_allocate_pipe_ddb(struct intel_crtc_state
>> *cstate,
>>    * for the read latency) and cpp should always be <= 8, so that
>>    * should allow pixel_rate up to ~2 GHz which seems sufficient since
>> max
>>    * 2xcdclk is 1350 MHz and the pixel rate should never exceed that.
>> + * Both Method1 & Method2 returns fixedpoint 16.16 output
> This line is not needed anymore: function signatures are saying it now.
>
>
>>   */
>> -static uint32_t skl_wm_method1(uint32_t pixel_rate, uint8_t cpp,
>> uint32_t latency)
>> +static uint_fixed_16_16_t skl_wm_method1(uint32_t pixel_rate,
>> uint8_t cpp,
>> +					 uint32_t latency)
>>   {
>> -	uint32_t wm_intermediate_val, ret;
>> +	uint64_t wm_intermediate_val;
> Why are we using uint64_t now? If we weren't overflowing before, we
> shouldn't be overflowing now, should we? Is this a separate bug fix?
no, it should still not overflow, it was just a precaution, will address 
this
thanks,

Regards,
-Mahesh
>
> Everything else looks good. Thanks for the improvements!
>
>
>> +	uint_fixed_16_16_t ret;
>>   
>>   	if (latency == 0)
>> -		return UINT_MAX;
>> -
>> -	wm_intermediate_val = latency * pixel_rate * cpp / 512;
>> -	ret = DIV_ROUND_UP(wm_intermediate_val, 1000);
>> +		return FP_16_16_MAX;
>>   
>> +	wm_intermediate_val = latency * pixel_rate * cpp;
>> +	ret = fp_16_16_div_round_up_ull(wm_intermediate_val, 1000 *
>> 512);
>>   	return ret;
>>   }
>>   
>> -static uint32_t skl_wm_method2(uint32_t pixel_rate, uint32_t
>> pipe_htotal,
>> -			       uint32_t latency, uint32_t
>> plane_blocks_per_line)
>> +static uint_fixed_16_16_t skl_wm_method2(uint32_t pixel_rate,
>> +			uint32_t pipe_htotal,
>> +			uint32_t latency,
>> +			uint_fixed_16_16_t plane_blocks_per_line)
>>   {
>> -	uint32_t ret;
>>   	uint32_t wm_intermediate_val;
>> +	uint_fixed_16_16_t ret;
>>   
>>   	if (latency == 0)
>> -		return UINT_MAX;
>> +		return FP_16_16_MAX;
>>   
>>   	wm_intermediate_val = latency * pixel_rate;
>> -	ret = DIV_ROUND_UP(wm_intermediate_val, pipe_htotal * 1000)
>> *
>> -				plane_blocks_per_line;
>> -
>> +	wm_intermediate_val = DIV_ROUND_UP(wm_intermediate_val,
>> +							pipe_htotal
>> * 1000);
>> +	ret = mul_fp_16_16_u32(wm_intermediate_val,
>> plane_blocks_per_line);
>>   	return ret;
>>   }
>>   
>> @@ -3581,14 +3585,17 @@ static int skl_compute_plane_wm(const struct
>> drm_i915_private *dev_priv,
>>   	struct drm_plane_state *pstate = &intel_pstate->base;
>>   	struct drm_framebuffer *fb = pstate->fb;
>>   	uint32_t latency = dev_priv->wm.skl_latency[level];
>> -	uint32_t method1, method2;
>> -	uint32_t plane_bytes_per_line, plane_blocks_per_line;
>> +	uint_fixed_16_16_t method1, method2;
>> +	uint_fixed_16_16_t plane_blocks_per_line;
>> +	uint_fixed_16_16_t selected_result;
>> +	uint32_t interm_pbpl;
>> +	uint32_t plane_bytes_per_line;
>>   	uint32_t res_blocks, res_lines;
>> -	uint32_t selected_result;
>>   	uint8_t cpp;
>>   	uint32_t width = 0, height = 0;
>>   	uint32_t plane_pixel_rate;
>> -	uint32_t y_tile_minimum, y_min_scanlines;
>> +	uint_fixed_16_16_t y_tile_minimum;
>> +	uint32_t y_min_scanlines;
>>   	struct intel_atomic_state *state =
>>   		to_intel_atomic_state(cstate->base.state);
>>   	bool apply_memory_bw_wa = skl_needs_memory_bw_wa(state);
>> @@ -3647,14 +3654,16 @@ static int skl_compute_plane_wm(const struct
>> drm_i915_private *dev_priv,
>>   
>>   	plane_bytes_per_line = width * cpp;
>>   	if (y_tiled) {
>> +		interm_pbpl = DIV_ROUND_UP(plane_bytes_per_line *
>> +						y_min_scanlines,
>> 512);
>>   		plane_blocks_per_line =
>> -		      DIV_ROUND_UP(plane_bytes_per_line *
>> y_min_scanlines, 512);
>> -		plane_blocks_per_line /= y_min_scanlines;
>> +		      fp_16_16_div_round_up(interm_pbpl,
>> y_min_scanlines);
>>   	} else if (x_tiled) {
>> -		plane_blocks_per_line =
>> DIV_ROUND_UP(plane_bytes_per_line, 512);
>> +		interm_pbpl = DIV_ROUND_UP(plane_bytes_per_line,
>> 512);
>> +		plane_blocks_per_line =
>> u32_to_fp_16_16(interm_pbpl);
>>   	} else {
>> -		plane_blocks_per_line =
>> DIV_ROUND_UP(plane_bytes_per_line, 512)
>> -					+ 1;
>> +		interm_pbpl = DIV_ROUND_UP(plane_bytes_per_line,
>> 512) + 1;
>> +		plane_blocks_per_line =
>> u32_to_fp_16_16(interm_pbpl);
>>   	}
>>   
>>   	method1 = skl_wm_method1(plane_pixel_rate, cpp, latency);
>> @@ -3663,26 +3672,29 @@ static int skl_compute_plane_wm(const struct
>> drm_i915_private *dev_priv,
>>   				 latency,
>>   				 plane_blocks_per_line);
>>   
>> -	y_tile_minimum = plane_blocks_per_line * y_min_scanlines;
>> +	y_tile_minimum = mul_fp_16_16_u32(y_min_scanlines,
>> +						plane_blocks_per_lin
>> e);
>>   
>>   	if (y_tiled) {
>> -		selected_result = max(method2, y_tile_minimum);
>> +		selected_result = max_fp_16_16(method2,
>> y_tile_minimum);
>>   	} else {
>>   		if ((cpp * cstate->base.adjusted_mode.crtc_htotal /
>> 512 < 1) &&
>>   		    (plane_bytes_per_line / 512 < 1))
>>   			selected_result = method2;
>> -		else if ((ddb_allocation / plane_blocks_per_line) >=
>> 1)
>> -			selected_result = min(method1, method2);
>> +		else if ((ddb_allocation /
>> +			fp_16_16_to_u32_round_up(plane_blocks_per_li
>> ne)) >= 1)
>> +			selected_result = min_fp_16_16(method1,
>> method2);
>> 		else
>>   			selected_result = method1;
>>   	}
>>   
>> -	res_blocks = selected_result + 1;
>> -	res_lines = DIV_ROUND_UP(selected_result,
>> plane_blocks_per_line);
>> +	res_blocks = fp_16_16_to_u32_round_up(selected_result) + 1;
>> +	res_lines = DIV_ROUND_UP(selected_result.val,
>> +					plane_blocks_per_line.val);
>>   
>>   	if (level >= 1 && level <= 7) {
>>   		if (y_tiled) {
>> -			res_blocks += y_tile_minimum;
>> +			res_blocks +=
>> fp_16_16_to_u32_round_up(y_tile_minimum);
>>   			res_lines += y_min_scanlines;
>>   		} else {
>>   			res_blocks++;