[PATCH v2 01/34] drm/amd/display: fix segment distribution for linear LUTs

Fri Sep 8 14:40:07 UTC 2023

On 2023-09-08 10:11, Melissa Wen wrote:
> On 09/06, Harry Wentland wrote:
>> On 2023-08-10 12:02, Melissa Wen wrote:
>>> From: Harry Wentland <harry.wentland at amd.com>
>>>
>>> The region and segment calculation was incapable of dealing
>>> with regions of more than 16 segments. We first fix this.
>>>
>>> Now that we can support regions up to 256 elements we can
>>> define a better segment distribution for near-linear LUTs
>>> for our maximum of 256 HW-supported points.
>>>
>>> With these changes an "identity" LUT looks visually
>>> indistinguishable from bypass and allows us to use
>>> our 3DLUT.
>>>
>>
>> Have you had a chance to test whether this patch makes a
>> difference? I haven't had the time yet.
> 
> Last time I tested there was a banding issue on plane shaper LUT PQ ->
> Display Native, but it seems I don't have this use case on tester
> anymore, so I wasn't able to double-check if the issue persist. Maybe
> Joshua can provide some inputs here.
> 
> Something I noticed is that shaper LUTs are the only 1D LUT on DCN30
> pipeline that uses cm_helper_translate_curve_to_hw_format(), all others
> (dpp-degamma/dpp-blend/mpc-regamma) call cm3_helper_translate_curve_*.
> 

Yeah, they use different codepaths, unfortunately. Might be nice if we
could make them use the same.

> We can drop it from this series until we get the steps to report the
> issue properly.
> 

Thanks. If you have concrete steps that show the issue (or even better,
an IGT test) I would be happy to include this.

Harry

> Melissa
> 
>>
>> Harry
>>
>>> Signed-off-by: Harry Wentland <harry.wentland at amd.com>
>>> Signed-off-by: Melissa Wen <mwen at igalia.com>
>>> ---
>>>   .../amd/display/dc/dcn10/dcn10_cm_common.c    | 93 +++++++++++++++----
>>>   1 file changed, 75 insertions(+), 18 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_cm_common.c b/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_cm_common.c
>>> index 3538973bd0c6..04b2e04b68f3 100644
>>> --- a/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_cm_common.c
>>> +++ b/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_cm_common.c
>>> @@ -349,20 +349,37 @@ bool cm_helper_translate_curve_to_hw_format(struct dc_context *ctx,
>>>   		 * segment is from 2^-10 to 2^1
>>>   		 * There are less than 256 points, for optimization
>>>   		 */
>>> -		seg_distr[0] = 3;
>>> -		seg_distr[1] = 4;
>>> -		seg_distr[2] = 4;
>>> -		seg_distr[3] = 4;
>>> -		seg_distr[4] = 4;
>>> -		seg_distr[5] = 4;
>>> -		seg_distr[6] = 4;
>>> -		seg_distr[7] = 4;
>>> -		seg_distr[8] = 4;
>>> -		seg_distr[9] = 4;
>>> -		seg_distr[10] = 1;
>>> +		if (output_tf->tf == TRANSFER_FUNCTION_LINEAR) {
>>> +			seg_distr[0] = 0; /* 2 */
>>> +			seg_distr[1] = 1; /* 4 */
>>> +			seg_distr[2] = 2; /* 4 */
>>> +			seg_distr[3] = 3; /* 8 */
>>> +			seg_distr[4] = 4; /* 16 */
>>> +			seg_distr[5] = 5; /* 32 */
>>> +			seg_distr[6] = 6; /* 64 */
>>> +			seg_distr[7] = 7; /* 128 */
>>> +
>>> +			region_start = -8;
>>> +			region_end = 1;
>>> +		} else {
>>> +			seg_distr[0] = 3; /* 8 */
>>> +			seg_distr[1] = 4; /* 16 */
>>> +			seg_distr[2] = 4;
>>> +			seg_distr[3] = 4;
>>> +			seg_distr[4] = 4;
>>> +			seg_distr[5] = 4;
>>> +			seg_distr[6] = 4;
>>> +			seg_distr[7] = 4;
>>> +			seg_distr[8] = 4;
>>> +			seg_distr[9] = 4;
>>> +			seg_distr[10] = 1; /* 2 */
>>> +			/* total = 8*16 + 8 + 64 + 2 = */
>>> +
>>> +			region_start = -10;
>>> +			region_end = 1;
>>> +		}
>>> +
>>>   
>>> -		region_start = -10;
>>> -		region_end = 1;
>>>   	}
>>>   
>>>   	for (i = region_end - region_start; i < MAX_REGIONS_NUMBER ; i++)
>>> @@ -375,16 +392,56 @@ bool cm_helper_translate_curve_to_hw_format(struct dc_context *ctx,
>>>   
>>>   	j = 0;
>>>   	for (k = 0; k < (region_end - region_start); k++) {
>>> -		increment = NUMBER_SW_SEGMENTS / (1 << seg_distr[k]);
>>> +		/*
>>> +		 * We're using an ugly-ish hack here. Our HW allows for
>>> +		 * 256 segments per region but SW_SEGMENTS is 16.
>>> +		 * SW_SEGMENTS has some undocumented relationship to
>>> +		 * the number of points in the tf_pts struct, which
>>> +		 * is 512, unlike what's suggested TRANSFER_FUNC_POINTS.
>>> +		 *
>>> +		 * In order to work past this dilemma we'll scale our
>>> +		 * increment by (1 << 4) and then do the inverse (1 >> 4)
>>> +		 * when accessing the elements in tf_pts.
>>> +		 *
>>> +		 * TODO: find a better way using SW_SEGMENTS and
>>> +		 *       TRANSFER_FUNC_POINTS definitions
>>> +		 */
>>> +		increment = (NUMBER_SW_SEGMENTS << 4) / (1 << seg_distr[k]);
>>>   		start_index = (region_start + k + MAX_LOW_POINT) *
>>>   				NUMBER_SW_SEGMENTS;
>>> -		for (i = start_index; i < start_index + NUMBER_SW_SEGMENTS;
>>> +		for (i = (start_index << 4); i < (start_index << 4) + (NUMBER_SW_SEGMENTS << 4);
>>>   				i += increment) {
>>> +			struct fixed31_32 in_plus_one, in;
>>> +			struct fixed31_32 value, red_value, green_value, blue_value;
>>> +			uint32_t t = i & 0xf;
>>> +
>>>   			if (j == hw_points - 1)
>>>   				break;
>>> -			rgb_resulted[j].red = output_tf->tf_pts.red[i];
>>> -			rgb_resulted[j].green = output_tf->tf_pts.green[i];
>>> -			rgb_resulted[j].blue = output_tf->tf_pts.blue[i];
>>> +
>>> +			in_plus_one = output_tf->tf_pts.red[(i >> 4) + 1];
>>> +			in = output_tf->tf_pts.red[i >> 4];
>>> +			value = dc_fixpt_sub(in_plus_one, in);
>>> +			value = dc_fixpt_shr(dc_fixpt_mul_int(value, t),  4);
>>> +			value = dc_fixpt_add(in, value);
>>> +			red_value = value;
>>> +
>>> +			in_plus_one = output_tf->tf_pts.green[(i >> 4) + 1];
>>> +			in = output_tf->tf_pts.green[i >> 4];
>>> +			value = dc_fixpt_sub(in_plus_one, in);
>>> +			value = dc_fixpt_shr(dc_fixpt_mul_int(value, t),  4);
>>> +			value = dc_fixpt_add(in, value);
>>> +			green_value = value;
>>> +
>>> +			in_plus_one = output_tf->tf_pts.blue[(i >> 4) + 1];
>>> +			in = output_tf->tf_pts.blue[i >> 4];
>>> +			value = dc_fixpt_sub(in_plus_one, in);
>>> +			value = dc_fixpt_shr(dc_fixpt_mul_int(value, t),  4);
>>> +			value = dc_fixpt_add(in, value);
>>> +			blue_value = value;
>>> +
>>> +			rgb_resulted[j].red = red_value;
>>> +			rgb_resulted[j].green = green_value;
>>> +			rgb_resulted[j].blue = blue_value;
>>>   			j++;
>>>   		}
>>>   	}
>>