[Intel-gfx] [PATCH v2 4/4] drm/i915/icl: Add Multi-segmented gamma support

Mon May 6 12:55:19 UTC 2019

On 5/6/2019 5:55 PM, Ville Syrjälä wrote:
> On Mon, May 06, 2019 at 04:09:33PM +0530, Sharma, Shashank wrote:
>> Regards
>>
>> Shashank
>>
>> On 5/3/2019 9:20 PM, Ville Syrjälä wrote:
>>> On Tue, Apr 30, 2019 at 08:51:08PM +0530, Shashank Sharma wrote:
>>>> ICL introduces a new gamma correction mode in display engine, called
>>>> multi-segmented-gamma mode. This mode allows users to program the
>>>> darker region of the gamma curve with sueprfine precision. An
>>>> example use case for this is HDR curves (like PQ ST-2084).
>>>>
>>>> If we plot a gamma correction curve from value range between 0.0 to 1.0,
>>>> ICL's multi-segment has 3 different sections:
>>>> - superfine segment: 9 values, ranges between 0 - 1/(128 * 256)
>>>> - fine segment: 257 values, ranges between 0 - 1/(128)
>>>> - corase segment: 257 values, ranges between 0 - 1
>>>>
>>>> This patch:
>>>> - Changes gamma LUTs size for ICL/GEN11 to 262144 entries (8 * 128 * 256),
>>>>     so that userspace can program with highest precision supported.
>>>> - Changes default gamma mode (non-legacy) to multi-segmented-gamma mode.
>>>> - Adds functions to program/detect multi-segment gamma.
>>>>
>>>> V2: Addressed review comments from Ville
>>>>       - separate function for superfine and fine segments.
>>>>       - remove enum for segments.
>>>>       - reuse last entry of the LUT as gc_max value.
>>>>       - replace if() ....cond with switch...case in icl_load_luts.
>>>>       - add an entry variable, instead of 'word'
>>>>
>>>> Cc: Ville Syrjälä <ville.syrjala at linux.intel.com>
>>>> Cc: Maarten Lankhorst <maarten.lankhorst at linux.intel.com>
>>>> Cc: Daniel Vetter <daniel.vetter at ffwll.ch>
>>>>
>>>> Suggested-by: Ville Syrjälä <ville.syrjala at linux.intel.com>
>>>> Signed-off-by: Shashank Sharma <shashank.sharma at intel.com>
>>>> Signed-off-by: Uma Shankar <uma.shankar at intel.com>
>>>> ---
>>>>    drivers/gpu/drm/i915/i915_pci.c    |   3 +-
>>>>    drivers/gpu/drm/i915/intel_color.c | 125 ++++++++++++++++++++++++++++-
>>>>    2 files changed, 123 insertions(+), 5 deletions(-)
>>>>
>>>> diff --git a/drivers/gpu/drm/i915/i915_pci.c b/drivers/gpu/drm/i915/i915_pci.c
>>>> index ffa2ee70a03d..83698951760b 100644
>>>> --- a/drivers/gpu/drm/i915/i915_pci.c
>>>> +++ b/drivers/gpu/drm/i915/i915_pci.c
>>>> @@ -749,7 +749,8 @@ static const struct intel_device_info intel_cannonlake_info = {
>>>>    	GEN(11), \
>>>>    	.ddb_size = 2048, \
>>>>    	.has_logical_ring_elsq = 1, \
>>>> -	.color = { .degamma_lut_size = 33, .gamma_lut_size = 1024 }
>>>> +	.color = { .degamma_lut_size = 33, .gamma_lut_size = 262144 }
>>> Ugh. Thats one big LUT. But looks correct.
>>>
>>>> +
>>> Bogus newline.
>> Got it.
>>>>    
>>>>    static const struct intel_device_info intel_icelake_11_info = {
>>>>    	GEN11_FEATURES,
>>>> diff --git a/drivers/gpu/drm/i915/intel_color.c b/drivers/gpu/drm/i915/intel_color.c
>>>> index 6c341bea514c..49831e8d02fb 100644
>>>> --- a/drivers/gpu/drm/i915/intel_color.c
>>>> +++ b/drivers/gpu/drm/i915/intel_color.c
>>>> @@ -41,6 +41,9 @@
>>>>    #define CTM_COEFF_ABS(coeff)		((coeff) & (CTM_COEFF_SIGN - 1))
>>>>    
>>>>    #define LEGACY_LUT_LENGTH		256
>>>> +#define ICL_GAMMA_MULTISEG_LUT_LENGTH		(256 * 128 * 8)
>>>> +#define ICL_GAMMA_SUPERFINE_SEG_LENGTH	9
>>>> +
>>>>    /*
>>>>     * Extract the CSC coefficient from a CTM coefficient (in U32.32 fixed point
>>>>     * format). This macro takes the coefficient we want transformed and the
>>>> @@ -767,6 +770,113 @@ static void glk_load_luts(const struct intel_crtc_state *crtc_state)
>>>>    	}
>>>>    }
>>>>    
>>>> +/* ilk+ "12.4" interpolated format (high 10 bits) */
>>>> +static u32 ilk_lut_12p4_ldw(const struct drm_color_lut *color)
>>>> +{
>>>> +	return (color->red >> 6) << 20 | (color->green >> 6) << 10 |
>>>> +		(color->blue >> 6);
>>>> +}
>>>> +
>>>> +/* ilk+ "12.4" interpolated format (low 6 bits) */
>>>> +static u32 ilk_lut_12p4_udw(const struct drm_color_lut *color)
>>>> +{
>>>> +	return (color->red & 0x3f) << 24 | (color->green & 0x3f) << 14 |
>>>> +		(color->blue & 0x3f);
>>>> +}
>>>> +
>>>> +static void
>>>> +icl_load_gcmax(const struct intel_crtc_state *crtc_state,
>>>> +			const struct drm_color_lut *entry)
>>> Indentation looks off. Also s/entry/color/ to match the other similarish
>>> funcs maybe?
>> Sure.
>>>> +{
>>>> +	struct intel_crtc *crtc = to_intel_crtc(crtc_state->base.crtc);
>>>> +	struct drm_i915_private *dev_priv = to_i915(crtc->base.dev);
>>>> +	enum pipe pipe = crtc->pipe;
>>>> +
>>>> +	/* Fixme: LUT entries are 16 bit only, so we can prog 0xFFFF max */
>>>> +	I915_WRITE(PREC_PAL_GC_MAX(pipe, 0), entry->red);
>>>> +	I915_WRITE(PREC_PAL_GC_MAX(pipe, 1), entry->green);
>>>> +	I915_WRITE(PREC_PAL_GC_MAX(pipe, 2), entry->blue);
>>>> +}
>>>> +
>>>> +static void
>>>> +icl_program_gamma_superfine_segment(const struct intel_crtc_state *crtc_state)
>>>> +{
>>>> +	struct intel_crtc *crtc = to_intel_crtc(crtc_state->base.crtc);
>>>> +	struct drm_i915_private *dev_priv = to_i915(crtc->base.dev);
>>>> +	const struct drm_property_blob *blob = crtc_state->base.gamma_lut;
>>>> +	const struct drm_color_lut *lut = blob->data;
>>>> +	enum pipe pipe = crtc->pipe;
>>>> +	u32 i;
>>>> +
>>>> +	if (!lut || drm_color_lut_size(blob) < ICL_GAMMA_SUPERFINE_SEG_LENGTH)
>>>> +		return;
>>> These checks aren't needed. Just dead code.
>> Will remove this and similars.
>>>> +
>>>> +	/*
>>>> +	 * Every entry in the multi-segment LUT is corresponding to a superfine
>>>> +	 * segment step which is 1/(8 * 128 * 256).
>>>> +	 *
>>>> +	 * Superfine segment has 9 entries, corresponding to values
>>>> +	 * 0, 1/(8 * 128 * 256), 2/(8 * 128 * 256) .... 8/(8 * 128 * 256).
>>>> +	 */
>>>> +	I915_WRITE(PREC_PAL_MULTI_SEG_INDEX(pipe), PAL_PREC_AUTO_INCREMENT);
>>>> +
>>>> +	for (i = 0; i < 9; i++) {
>>>> +		const struct drm_color_lut *entry = &lut[i];
>>>> +
>>>> +		I915_WRITE(PREC_PAL_MULTI_SEG_DATA(pipe),
>>>> +			   ilk_lut_12p4_udw(entry));
>>> ldw should come before udw.
>> Got it.
>>>> +		I915_WRITE(PREC_PAL_MULTI_SEG_DATA(pipe),
>>>> +			   ilk_lut_12p4_ldw(entry));
>>>> +	}
>>>> +}
>>>> +
>>>> +static void
>>>> +icl_program_gamma_multi_segment(const struct intel_crtc_state *crtc_state)
>>>> +{
>>>> +	struct intel_crtc *crtc = to_intel_crtc(crtc_state->base.crtc);
>>>> +	struct drm_i915_private *dev_priv = to_i915(crtc->base.dev);
>>>> +	const struct drm_property_blob *blob = crtc_state->base.gamma_lut;
>>>> +	const struct drm_color_lut *lut = blob->data;
>>>> +	const struct drm_color_lut *entry;
>>> 'entry' declaration can be moved into the loops.
>> Its being used in multiple loops,
> IMO still better to move into the loops. We don't want to pass any
> information between the loops.
>
>> also being used for GCMax outside the
>> loop.
> Hmm. That might be an arguemnt for keeping it out. But the current
> gcmax usage looks broken. You're programming the same value into
> the last PAL_PREC index and GCMAX.

Isn't this what you wanted ? IIRC, your recommendation was to program 
the highest values user wants to program in GCMAX reg. We also had a 
discussion on how user cant program 1.0, as we have LUT depth on 16 bit 
only, and we decided to use the last value of the LUT as GCMAX. Did I 
misunderstand anything there ?

- Shashank

>>>> +	enum pipe pipe = crtc->pipe;
>>>> +	u32 i;
>>>> +
>>>> +	if (!lut || (drm_color_lut_size(blob) < ICL_GAMMA_MULTISEG_LUT_LENGTH))
>>>> +		return;
>>> More checks that aren't needed.
>> Got it.
>>>> +
>>>> +	/*
>>>> +	 * Every entry in the multi-segment LUT is corresponding to a superfine
>>>> +	 * segment step which is 1/(8*128*256).
>>>> +	 *
>>>> +	 * Fine segment's step is 1/(128 * 256) ie 1/(128 * 256),  2/(128*256)
>>>> +	 * ... 256/(128*256). So in order to program fine segment of LUT we
>>>> +	 * need to pick every 8'th entry in LUT, and program 256 indexes.
>>>> +	 * Fine segment's index 0 is programmed in HW, and it starts from
>>>> +	 * index 1.
>>> The wording here is a bit confusing. I guess the problem is what to call
>>> things. PAL_PREC_INDEX[0/1] is what we program, but that maps to the point
>>> seg2[1] with seg2[0] being unused by the hw. Well, the spec says it's
>>> implicit but IIRC I was told long ago that it's not actually used.
>>>
>>> Not sure how to word that in the best way. Maybe something like?
>>>
>>> /*
>>>    * Fine segment (seg2) ...
>>>    *
>>>    * PAL_PREC_INDEX[0] and PAL_PREC_INDEX[1] map to seg2[1],
>>>    * with seg2[0] being unused by the hardware.
>>>    */
>>>
>>> Not sure that's any clearer.
>> Ok, will try to come up with something in similar lines.
>>>> +	 */
>>>> +	I915_WRITE(PREC_PAL_INDEX(pipe), PAL_PREC_AUTO_INCREMENT);
>>>> +	for (i = 1; i < 257; i++) {
>>>> +		entry = &lut[i * 8];
>>>> +		I915_WRITE(PREC_PAL_DATA(pipe), ilk_lut_12p4_udw(entry));
>>>> +		I915_WRITE(PREC_PAL_DATA(pipe), ilk_lut_12p4_ldw(entry));
>>>> +	}
>>>> +
>>>> +	/*
>>>> +	 * Coarse segment's starts from index 0 and it's step is 1/256 ie 0,
>>>> +	 * 1/256, 2/256 ...256/256. As per the description of each entry in LUT
>>>> +	 * above, we need to pick every 8 * 128 = 1024th entry in LUT, and
>>>> +	 * program 256 of those.
>>>> +	 */
>>> Could make a note here stating that seg3[0] and seg3[1] are also unused
>>> by the hardware, even though we have to program them to advance the
>>> index. I don't see it mentioned in the spec, but this one I definitely
>>> remember confirming from Art way back when. However I never verified
>>> that on actual hw. We could also consider just programming those two
>>> entries to 0 and start the actual coarse segment programming from index 2.
>>> Or we could skip them by reprogramming the index directly.
>> If they are not being used, does it matter what and if we program into
>> them ? We can add a note though, mentioning this.
> It shouldn't matter what we programing into them. But as mentioned I
> never actually confirmed this on actual hardware. Would be nice to
> double check that so we don't end up with incorrect comment.
>
>>>> +	for (i = 0; i < 256; i++) {
>>>> +		entry = &lut[i * 1024];
>>> s/1024/8 * 128/ maybe?
>> Sure.
>>
>> Regards
>>
>> Shashank
>>
>>>> +		I915_WRITE(PREC_PAL_DATA(pipe), ilk_lut_12p4_udw(entry));
>>>> +		I915_WRITE(PREC_PAL_DATA(pipe), ilk_lut_12p4_ldw(entry));
>>>> +	}
>>>> +
>>>> +	icl_load_gcmax(crtc_state, entry);
>>>> +	ivb_load_lut_ext_max(crtc);
>>>> +}
>>>> +
>>>>    static void icl_load_luts(const struct intel_crtc_state *crtc_state)
>>>>    {
>>>>    	const struct drm_property_blob *gamma_lut = crtc_state->base.gamma_lut;
>>>> @@ -775,10 +885,17 @@ static void icl_load_luts(const struct intel_crtc_state *crtc_state)
>>>>    	if (crtc_state->base.degamma_lut)
>>>>    		glk_load_degamma_lut(crtc_state);
>>>>    
>>>> -	if ((crtc_state->gamma_mode & GAMMA_MODE_MODE_MASK) ==
>>>> -	    GAMMA_MODE_MODE_8BIT) {
>>>> +	switch (crtc_state->gamma_mode & GAMMA_MODE_MODE_MASK) {
>>>> +	case GAMMA_MODE_MODE_8BIT:
>>>>    		i9xx_load_luts(crtc_state);
>>>> -	} else {
>>>> +		break;
>>>> +
>>>> +	case GAMMA_MODE_MODE_12BIT_MULTI_SEGMENTED:
>>>> +		icl_program_gamma_superfine_segment(crtc_state);
>>>> +		icl_program_gamma_multi_segment(crtc_state);
>>>> +		break;
>>>> +
>>>> +	default:
>>>>    		bdw_load_lut_10(crtc, gamma_lut, PAL_PREC_INDEX_VALUE(0));
>>>>    		ivb_load_lut_ext_max(crtc);
>>>>    	}
>>>> @@ -1209,7 +1326,7 @@ static u32 icl_gamma_mode(const struct intel_crtc_state *crtc_state)
>>>>    	    crtc_state_is_legacy_gamma(crtc_state))
>>>>    		gamma_mode |= GAMMA_MODE_MODE_8BIT;
>>>>    	else
>>>> -		gamma_mode |= GAMMA_MODE_MODE_10BIT;
>>>> +		gamma_mode |= GAMMA_MODE_MODE_12BIT_MULTI_SEGMENTED;
>>>>    
>>>>    	return gamma_mode;
>>>>    }
>>>> -- 
>>>> 2.17.1