[Intel-gfx] [RFC] tests/pm_sseu: Add subtest to verify UMD can configure render powerclock state

Wed May 3 09:43:08 UTC 2017

On 05/03/2017 08:59 AM, Chris Wilson wrote:
> On Tue, May 02, 2017 at 03:08:27PM +0000, Oscar Mateo wrote:
>> Cc: Dmitry Rogozhkin <dmitry.v.rogozhkin at intel.com>
>> Cc: Chris Wilson <chris at chris-wilson.co.uk>
>> Signed-off-by: Oscar Mateo <oscar.mateo at intel.com>
>> ---
>>   tests/pm_sseu.c | 105 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
>>   1 file changed, 105 insertions(+)
>>
>> diff --git a/tests/pm_sseu.c b/tests/pm_sseu.c
>> index 7d4b33c..1fb36c5 100644
>> --- a/tests/pm_sseu.c
>> +++ b/tests/pm_sseu.c
>> @@ -352,6 +352,108 @@ full_enable(void)
>>   	check_full_enable(&stat);
>>   }
>>   
>> +#define GEN8_R_PWR_CLK_STATE	(0x20C8)
>> +#define   GEN8_RPCS_ENABLE		(1 << 31)
>> +
>> +#define MI_STORE_REGISTER_MEM_64_BIT_ADDR	((0x24 << 23) | 2)
>> +
>> +static uint32_t read_pwrclk_state(drm_intel_bufmgr *bufmgr,
>> +				  struct intel_batchbuffer *batch,
>> +				  drm_intel_context *context)
>> +{
>> +	uint32_t rpcs_config;
>> +	uint32_t *data;
>> +	drm_intel_bo *dst_bo;
>> +
>> +	dst_bo = drm_intel_bo_alloc(bufmgr, "dst", 4, 4096);
>> +
>> +	BEGIN_BATCH(3, 1);
>> +	OUT_BATCH(MI_STORE_REGISTER_MEM_64_BIT_ADDR);
>> +	OUT_BATCH(GEN8_R_PWR_CLK_STATE);
>> +	OUT_RELOC(dst_bo, I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER, 0);
>> +	ADVANCE_BATCH();
>> +
>> +	intel_batchbuffer_flush_with_context(batch, context);
>> +
>> +	drm_intel_bo_map(dst_bo, 1);
>> +
>> +	data = dst_bo->virtual;
>> +	rpcs_config = *data;
>> +
>> +	drm_intel_bo_unmap(dst_bo);
>> +
>> +	drm_intel_bo_unreference(dst_bo);
>> +
>> +	return rpcs_config;
>> +}
>> +
>> +#define LOCAL_MI_LOAD_REGISTER_IMM	(0x22 << 23)
>> +
>> +#define GFX_OP_PIPE_CONTROL(len)	((0x3<<29)|(0x3<<27)|(0x2<<24)|((len)-2))
>> +#define   PIPE_CONTROL_CS_STALL				(1<<20)
>> +#define   PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH	(1<<12)
>> +#define   PIPE_CONTROL_FLUSH_ENABLE			(1<<7)
>> +#define   PIPE_CONTROL_DC_FLUSH_ENABLE			(1<<5)
>> +#define   PIPE_CONTROL_DEPTH_CACHE_FLUSH		(1<<0)
>> +
>> +static void write_pwrclk_state(drm_intel_bufmgr *bufmgr,
>> +			       struct intel_batchbuffer *batch,
>> +			       drm_intel_context *context,
>> +			       uint32_t rpcs_config)
>> +{
>> +	drm_intel_bo *dst_bo;
>> +
>> +	dst_bo = drm_intel_bo_alloc(bufmgr, "scratch", 4, 4096);
>> +
>> +	BEGIN_BATCH(9, 1);
>> +	OUT_BATCH(LOCAL_MI_LOAD_REGISTER_IMM | 1);
>> +	OUT_BATCH(GEN8_R_PWR_CLK_STATE);
>> +	OUT_BATCH(rpcs_config);
>> +	OUT_BATCH(GFX_OP_PIPE_CONTROL(6));
>> +	OUT_BATCH(PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
>> +		  PIPE_CONTROL_DEPTH_CACHE_FLUSH |
>> +		  PIPE_CONTROL_DC_FLUSH_ENABLE |
>> +		  PIPE_CONTROL_FLUSH_ENABLE |
>> +		  PIPE_CONTROL_CS_STALL);
>> +	OUT_RELOC(dst_bo, I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER, 0);
>> +	OUT_BATCH(0);
>> +	OUT_BATCH(0);
>> +	OUT_BATCH(0);
>> +	ADVANCE_BATCH();
>> +
>> +	intel_batchbuffer_flush_with_context(batch, context);
>> +
>> +	drm_intel_bo_unreference(dst_bo);
>> +}
>> +
>> +/* Makes sure userspace can configure GEN8_R_PWR_CLK_STATE (e.g. is whitelisted) */
>> +static void
>> +pwrclk_state(void)
>> +{
>> +	drm_intel_context *context;
>> +	uint32_t rpcs_config;
>> +	bool rpcs_enabled;
>> +
>> +	/*
>> +	 * Gen8 BDW is the first case in which usermode can configure their
>> +	 * own render power gating
>> +	*/
>> +	igt_require(gem.gen >= 8);
> Pop quiz: what result does this give on kernels that do not support RPCS
> self-adjustment?
>

The hardware transforms the LRI (MI_LOAD_REGISTER_IMM) on the privileged 
register into a NOOP, so we can detect failures to change the value. On 
kernels that do not support RPCS self-adjustment, we hit the assert:

igt_assert_neq(rpcs_enabled, !!(rpcs_config & GEN8_RPCS_ENABLE));

> So we need some method to query the modifiable set of registers. Several
> options: an lrc version getparam, a query via context getparam for is
> this register modifiable, or for userspace to try and fail.
>
> For something like this where the implementation is best effort, trial
> and error is not a bad plan. It just means that this test cannot discern
> a failure (or we say that all previous kernels were mistaken...)
>
> Question on the whitelist themselves, I presume there is a maximum? How
> close are we to running out? (i.e. is this the best use of a precious
> whitelist slot?) Can you put that discussion into the changelog +
> comment for wa_ring_whitelist_reg.
>

We have a maximum of 12 slots per engine (RING_MAX_NONPRIV_SLOTS). The 
worst case at the moment seems to be BXT A1, where we currently use 5 
slots in the render engine. Next are SKL and KBL (all steppings) with 4 each.

> How well are whitelists supported in gvt? I presume it is transparent.
> -Chris
>

I imagine it is transparent, but I don't really know.