[Intel-gfx] [PATCH v4 2/3] drm/i915: set optimum eu/slice/sub-slice configuration based on load type

Tvrtko Ursulin tvrtko.ursulin at linux.intel.com
Thu Mar 14 10:45:36 UTC 2019


On 14/03/2019 08:36, Ankit Navik wrote:
> From: Praveen Diwakar <praveen.diwakar at intel.com>
> 
> This patch selects the optimum eu/slice/sub-slice configuration based on
> the type of load (low, medium, high) given as input.
> Based on our readings and experiments, we have a predefined set of optimum
> configurations for each platform (CHT, KBL).
> i915_gem_context_set_load_type selects the optimum configuration from the
> pre-defined configuration table (opt_config).
> 
> It also introduces the flag predictive_load_enable, which can be set by
> any governor.
> 
> v2:
>   * Move static optimum_config to device init time.
>   * Rename function to appropriate name, fix data types and patch ordering.
>   * Rename prev_load_type to pending_load_type. (Tvrtko Ursulin)
> 
> v3:
>   * Add a safeguard check in i915_gem_context_set_load_type.
>   * Rename struct from optimum_config to i915_sseu_optimum_config to
>     avoid namespace clashes.
>   * Reduce memcpy usage for space efficiency.
>   * Rebase.
>   * Improved commit message. (Tvrtko Ursulin)
> 
> v4:
>   * Move optimum config table to file scope. (Tvrtko Ursulin)
> 
> Cc: Kedar J Karanje <kedar.j.karanje at intel.com>
> Cc: Yogesh Marathe <yogesh.marathe at intel.com>
> Signed-off-by: Praveen Diwakar <praveen.diwakar at intel.com>
> Signed-off-by: Aravindan Muthukumar <aravindan.muthukumar at intel.com>
> Signed-off-by: Ankit Navik <ankit.p.navik at intel.com>
> ---
>   drivers/gpu/drm/i915/i915_drv.h          |  5 ++++
>   drivers/gpu/drm/i915/i915_gem_context.c  | 20 ++++++++++++++
>   drivers/gpu/drm/i915/i915_gem_context.h  | 34 +++++++++++++++++++++++
>   drivers/gpu/drm/i915/intel_device_info.c | 47 ++++++++++++++++++++++++++++++--
>   drivers/gpu/drm/i915/intel_lrc.c         | 45 +++++++++++++++++++++++++++++-
>   5 files changed, 148 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index 5c8d048..97cb36b 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -1593,6 +1593,11 @@ struct drm_i915_private {
>   	struct drm_i915_fence_reg fence_regs[I915_MAX_NUM_FENCES]; /* assume 965 */
>   	int num_fence_regs; /* 8 on pre-965, 16 otherwise */
>   
> +	/* optimal slice/subslice/EU configuration state */
> +	const struct i915_sseu_optimum_config *opt_config;
> +
> +	int predictive_load_enable;
> +
>   	unsigned int fsb_freq, mem_freq, is_ddr3;
>   	unsigned int skl_preferred_vco_freq;
>   	unsigned int max_cdclk_freq;
> diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
> index a5876fe..8f16ef1 100644
> --- a/drivers/gpu/drm/i915/i915_gem_context.c
> +++ b/drivers/gpu/drm/i915/i915_gem_context.c
> @@ -454,10 +454,30 @@ i915_gem_create_context(struct drm_i915_private *dev_priv,
>   
>   	trace_i915_context_create(ctx);
>   	atomic_set(&ctx->req_cnt, 0);
> +	ctx->slice_cnt = hweight8(RUNTIME_INFO(dev_priv)->sseu.slice_mask);
> +	ctx->subslice_cnt = hweight8(
> +			RUNTIME_INFO(dev_priv)->sseu.subslice_mask[0]);
> +	ctx->eu_cnt = RUNTIME_INFO(dev_priv)->sseu.eu_per_subslice;
>   
>   	return ctx;
>   }
>   
> +
> +void i915_gem_context_set_load_type(struct i915_gem_context *ctx,
> +		enum gem_load_type type)
> +{
> +	struct drm_i915_private *dev_priv = ctx->i915;
> +
> +	if (GEM_WARN_ON(type >= LOAD_TYPE_LAST))
> +		return;
> +
> +	/* Look up the optimum eu/slice/subslice configuration in opt_config */
> +	ctx->slice_cnt = dev_priv->opt_config[type].slice;
> +	ctx->subslice_cnt = dev_priv->opt_config[type].subslice;
> +	ctx->eu_cnt = dev_priv->opt_config[type].eu;
> +	ctx->pending_load_type = type;
> +}
> +
>   /**
>    * i915_gem_context_create_gvt - create a GVT GEM context
>    * @dev: drm device *
> diff --git a/drivers/gpu/drm/i915/i915_gem_context.h b/drivers/gpu/drm/i915/i915_gem_context.h
> index c940168..0a24d28 100644
> --- a/drivers/gpu/drm/i915/i915_gem_context.h
> +++ b/drivers/gpu/drm/i915/i915_gem_context.h
> @@ -54,6 +54,19 @@ struct intel_context_ops {
>   	void (*destroy)(struct intel_context *ce);
>   };
>   
> +enum gem_load_type {
> +	LOAD_TYPE_LOW,
> +	LOAD_TYPE_MEDIUM,
> +	LOAD_TYPE_HIGH,
> +	LOAD_TYPE_LAST
> +};
> +
> +struct i915_sseu_optimum_config {
> +	u8 slice;
> +	u8 subslice;
> +	u8 eu;
> +};
> +
>   /*
>    * Powergating configuration for a particular (context,engine).
>    */
> @@ -232,6 +245,25 @@ struct i915_gem_context {
>   	 * go for low/medium/high load configuration of the GPU.
>   	 */
>   	atomic_t req_cnt;
> +
> +	/** slice_cnt: used to set the # of slices to be enabled. */
> +	u8 slice_cnt;
> +
> +	/** subslice_cnt: used to set the # of subslices to be enabled. */
> +	u8 subslice_cnt;
> +
> +	/** eu_cnt: used to set the # of eu to be enabled. */
> +	u8 eu_cnt;
> +
> +	/** load_type: The designated load_type (high/medium/low) for a given
> +	 * number of pending commands in the command queue.
> +	 */
> +	enum gem_load_type load_type;
> +
> +	/** pending_load_type: The most recently requested load type
> +	 * (high/medium/low), not yet applied to the GPU.
> +	 */
> +	enum gem_load_type pending_load_type;
>   };
>   
>   static inline bool i915_gem_context_is_closed(const struct i915_gem_context *ctx)
> @@ -375,6 +407,8 @@ int i915_gem_context_setparam_ioctl(struct drm_device *dev, void *data,
>   				    struct drm_file *file_priv);
>   int i915_gem_context_reset_stats_ioctl(struct drm_device *dev, void *data,
>   				       struct drm_file *file);
> +void i915_gem_context_set_load_type(struct i915_gem_context *ctx,
> +		enum gem_load_type type);
>   
>   struct i915_gem_context *
>   i915_gem_context_create_kernel(struct drm_i915_private *i915, int prio);
> diff --git a/drivers/gpu/drm/i915/intel_device_info.c b/drivers/gpu/drm/i915/intel_device_info.c
> index 855a507..017a1e2 100644
> --- a/drivers/gpu/drm/i915/intel_device_info.c
> +++ b/drivers/gpu/drm/i915/intel_device_info.c
> @@ -707,6 +707,27 @@ static u32 read_timestamp_frequency(struct drm_i915_private *dev_priv)
>   	return 0;
>   }
>   
> +/* static table of slice/subslice/EU for Cherryview */
> +static const struct i915_sseu_optimum_config chv_config[LOAD_TYPE_LAST] = {
> +	{1, 1, 4},	/* Low */
> +	{1, 1, 6},	/* Medium */
> +	{1, 2, 6}	/* High */
> +};
> +
> +/* static table of slice/subslice/EU for KBL GT2 */
> +static const struct i915_sseu_optimum_config kbl_gt2_config[LOAD_TYPE_LAST] = {
> +	{1, 3, 2},	/* Low */
> +	{1, 3, 4},	/* Medium */
> +	{1, 3, 8}	/* High */
> +};
> +
> +/* static table of slice/subslice/EU for KBL GT3 */
> +static const struct i915_sseu_optimum_config kbl_gt3_config[LOAD_TYPE_LAST] = {
> +	{2, 3, 4},	/* Low */
> +	{2, 3, 6},	/* Medium */
> +	{2, 3, 8}	/* High */
> +};
> +
>   /**
>    * intel_device_info_runtime_init - initialize runtime info
>    * @dev_priv: the i915 device
> @@ -728,6 +749,7 @@ void intel_device_info_runtime_init(struct drm_i915_private *dev_priv)
>   	struct intel_device_info *info = mkwrite_device_info(dev_priv);
>   	struct intel_runtime_info *runtime = RUNTIME_INFO(dev_priv);
>   	enum pipe pipe;
> +	const struct i915_sseu_optimum_config *opt_config = NULL;
>   
>   	if (INTEL_GEN(dev_priv) >= 10) {
>   		for_each_pipe(dev_priv, pipe)
> @@ -831,12 +853,30 @@ void intel_device_info_runtime_init(struct drm_i915_private *dev_priv)
>   	/* Initialize slice/subslice/EU info */
>   	if (IS_HASWELL(dev_priv))
>   		haswell_sseu_info_init(dev_priv);
> -	else if (IS_CHERRYVIEW(dev_priv))
> +	else if (IS_CHERRYVIEW(dev_priv)) {
>   		cherryview_sseu_info_init(dev_priv);
> +		opt_config = chv_config;
> +		BUILD_BUG_ON(ARRAY_SIZE(chv_config) != LOAD_TYPE_LAST);
> +	}
>   	else if (IS_BROADWELL(dev_priv))
>   		broadwell_sseu_info_init(dev_priv);
> -	else if (IS_GEN(dev_priv, 9))
> +	else if (IS_GEN(dev_priv, 9)) {
>   		gen9_sseu_info_init(dev_priv);
> +
> +		switch (info->gt) {
> +		default: /* fall through */
> +		case 2:
> +			opt_config = kbl_gt2_config;
> +			BUILD_BUG_ON(ARRAY_SIZE(kbl_gt2_config)
> +						!= LOAD_TYPE_LAST);
> +			break;
> +		case 3:
> +			opt_config = kbl_gt3_config;
> +			BUILD_BUG_ON(ARRAY_SIZE(kbl_gt3_config)
> +						!= LOAD_TYPE_LAST);
> +			break;
> +		}
> +	}
>   	else if (IS_GEN(dev_priv, 10))
>   		gen10_sseu_info_init(dev_priv);
>   	else if (INTEL_GEN(dev_priv) >= 11)
> @@ -847,6 +887,9 @@ void intel_device_info_runtime_init(struct drm_i915_private *dev_priv)
>   		info->ppgtt = INTEL_PPGTT_NONE;
>   	}
>   
> +	if (opt_config)
> +		dev_priv->opt_config = opt_config;
> +
>   	/* Initialize command stream timestamp frequency */
>   	runtime->cs_timestamp_frequency_khz = read_timestamp_frequency(dev_priv);
>   }
> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> index d0af37d..397af1e 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/intel_lrc.c
> @@ -1282,6 +1282,35 @@ static int __context_pin(struct i915_gem_context *ctx, struct i915_vma *vma)
>   	return i915_vma_pin(vma, 0, 0, flags);
>   }
>   
> +static u32
> +get_context_rpcs_config(struct i915_gem_context *ctx)
> +{
> +	u32 rpcs = 0;
> +	struct drm_i915_private *dev_priv = ctx->i915;
> +
> +	if (INTEL_GEN(dev_priv) < 8)
> +		return 0;
> +
> +	if (RUNTIME_INFO(dev_priv)->sseu.has_slice_pg) {
> +		rpcs |= GEN8_RPCS_S_CNT_ENABLE;
> +		rpcs |= ctx->slice_cnt << GEN8_RPCS_S_CNT_SHIFT;
> +		rpcs |= GEN8_RPCS_ENABLE;
> +	}
> +
> +	if (RUNTIME_INFO(dev_priv)->sseu.has_subslice_pg) {
> +		rpcs |= GEN8_RPCS_SS_CNT_ENABLE;
> +		rpcs |= ctx->subslice_cnt << GEN8_RPCS_SS_CNT_SHIFT;
> +		rpcs |= GEN8_RPCS_ENABLE;
> +	}
> +
> +	if (RUNTIME_INFO(dev_priv)->sseu.has_eu_pg) {
> +		rpcs |= ctx->eu_cnt << GEN8_RPCS_EU_MIN_SHIFT;
> +		rpcs |= ctx->eu_cnt << GEN8_RPCS_EU_MAX_SHIFT;
> +		rpcs |= GEN8_RPCS_ENABLE;
> +	}
> +
> +	return rpcs;
> +}
>   static void
>   __execlists_update_reg_state(struct intel_engine_cs *engine,
>   			     struct intel_context *ce)
> @@ -1294,9 +1323,20 @@ __execlists_update_reg_state(struct intel_engine_cs *engine,
>   	regs[CTX_RING_TAIL + 1] = ring->tail;
>   
>   	/* RPCS */
> -	if (engine->class == RENDER_CLASS)
> +	if (engine->class == RENDER_CLASS &&
> +				engine->i915->predictive_load_enable) {
> +		u32 rpcs_config = 0;
> +		struct i915_gem_context *ctx = ce->gem_context;
> +
> +		rpcs_config = get_context_rpcs_config(ctx);
> +		regs[CTX_LRI_HEADER_2] = MI_LOAD_REGISTER_IMM(1);
> +		CTX_REG(regs, CTX_R_PWR_CLK_STATE, GEN8_R_PWR_CLK_STATE,
> +						rpcs_config);
> +
> +	} else if (engine->class == RENDER_CLASS) {

>   		regs[CTX_R_PWR_CLK_STATE + 1] = gen8_make_rpcs(engine->i915,
>   							       &ce->sseu);

We can't have two places/paths controlling RPCS.

What happens when both OA and your feature are enabled? (See comments in 
gen8_make_rpcs.)

What happens when media has requested a reduced subslice count on Gen11, 
if predictive RPCS is added on Gen11?

You need to put the logic into gen8_make_rpcs and handle all those 
overrides.
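
For illustration only, here is a minimal sketch of the kind of
single-path consolidation meant here, reusing the slice_cnt /
subslice_cnt / eu_cnt fields and the GEN8_RPCS_* encoding from the patch
above. oa_active() is a hypothetical stand-in for however gen8_make_rpcs
detects an active perf/OA stream, req_sseu stands for the per-context
powergating request (ce->sseu in the existing path), and the precedence
picked below is only one possible answer:

static u32 make_rpcs_single_path(struct drm_i915_private *i915,
				 struct i915_gem_context *ctx,
				 const struct intel_sseu *req_sseu)
{
	const struct sseu_dev_info *sseu = &RUNTIME_INFO(i915)->sseu;
	u8 slices, subslices, eus;
	u32 rpcs = 0;

	if (INTEL_GEN(i915) < 8)
		return 0;

	if (oa_active(i915)) {
		/* 1. OA needs a stable global configuration, so it
		 * overrides both the per-context request and the
		 * predictive governor.
		 */
		slices = hweight8(sseu->slice_mask);
		subslices = hweight8(sseu->subslice_mask[0]);
		eus = sseu->eu_per_subslice;
	} else if (!i915->predictive_load_enable) {
		/* 2. Governor disabled: honour the explicit per-context
		 * request, which is where e.g. a Gen11 media subslice
		 * reduction would land.
		 */
		slices = hweight8(req_sseu->slice_mask);
		subslices = hweight8(req_sseu->subslice_mask);
		eus = req_sseu->max_eus_per_subslice;
	} else {
		/* 3. Governor enabled: apply the predicted load-type
		 * configuration from i915_gem_context_set_load_type().
		 */
		slices = ctx->slice_cnt;
		subslices = ctx->subslice_cnt;
		eus = ctx->eu_cnt;
	}

	if (sseu->has_slice_pg)
		rpcs |= GEN8_RPCS_ENABLE | GEN8_RPCS_S_CNT_ENABLE |
			slices << GEN8_RPCS_S_CNT_SHIFT;
	if (sseu->has_subslice_pg)
		rpcs |= GEN8_RPCS_ENABLE | GEN8_RPCS_SS_CNT_ENABLE |
			subslices << GEN8_RPCS_SS_CNT_SHIFT;
	if (sseu->has_eu_pg)
		rpcs |= GEN8_RPCS_ENABLE |
			eus << GEN8_RPCS_EU_MIN_SHIFT |
			eus << GEN8_RPCS_EU_MAX_SHIFT;

	return rpcs;
}

Whether (2) should also beat (3) when both apply, i.e. whether a
functional media request must always win over the performance heuristic,
is exactly the open question above; the point is that the ordering only
becomes explicit once a single function computes RPCS.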

Regards,

Tvrtko


> +	}
>   }
>   
>   static struct intel_context *
> @@ -1340,6 +1380,9 @@ __execlists_context_pin(struct intel_engine_cs *engine,
>   
>   	__execlists_update_reg_state(engine, ce);
>   
> +	if (ctx->load_type != ctx->pending_load_type)
> +		ctx->load_type = ctx->pending_load_type;
> +
>   	ce->state->obj->pin_global++;
>   	i915_gem_context_get(ctx);
>   	return ce;
> 

