[Intel-gfx] [RFC PATCH 4/4] drm/i915: reprogram NOA muxes on context switch when using perf

Wed Aug 30 19:33:37 UTC 2017

On 30/08/17 20:15, Chris Wilson wrote:
> Quoting Lionel Landwerlin (2017-08-30 19:20:06)
>> If some of the contexts submitting workloads to the GPU have been
>> configured to shutdown slices/subslices, we might loose the NOA
>> configurations written in the NOA muxes. We need to reprogram then at
>> context switch.
>>
>> Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin at intel.com>
>> ---
>>   drivers/gpu/drm/i915/i915_drv.h  |  2 ++
>>   drivers/gpu/drm/i915/i915_perf.c | 77 ++++++++++++++++++++++++++++++++++++++++
>>   drivers/gpu/drm/i915/intel_lrc.c | 64 ++++++++++++++++++++++++++++++---
>>   drivers/gpu/drm/i915/intel_lrc.h |  1 +
>>   4 files changed, 140 insertions(+), 4 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
>> index 0003b46b6840..d4b3e5da9009 100644
>> --- a/drivers/gpu/drm/i915/i915_drv.h
>> +++ b/drivers/gpu/drm/i915/i915_drv.h
>> @@ -3685,6 +3685,8 @@ int i915_perf_remove_config_ioctl(struct drm_device *dev, void *data,
>>   void i915_oa_init_reg_state(struct intel_engine_cs *engine,
>>                              struct i915_gem_context *ctx,
>>                              uint32_t *reg_state);
>> +u32 i915_oa_get_perctx_bb_size(struct drm_i915_private *dev_priv);
>> +u32 *i915_oa_emit_perctx_bb(struct intel_engine_cs *engine, u32 *batch);
>>   
>>   /* i915_gem_evict.c */
>>   int __must_check i915_gem_evict_something(struct i915_address_space *vm,
>> diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c
>> index 94185d610673..b74ffbb47879 100644
>> --- a/drivers/gpu/drm/i915/i915_perf.c
>> +++ b/drivers/gpu/drm/i915/i915_perf.c
>> @@ -1687,6 +1687,74 @@ static int gen8_emit_oa_config(struct drm_i915_gem_request *req,
>>          return 0;
>>   }
>>   
>> +#define MAX_LRI_SIZE (125U)
>> +
>> +u32 i915_oa_get_perctx_bb_size(struct drm_i915_private *dev_priv)
>> +{
>> +       struct i915_perf_stream *stream = dev_priv->perf.oa.exclusive_stream;
>> +
>> +       lockdep_assert_held(&dev_priv->drm.struct_mutex);
> Still not happy by this coupling to struct_mutex. :-p
>
> Missed RCS check.
>
>> +
>> +       /* Perf not supported. */
>> +       if (!dev_priv->perf.initialized)
>> +               return 0;
>> +
>> +       /* OA not currently configured. */
>> +       if (!stream)
>> +               return 0;
>> +
>> +       /* Very unlikely but possible that we have no muxes to configure. */
>> +       if (!stream->oa_config->mux_regs_len)
>> +               return 0;
>> +
>> +       /* Return the size of MI_LOAD_REGISTER_IMMs. */
>> +       return (stream->oa_config->mux_regs_len / MAX_LRI_SIZE) * 4 + 4 +
>> +               stream->oa_config->mux_regs_len * 8;
>> +}
>> +
>> +u32 *i915_oa_emit_perctx_bb(struct intel_engine_cs *engine, u32 *batch)
>> +{
>> +       struct drm_i915_private *dev_priv = engine->i915;
>> +       struct i915_perf_stream *stream = dev_priv->perf.oa.exclusive_stream;
>> +       u32 n_lri, n_mux_regs;
>> +       u32 i;
>> +
>> +       lockdep_assert_held(&dev_priv->drm.struct_mutex);
>> +
>> +       /* We only care about RCS. */
>> +       if (engine->id != RCS)
>> +               return batch;
>> +
>> +       /* Perf not supported. */
>> +       if (!dev_priv->perf.initialized)
>> +               return batch;
>> +
>> +       /* OA not currently configured. */
>> +       if (!stream)
>> +               return batch;
>> +
>> +       /* It's very unlikely, but possible that we're dealing with a config
>> +        * with no mux to configure.
>> +        */
>> +       if (!stream->oa_config->mux_regs_len)
>> +               return batch;
> The above could be condensed into
> if (i915_oa_get_perctx_bb_size() == 0)
> 	return;
>
>> +
>> +       n_mux_regs = stream->oa_config->mux_regs_len;
>> +       n_lri = (n_mux_regs / MAX_LRI_SIZE) + (n_mux_regs % MAX_LRI_SIZE) != 0;
>> +
>> +       for (i = 0; i < n_mux_regs; i++) {
>> +               if ((i % MAX_LRI_SIZE) == 0) {
>> +                       n_lri = min(n_mux_regs - i, MAX_LRI_SIZE);
>> +                       *batch++ = MI_LOAD_REGISTER_IMM(n_lri);
>> +               }
>> +
>> +               *batch++ = i915_mmio_reg_offset(stream->oa_config->mux_regs[i].addr);
>> +               *batch++ = stream->oa_config->mux_regs[i].value;
>> +       }
> I would have personally used a double loop. But at least kill that first
> n_lri, that was a moment of confusion spent trying to work out what you
> were using it for.
>
>> +
>> +       return batch;
>> +}
>>   /**
>>    * intel_sanitize_enable_execlists() - sanitize i915.enable_execlists
>> @@ -1055,6 +1057,8 @@ static u32 *gen8_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
>>    */
>>   static u32 *gen8_init_perctx_bb(struct intel_engine_cs *engine, u32 *batch)
>>   {
>> +       batch = i915_oa_emit_perctx_bb(engine, batch);
>> +
>>          /* WaDisableCtxRestoreArbitration:bdw,chv */
>>          *batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
>>          *batch++ = MI_BATCH_BUFFER_END;
>> @@ -1118,21 +1122,27 @@ static u32 *gen9_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
>>   
>>   static u32 *gen9_init_perctx_bb(struct intel_engine_cs *engine, u32 *batch)
>>   {
>> +       batch = i915_oa_emit_perctx_bb(engine, batch);
> Wrong wa_bb. This is emitted at the start of every bb, you want
> indirectctx_bb which is emitted after a context switch. Or at least I
> hope you don't need such a heavy handed approach. If you do, can you
> convince me that you are really not in conflict with the application?
> (Earlier you said it only needs ctx switch.)

Thanks, I'll rerun my tests with indirectctx_bb.

>
>> +int logical_render_ring_reload_wa_bb(struct intel_engine_cs *engine)
> intel_lrc_update_wa_bb().
>
>> +{
>> +       struct drm_i915_private *dev_priv = engine->i915;
>> +       struct i915_ctx_workarounds new_wa_ctx;
>> +       struct i915_gem_context *ctx;
>> +       int ret;
>> +
>> +       if (WARN_ON(engine->id != RCS))
>> +               return -EINVAL;
>> +
>> +       memset(&new_wa_ctx, 0, sizeof(new_wa_ctx));
>> +       ret = intel_init_workaround_bb(engine, &new_wa_ctx);
>> +       if (ret)
>> +               return ret;
>> +
>> +       if (engine->wa_ctx.vma)
>> +               lrc_destroy_wa_ctx(engine);
> Couldn't we at least try to reuse the existing vma first?
>