[PATCH v2] drm/i915/guc: Use context hints for GT freq

Mon Mar 4 21:39:05 UTC 2024

On Wed, Feb 28, 2024 at 11:58:06AM -0800, Belgaumkar, Vinay wrote:
> 
> On 2/28/2024 4:54 AM, Tvrtko Ursulin wrote:
> > 
> > On 27/02/2024 23:51, Vinay Belgaumkar wrote:
> > > Allow user to provide a low latency context hint. When set, KMD
> > > sends a hint to GuC which results in special handling for this
> > > context. SLPC will ramp the GT frequency aggressively every time
> > > it switches to this context. The down freq threshold will also be
> > > lower so GuC will ramp down the GT freq for this context more slowly.
> > > We also disable waitboost for this context as that will interfere with
> > > the strategy.
> > > 
> > > We need to enable the use of SLPC Compute strategy during init, but
> > > it will apply only to contexts that set this bit during context
> > > creation.
> > > 
> > > Userland can check whether this feature is supported using a new param-
> > > I915_PARAM_HAS_CONTEXT_FREQ_HINTS. This flag is true for all guc
> > > submission
> > > enabled platforms as they use SLPC for frequency management.
> > > 
> > > The Mesa usage model for this flag is here -
> > > https://gitlab.freedesktop.org/sushmave/mesa/-/commits/compute_hint
> > > 
> > > v2: Rename flags as per review suggestions (Rodrigo, Tvrtko).
> > > Also, use flag bits in intel_context as it allows finer control for
> > > toggling per engine if needed (Tvrtko).
> > > 
> > > Cc: Rodrigo Vivi <rodrigo.vivi at intel.com>
> > > Cc: Tvrtko Ursulin <tvrtko.ursulin at linux.intel.com>
> > > Cc: Sushma Venkatesh Reddy <sushma.venkatesh.reddy at intel.com>
> > > Signed-off-by: Vinay Belgaumkar <vinay.belgaumkar at intel.com>
> > > ---
> > >   drivers/gpu/drm/i915/gem/i915_gem_context.c   | 15 +++++++++++--
> > >   .../gpu/drm/i915/gem/i915_gem_context_types.h |  1 +
> > >   drivers/gpu/drm/i915/gt/intel_context_types.h |  1 +
> > >   drivers/gpu/drm/i915/gt/intel_rps.c           |  5 +++++
> > >   .../drm/i915/gt/uc/abi/guc_actions_slpc_abi.h | 21 +++++++++++++++++++
> > >   drivers/gpu/drm/i915/gt/uc/intel_guc_slpc.c   | 17 +++++++++++++++
> > >   drivers/gpu/drm/i915/gt/uc/intel_guc_slpc.h   |  1 +
> > >   .../gpu/drm/i915/gt/uc/intel_guc_submission.c |  6 ++++++
> > >   drivers/gpu/drm/i915/i915_getparam.c          | 12 +++++++++++
> > >   include/uapi/drm/i915_drm.h                   | 15 +++++++++++++
> > >   10 files changed, 92 insertions(+), 2 deletions(-)
> > > 
> > > diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context.c
> > > b/drivers/gpu/drm/i915/gem/i915_gem_context.c
> > > index dcbfe32fd30c..0799cb0b2803 100644
> > > --- a/drivers/gpu/drm/i915/gem/i915_gem_context.c
> > > +++ b/drivers/gpu/drm/i915/gem/i915_gem_context.c
> > > @@ -879,6 +879,7 @@ static int set_proto_ctx_param(struct
> > > drm_i915_file_private *fpriv,
> > >                      struct i915_gem_proto_context *pc,
> > >                      struct drm_i915_gem_context_param *args)
> > >   {
> > > +    struct drm_i915_private *i915 = fpriv->i915;
> > >       int ret = 0;
> > >         switch (args->param) {
> > > @@ -904,6 +905,13 @@ static int set_proto_ctx_param(struct
> > > drm_i915_file_private *fpriv,
> > >               pc->user_flags &= ~BIT(UCONTEXT_BANNABLE);
> > >           break;
> > >   +    case I915_CONTEXT_PARAM_LOW_LATENCY:
> > > +        if (intel_uc_uses_guc_submission(&to_gt(i915)->uc))
> > > +            pc->user_flags |= BIT(UCONTEXT_LOW_LATENCY);
> > > +        else
> > > +            ret = -EINVAL;
> > > +        break;
> > > +
> > >       case I915_CONTEXT_PARAM_RECOVERABLE:
> > >           if (args->size)
> > >               ret = -EINVAL;
> > > @@ -992,6 +1000,9 @@ static int intel_context_set_gem(struct
> > > intel_context *ce,
> > >       if (sseu.slice_mask && !WARN_ON(ce->engine->class !=
> > > RENDER_CLASS))
> > >           ret = intel_context_reconfigure_sseu(ce, sseu);
> > >   +    if (test_bit(UCONTEXT_LOW_LATENCY, &ctx->user_flags))
> > > +        set_bit(CONTEXT_LOW_LATENCY, &ce->flags);
> > 
> > Does not need to be atomic so can use __set_bit as higher up in the
> > function.
> ok.
> > 
> > > +
> > >       return ret;
> > >   }
> > >   @@ -1630,6 +1641,8 @@ i915_gem_create_context(struct
> > > drm_i915_private *i915,
> > >       if (vm)
> > >           ctx->vm = vm;
> > >   +    ctx->user_flags = pc->user_flags;
> > > +
> > 
> > Given how most ctx->something assignments are at the bottom of the
> > function I would stick a comment here saying along the lines of "assign
> > early for intel_context_set_gem called when creating engines".
> ok.
> > 
> > > mutex_init(&ctx->engines_mutex);
> > >       if (pc->num_user_engines >= 0) {
> > >           i915_gem_context_set_user_engines(ctx);
> > > @@ -1652,8 +1665,6 @@ i915_gem_create_context(struct
> > > drm_i915_private *i915,
> > >        * is no remap info, it will be a NOP. */
> > >       ctx->remap_slice = ALL_L3_SLICES(i915);
> > >   -    ctx->user_flags = pc->user_flags;
> > > -
> > >       for (i = 0; i < ARRAY_SIZE(ctx->hang_timestamp); i++)
> > >           ctx->hang_timestamp[i] = jiffies - CONTEXT_FAST_HANG_JIFFIES;
> > >   diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context_types.h
> > > b/drivers/gpu/drm/i915/gem/i915_gem_context_types.h
> > > index 03bc7f9d191b..b6d97da63d1f 100644
> > > --- a/drivers/gpu/drm/i915/gem/i915_gem_context_types.h
> > > +++ b/drivers/gpu/drm/i915/gem/i915_gem_context_types.h
> > > @@ -338,6 +338,7 @@ struct i915_gem_context {
> > >   #define UCONTEXT_BANNABLE        2
> > >   #define UCONTEXT_RECOVERABLE        3
> > >   #define UCONTEXT_PERSISTENCE        4
> > > +#define UCONTEXT_LOW_LATENCY        5
> > >         /**
> > >        * @flags: small set of booleans
> > > diff --git a/drivers/gpu/drm/i915/gt/intel_context_types.h
> > > b/drivers/gpu/drm/i915/gt/intel_context_types.h
> > > index 7eccbd70d89f..ed95a7b57cbb 100644
> > > --- a/drivers/gpu/drm/i915/gt/intel_context_types.h
> > > +++ b/drivers/gpu/drm/i915/gt/intel_context_types.h
> > > @@ -130,6 +130,7 @@ struct intel_context {
> > >   #define CONTEXT_PERMA_PIN        11
> > >   #define CONTEXT_IS_PARKING        12
> > >   #define CONTEXT_EXITING            13
> > > +#define CONTEXT_LOW_LATENCY        14
> > >         struct {
> > >           u64 timeout_us;
> > > diff --git a/drivers/gpu/drm/i915/gt/intel_rps.c
> > > b/drivers/gpu/drm/i915/gt/intel_rps.c
> > > index 4feef874e6d6..3accfd8db287 100644
> > > --- a/drivers/gpu/drm/i915/gt/intel_rps.c
> > > +++ b/drivers/gpu/drm/i915/gt/intel_rps.c
> > > @@ -24,6 +24,7 @@
> > >   #include "intel_pcode.h"
> > >   #include "intel_rps.h"
> > >   #include "vlv_sideband.h"
> > > +#include "../gem/i915_gem_context.h"
> > 
> > I wouldn't think this needs the directory backtrack. Please double-check
> > and remove if not needed.
> > 
> > Also please group the GEM include on it's own before the gt block.
> Don't think this is needed. Removing.
> > 
> > 
> > >   #include "../../../platform/x86/intel_ips.h"
> > >     #define BUSY_MAX_EI    20u /* ms */
> > > @@ -1013,6 +1014,10 @@ void intel_rps_boost(struct i915_request *rq)
> > >       if (i915_request_signaled(rq) || i915_request_has_waitboost(rq))
> > >           return;
> > >   +    /* Waitboost is not needed for contexts marked with a Freq
> > > hint */
> > > +    if (test_bit(CONTEXT_LOW_LATENCY, &rq->context->flags))
> > > +        return;
> > 
> > I understood the plan was to add a separate "no waitboost" flag which
> > Rodrigo would need for later? Could be done later too.
> This flag is as good, I guess. If we plan to add more flags, we could add a
> no_waitboost assuming they will be a mix of some that warrant waitboost and
> some that don't.

I believe the current way Vinay implemented is the right way to go.

Well, we could have 2 flags already but we would need to force user
space to only use CONTEXT_LOW_LATENCY if also using CONTEXT_NO_WAITBOOST,
because this compute strategy would be meaningless with waitboost enabled.

But then, the other case that we have in mind is media case where we
would need to disable waitboost as well... but even on that case, it
looks like we might want the waitboost disabled with yet another slpc
strategy, then we would need to force media to also go with 2 flags.

So, for the other case it will be likely better to have something like
CONTEXT_BACKGROUND where we also disable waitboost and set the
ignore_business slpc strategy.

so, on this patch as is:

Acked-by: Rodrigo Vivi <rodrigo.vivi at intel.com>

> > 
> > > +
> > >       /* Serializes with i915_request_retire() */
> > >       if (!test_and_set_bit(I915_FENCE_FLAG_BOOST, &rq->fence.flags)) {
> > >           struct intel_rps *rps = &READ_ONCE(rq->engine)->gt->rps;
> > > diff --git a/drivers/gpu/drm/i915/gt/uc/abi/guc_actions_slpc_abi.h
> > > b/drivers/gpu/drm/i915/gt/uc/abi/guc_actions_slpc_abi.h
> > > index 811add10c30d..c34674e797c6 100644
> > > --- a/drivers/gpu/drm/i915/gt/uc/abi/guc_actions_slpc_abi.h
> > > +++ b/drivers/gpu/drm/i915/gt/uc/abi/guc_actions_slpc_abi.h
> > > @@ -207,6 +207,27 @@ struct slpc_shared_data {
> > >       u8 reserved_mode_definition[4096];
> > >   } __packed;
> > >   +struct slpc_context_frequency_request {
> > > +    u32 frequency_request:16;
> > > +    u32 reserved:12;
> > > +    u32 is_compute:1;
> > > +    u32 ignore_busyness:1;
> > > +    u32 is_minimum:1;
> > > +    u32 is_predefined:1;
> > > +} __packed;
> > > +
> > > +#define SLPC_CTX_FREQ_REQ_IS_COMPUTE        REG_BIT(28)
> > > +
> > > +struct slpc_optimized_strategies {
> > > +    u32 compute:1;
> > > +    u32 async_flip:1;
> > > +    u32 media:1;
> > > +    u32 vsync_flip:1;
> > > +    u32 reserved:28;
> > > +} __packed;
> > > +
> > > +#define SLPC_OPTIMIZED_STRATEGY_COMPUTE        REG_BIT(0)
> > > +
> > >   /**
> > >    * DOC: SLPC H2G MESSAGE FORMAT
> > >    *
> > > diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_slpc.c
> > > b/drivers/gpu/drm/i915/gt/uc/intel_guc_slpc.c
> > > index 3e681ab6fbf9..706fffca698b 100644
> > > --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_slpc.c
> > > +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_slpc.c
> > > @@ -537,6 +537,20 @@ int intel_guc_slpc_get_min_freq(struct
> > > intel_guc_slpc *slpc, u32 *val)
> > >       return ret;
> > >   }
> > >   +int intel_guc_slpc_set_strategy(struct intel_guc_slpc *slpc, u32 val)
> > > +{
> > > +    struct drm_i915_private *i915 = slpc_to_i915(slpc);
> > > +    intel_wakeref_t wakeref;
> > > +    int ret = 0;
> > > +
> > > +    with_intel_runtime_pm(&i915->runtime_pm, wakeref)
> > > +        ret = slpc_set_param(slpc,
> > > +                     SLPC_PARAM_STRATEGIES,
> > > +                     val);
> > > +
> > > +    return ret;
> > > +}
> > > +
> > >   int intel_guc_slpc_set_media_ratio_mode(struct intel_guc_slpc
> > > *slpc, u32 val)
> > >   {
> > >       struct drm_i915_private *i915 = slpc_to_i915(slpc);
> > > @@ -711,6 +725,9 @@ int intel_guc_slpc_enable(struct intel_guc_slpc
> > > *slpc)
> > >       /* Set cached media freq ratio mode */
> > >       intel_guc_slpc_set_media_ratio_mode(slpc, slpc->media_ratio_mode);
> > >   +    /* Enable SLPC Optimized Strategy for compute */
> > > +    intel_guc_slpc_set_strategy(slpc, SLPC_OPTIMIZED_STRATEGY_COMPUTE);
> > > +
> > >       return 0;
> > >   }
> > >   diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_slpc.h
> > > b/drivers/gpu/drm/i915/gt/uc/intel_guc_slpc.h
> > > index 6ac6503c39d4..1cb5fd44f05c 100644
> > > --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_slpc.h
> > > +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_slpc.h
> > > @@ -45,5 +45,6 @@ void intel_guc_pm_intrmsk_enable(struct intel_gt *gt);
> > >   void intel_guc_slpc_boost(struct intel_guc_slpc *slpc);
> > >   void intel_guc_slpc_dec_waiters(struct intel_guc_slpc *slpc);
> > >   int intel_guc_slpc_set_ignore_eff_freq(struct intel_guc_slpc
> > > *slpc, bool val);
> > > +int intel_guc_slpc_set_strategy(struct intel_guc_slpc *slpc, u32 val);
> > >     #endif
> > > diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> > > b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> > > index f3dcae4b9d45..d6ea4898ac39 100644
> > > --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> > > +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> > > @@ -2645,6 +2645,7 @@ MAKE_CONTEXT_POLICY_ADD(execution_quantum,
> > > EXECUTION_QUANTUM)
> > >   MAKE_CONTEXT_POLICY_ADD(preemption_timeout, PREEMPTION_TIMEOUT)
> > >   MAKE_CONTEXT_POLICY_ADD(priority, SCHEDULING_PRIORITY)
> > >   MAKE_CONTEXT_POLICY_ADD(preempt_to_idle,
> > > PREEMPT_TO_IDLE_ON_QUANTUM_EXPIRY)
> > > +MAKE_CONTEXT_POLICY_ADD(slpc_ctx_freq_req, SLPM_GT_FREQUENCY)
> > >     #undef MAKE_CONTEXT_POLICY_ADD
> > >   @@ -2664,6 +2665,7 @@ static int
> > > guc_context_policy_init_v70(struct intel_context *ce, bool loop)
> > >       struct context_policy policy;
> > >       u32 execution_quantum;
> > >       u32 preemption_timeout;
> > > +    u32 slpc_ctx_freq_req = 0;
> > >       unsigned long flags;
> > >       int ret;
> > >   @@ -2675,11 +2677,15 @@ static int
> > > guc_context_policy_init_v70(struct intel_context *ce, bool loop)
> > >       execution_quantum = engine->props.timeslice_duration_ms * 1000;
> > >       preemption_timeout = engine->props.preempt_timeout_ms * 1000;
> > >   +    if (ce && (ce->flags & BIT(CONTEXT_LOW_LATENCY)))
> > > +        slpc_ctx_freq_req |= SLPC_CTX_FREQ_REQ_IS_COMPUTE;
> > > +
> > >       __guc_context_policy_start_klv(&policy, ce->guc_id.id);
> > >         __guc_context_policy_add_priority(&policy, ce->guc_state.prio);
> > >       __guc_context_policy_add_execution_quantum(&policy,
> > > execution_quantum);
> > >       __guc_context_policy_add_preemption_timeout(&policy,
> > > preemption_timeout);
> > > +    __guc_context_policy_add_slpc_ctx_freq_req(&policy,
> > > slpc_ctx_freq_req);
> > >         if (engine->flags & I915_ENGINE_WANT_FORCED_PREEMPTION)
> > >           __guc_context_policy_add_preempt_to_idle(&policy, 1);
> > > diff --git a/drivers/gpu/drm/i915/i915_getparam.c
> > > b/drivers/gpu/drm/i915/i915_getparam.c
> > > index 5c3fec63cb4c..4fc182fdb2e8 100644
> > > --- a/drivers/gpu/drm/i915/i915_getparam.c
> > > +++ b/drivers/gpu/drm/i915/i915_getparam.c
> > > @@ -155,6 +155,18 @@ int i915_getparam_ioctl(struct drm_device *dev,
> > > void *data,
> > >            */
> > >           value = 1;
> > >           break;
> > > +    case I915_PARAM_HAS_CONTEXT_FREQ_HINT:
> > > +        /*
> > > +         * This feature has been available in GuC for a while but
> > > +         * a use case now required the use of this feature. We
> > > +         * return true now since this is now being supported from
> > > +         * the kernel side as well.
> > > +         */
> > 
> > TBH I don't see why the comment text is useful? The condition below
> > already makes it clear the thing is always supported when GuC is on.
> ok, will remove.
> > 
> > > +        if (intel_uc_uses_guc_submission(&to_gt(i915)->uc))
> > > +            value = 1;
> > > +        else
> > > +            value = -EINVAL;
> > > +        break;
> > >       case I915_PARAM_HAS_CONTEXT_ISOLATION:
> > >           value = intel_engines_has_context_isolation(i915);
> > >           break;
> > > diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
> > > index 2ee338860b7e..4f69f4e117ac 100644
> > > --- a/include/uapi/drm/i915_drm.h
> > > +++ b/include/uapi/drm/i915_drm.h
> > > @@ -806,6 +806,12 @@ typedef struct drm_i915_irq_wait {
> > >    */
> > >   #define I915_PARAM_PXP_STATUS         58
> > >   +/*
> > > + * Query if kernel allows marking a context to send a Freq hint to
> > > SLPC. This
> > > + * will enable use of the strategies allowed by the SLPC algorithm.
> > > + */
> > > +#define I915_PARAM_HAS_CONTEXT_FREQ_HINT    59
> > > +
> > >   /* Must be kept compact -- no holes and well documented */
> > >     /**
> > > @@ -2148,6 +2154,15 @@ struct drm_i915_gem_context_param {
> > >    * -EIO: The firmware did not succeed in creating the protected
> > > context.
> > >    */
> > >   #define I915_CONTEXT_PARAM_PROTECTED_CONTENT    0xd
> > > +
> > > +/*
> > > + * I915_CONTEXT_PARAM_IS_COMPUTE:
> > > + *
> > > + * Mark this context as a Compute related workload which requires
> > > aggressive GT
> > > + * frequency scaling. Query I915_PARAM_HAS_CONTEXT_COMPUTE to check
> > > if the kernel
> > > + * supports this functionality.
> > > + */
> > > +#define I915_CONTEXT_PARAM_LOW_LATENCY        0xe
> > >   /* Must be kept compact -- no holes and well documented */
> > >         /** @value: Context parameter value to be set or queried */
> > 
> > Modulo the comments I left and the SLPC/GuC parts I haven't looked at,
> > the implementation on the overall looks fine to me.
> > 
> > For the uapi angle I am not really happy that it is steamrolled to be a
> > GEM context level concept, apparently without much motivation to discuss
> > and document the pros and cons. But I guess finer grained control could
> > be added later if need is confirmed so it is not a huge problem.
> 
> Thanks for the review,
> 
> Vinay.
> 
> > 
> > 
> > Regards,
> > 
> > Tvrtko