[Intel-gfx] [PATCH v2 1/3] drm/i915/guc: Limit scheduling properties to avoid overflow

Wed Mar 2 18:22:16 UTC 2022

On 3/2/2022 01:49, Tvrtko Ursulin wrote:
> On 25/02/2022 20:41, John.C.Harrison at Intel.com wrote:
>> From: John Harrison <John.C.Harrison at Intel.com>
>>
>> GuC converts the pre-emption timeout and timeslice quantum values into
>> clock ticks internally. That significantly reduces the point of 32bit
>> overflow. On current platforms, worst case scenario is approximately
>> 110 seconds. Rather than allowing the user to set higher values and
>> then get confused by early timeouts, add limits when setting these
>> values.
>>
>> v2: Add helper functins for clamping (review feedback from Tvrtko).
>>
>> Signed-off-by: John Harrison <John.C.Harrison at Intel.com>
>> Reviewed-by: Daniele Ceraolo Spurio <daniele.ceraolospurio at intel.com> 
>> (v1)
>> ---
>>   drivers/gpu/drm/i915/gt/intel_engine.h      |  6 ++
>>   drivers/gpu/drm/i915/gt/intel_engine_cs.c   | 69 +++++++++++++++++++++
>>   drivers/gpu/drm/i915/gt/sysfs_engines.c     | 25 +++++---
>>   drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h |  9 +++
>>   4 files changed, 99 insertions(+), 10 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/i915/gt/intel_engine.h 
>> b/drivers/gpu/drm/i915/gt/intel_engine.h
>> index be4b1e65442f..5a9186f784c4 100644
>> --- a/drivers/gpu/drm/i915/gt/intel_engine.h
>> +++ b/drivers/gpu/drm/i915/gt/intel_engine.h
>> @@ -349,4 +349,10 @@ intel_engine_get_hung_context(struct 
>> intel_engine_cs *engine)
>>       return engine->hung_ce;
>>   }
>>   +u64 intel_clamp_heartbeat_interval_ms(struct intel_engine_cs 
>> *engine, u64 value);
>> +u64 intel_clamp_max_busywait_duration_ns(struct intel_engine_cs 
>> *engine, u64 value);
>> +u64 intel_clamp_preempt_timeout_ms(struct intel_engine_cs *engine, 
>> u64 value);
>> +u64 intel_clamp_stop_timeout_ms(struct intel_engine_cs *engine, u64 
>> value);
>> +u64 intel_clamp_timeslice_duration_ms(struct intel_engine_cs 
>> *engine, u64 value);
>> +
>>   #endif /* _INTEL_RINGBUFFER_H_ */
>> diff --git a/drivers/gpu/drm/i915/gt/intel_engine_cs.c 
>> b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
>> index e855c801ba28..7ad9e6006656 100644
>> --- a/drivers/gpu/drm/i915/gt/intel_engine_cs.c
>> +++ b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
>> @@ -399,6 +399,26 @@ static int intel_engine_setup(struct intel_gt 
>> *gt, enum intel_engine_id id,
>>       if (GRAPHICS_VER(i915) == 12 && engine->class == RENDER_CLASS)
>>           engine->props.preempt_timeout_ms = 0;
>>   +    /* Cap properties according to any system limits */
>> +#define CLAMP_PROP(field) \
>> +    do { \
>> +        u64 clamp = intel_clamp_##field(engine, engine->props.field); \
>> +        if (clamp != engine->props.field) { \
>> +            drm_notice(&engine->i915->drm, \
>> +                   "Warning, clamping %s to %lld to prevent 
>> overflow\n", \
>> +                   #field, clamp); \
>> +            engine->props.field = clamp; \
>> +        } \
>> +    } while (0)
>> +
>> +    CLAMP_PROP(heartbeat_interval_ms);
>> +    CLAMP_PROP(max_busywait_duration_ns);
>> +    CLAMP_PROP(preempt_timeout_ms);
>> +    CLAMP_PROP(stop_timeout_ms);
>> +    CLAMP_PROP(timeslice_duration_ms);
>> +
>> +#undef CLAMP_PROP
>> +
>>       engine->defaults = engine->props; /* never to change again */
>>         engine->context_size = intel_engine_context_size(gt, 
>> engine->class);
>> @@ -421,6 +441,55 @@ static int intel_engine_setup(struct intel_gt 
>> *gt, enum intel_engine_id id,
>>       return 0;
>>   }
>>   +u64 intel_clamp_heartbeat_interval_ms(struct intel_engine_cs 
>> *engine, u64 value)
>> +{
>> +    value = min_t(u64, value, jiffies_to_msecs(MAX_SCHEDULE_TIMEOUT));
>> +
>> +    return value;
>> +}
>> +
>> +u64 intel_clamp_max_busywait_duration_ns(struct intel_engine_cs 
>> *engine, u64 value)
>> +{
>> +    value = min(value, jiffies_to_nsecs(2));
>> +
>> +    return value;
>> +}
>> +
>> +u64 intel_clamp_preempt_timeout_ms(struct intel_engine_cs *engine, 
>> u64 value)
>> +{
>> +    /*
>> +     * NB: The GuC API only supports 32bit values. However, the 
>> limit is further
>> +     * reduced due to internal calculations which would otherwise 
>> overflow.
>> +     */
>> +    if (intel_guc_submission_is_wanted(&engine->gt->uc.guc))
>> +        value = min_t(u64, value, GUC_POLICY_MAX_PREEMPT_TIMEOUT_MS);
>> +
>> +    value = min_t(u64, value, jiffies_to_msecs(MAX_SCHEDULE_TIMEOUT));
>> +
>> +    return value;
>> +}
>> +
>> +u64 intel_clamp_stop_timeout_ms(struct intel_engine_cs *engine, u64 
>> value)
>> +{
>> +    value = min_t(u64, value, jiffies_to_msecs(MAX_SCHEDULE_TIMEOUT));
>> +
>> +    return value;
>> +}
>> +
>> +u64 intel_clamp_timeslice_duration_ms(struct intel_engine_cs 
>> *engine, u64 value)
>> +{
>> +    /*
>> +     * NB: The GuC API only supports 32bit values. However, the 
>> limit is further
>> +     * reduced due to internal calculations which would otherwise 
>> overflow.
>> +     */
>> +    if (intel_guc_submission_is_wanted(&engine->gt->uc.guc))
>> +        value = min_t(u64, value, GUC_POLICY_MAX_EXEC_QUANTUM_MS);
>> +
>> +    value = min_t(u64, value, jiffies_to_msecs(MAX_SCHEDULE_TIMEOUT));
>> +
>> +    return value;
>> +}
>> +
>>   static void __setup_engine_capabilities(struct intel_engine_cs 
>> *engine)
>>   {
>>       struct drm_i915_private *i915 = engine->i915;
>> diff --git a/drivers/gpu/drm/i915/gt/sysfs_engines.c 
>> b/drivers/gpu/drm/i915/gt/sysfs_engines.c
>> index 967031056202..f2d9858d827c 100644
>> --- a/drivers/gpu/drm/i915/gt/sysfs_engines.c
>> +++ b/drivers/gpu/drm/i915/gt/sysfs_engines.c
>> @@ -144,7 +144,7 @@ max_spin_store(struct kobject *kobj, struct 
>> kobj_attribute *attr,
>>              const char *buf, size_t count)
>>   {
>>       struct intel_engine_cs *engine = kobj_to_engine(kobj);
>> -    unsigned long long duration;
>> +    unsigned long long duration, clamped;
>>       int err;
>>         /*
>> @@ -168,7 +168,8 @@ max_spin_store(struct kobject *kobj, struct 
>> kobj_attribute *attr,
>>       if (err)
>>           return err;
>>   -    if (duration > jiffies_to_nsecs(2))
>> +    clamped = intel_clamp_max_busywait_duration_ns(engine, duration);
>> +    if (duration != clamped)
>>           return -EINVAL;
>>         WRITE_ONCE(engine->props.max_busywait_duration_ns, duration);
>> @@ -203,7 +204,7 @@ timeslice_store(struct kobject *kobj, struct 
>> kobj_attribute *attr,
>>           const char *buf, size_t count)
>>   {
>>       struct intel_engine_cs *engine = kobj_to_engine(kobj);
>> -    unsigned long long duration;
>> +    unsigned long long duration, clamped;
>>       int err;
>>         /*
>> @@ -218,7 +219,8 @@ timeslice_store(struct kobject *kobj, struct 
>> kobj_attribute *attr,
>>       if (err)
>>           return err;
>>   -    if (duration > jiffies_to_msecs(MAX_SCHEDULE_TIMEOUT))
>> +    clamped = intel_clamp_timeslice_duration_ms(engine, duration);
>> +    if (duration != clamped)
>>           return -EINVAL;
>>         WRITE_ONCE(engine->props.timeslice_duration_ms, duration);
>> @@ -256,7 +258,7 @@ stop_store(struct kobject *kobj, struct 
>> kobj_attribute *attr,
>>          const char *buf, size_t count)
>>   {
>>       struct intel_engine_cs *engine = kobj_to_engine(kobj);
>> -    unsigned long long duration;
>> +    unsigned long long duration, clamped;
>>       int err;
>>         /*
>> @@ -272,7 +274,8 @@ stop_store(struct kobject *kobj, struct 
>> kobj_attribute *attr,
>>       if (err)
>>           return err;
>>   -    if (duration > jiffies_to_msecs(MAX_SCHEDULE_TIMEOUT))
>> +    clamped = intel_clamp_stop_timeout_ms(engine, duration);
>> +    if (duration != clamped)
>>           return -EINVAL;
>>         WRITE_ONCE(engine->props.stop_timeout_ms, duration);
>> @@ -306,7 +309,7 @@ preempt_timeout_store(struct kobject *kobj, 
>> struct kobj_attribute *attr,
>>                 const char *buf, size_t count)
>>   {
>>       struct intel_engine_cs *engine = kobj_to_engine(kobj);
>> -    unsigned long long timeout;
>> +    unsigned long long timeout, clamped;
>>       int err;
>>         /*
>> @@ -322,7 +325,8 @@ preempt_timeout_store(struct kobject *kobj, 
>> struct kobj_attribute *attr,
>>       if (err)
>>           return err;
>>   -    if (timeout > jiffies_to_msecs(MAX_SCHEDULE_TIMEOUT))
>> +    clamped = intel_clamp_preempt_timeout_ms(engine, timeout);
>> +    if (timeout != clamped)
>>           return -EINVAL;
>>         WRITE_ONCE(engine->props.preempt_timeout_ms, timeout);
>> @@ -362,7 +366,7 @@ heartbeat_store(struct kobject *kobj, struct 
>> kobj_attribute *attr,
>>           const char *buf, size_t count)
>>   {
>>       struct intel_engine_cs *engine = kobj_to_engine(kobj);
>> -    unsigned long long delay;
>> +    unsigned long long delay, clamped;
>>       int err;
>>         /*
>> @@ -379,7 +383,8 @@ heartbeat_store(struct kobject *kobj, struct 
>> kobj_attribute *attr,
>>       if (err)
>>           return err;
>>   -    if (delay >= jiffies_to_msecs(MAX_SCHEDULE_TIMEOUT))
>> +    clamped = intel_clamp_heartbeat_interval_ms(engine, delay);
>> +    if (delay != clamped)
>>           return -EINVAL;
>>         err = intel_engine_set_heartbeat(engine, delay);
>> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h 
>> b/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h
>> index 6a4612a852e2..ad131092f8df 100644
>> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h
>> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h
>> @@ -248,6 +248,15 @@ struct guc_lrc_desc {
>>     #define GLOBAL_POLICY_DEFAULT_DPC_PROMOTE_TIME_US 500000
>>   +/*
>> + * GuC converts the timeout to clock ticks internally. Different 
>> platforms have
>> + * different GuC clocks. Thus, the maximum value before overflow is 
>> platform
>> + * dependent. Current worst case scenario is about 110s. So, limit 
>> to 100s to be
>> + * safe.
>> + */
>> +#define GUC_POLICY_MAX_EXEC_QUANTUM_MS        (100 * 1000)
>> +#define GUC_POLICY_MAX_PREEMPT_TIMEOUT_MS    (100 * 1000)
>> +
>>   struct guc_policies {
>>       u32 submission_queue_depth[GUC_MAX_ENGINE_CLASSES];
>>       /* In micro seconds. How much time to allow before DPC 
>> processing is
>
> LGTM. Pretty please:
>
> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c 
> b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> index b3a429a92c0d..8208164c25e7 100644
> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> @@ -2218,13 +2218,24 @@ static inline u32 
> get_children_join_value(struct intel_context *ce,
>  static void guc_context_policy_init(struct intel_engine_cs *engine,
>                                     struct guc_lrc_desc *desc)
>  {
> +       struct drm_device *drm = &engine->i915->drm;
> +
>         desc->policy_flags = 0;
>
>         if (engine->flags & I915_ENGINE_WANT_FORCED_PREEMPTION)
>                 desc->policy_flags |= 
> CONTEXT_POLICY_FLAG_PREEMPT_TO_IDLE;
>
>         /* NB: For both of these, zero means disabled. */
> +       if (overflows_type(engine->props.timeslice_duration_ms * 1000,
> +                          desc->execution_quantum))
> +               drm_warn_once(drm, "GuC interface cannot support %lums 
> timeslice!\n",
> + engine->props.timeslice_duration_ms);
>         desc->execution_quantum = engine->props.timeslice_duration_ms 
> * 1000;
> +
> +       if (overflows_type(engine->props.preempt_timeout_ms * 1000,
> +                          desc->preemption_timeout))
> +               drm_warn_once(drm, "GuC interface cannot support %lums 
> preemption timeout!\n",
> + engine->props.preempt_timeout_ms);
>         desc->preemption_timeout = engine->props.preempt_timeout_ms * 
> 1000;
>  }
>
As per comments in other thread, this is redundant code and is too late 
in the chain to do anything. If such a check is present then should be a 
BUG not a drm_warn. The whole point of the helper function is to 
consolidate all limits in one place at the point where it is still 
possible to correct for problems.

> With that:
>
> Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
>
> Any idea what happened with the CI run? It's full of odd failures.
Not sure where 'rev3' came from, it's the 'rev2' code and I don't recall 
hitting the 'retest' button. Either way, that version just hit a merge 
conflict and didn't build. The previous two revs seemed to be hitting 
'failed to reset' problems in an execlist selftest on execlist only 
platforms. I'm not seeing how that could be related. The changes should 
only kick in on Gen12 platforms and/or with GuC submission but the 
failures are all on earlier platforms. I'm planning to grab a SKL or 
something to test on to make sure, I just haven't had the time to do so yet.

John.

>
> Regards,
>
> Tvrtko