[Intel-gfx] [PATCH i-g-t 6/7] gem_wsim: Busy stats balancers

Tue Sep 26 11:27:33 UTC 2017

On 25/09/2017 17:44, Chris Wilson wrote:
> Quoting Tvrtko Ursulin (2017-09-25 16:15:01)
>> From: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
>>
>> Add busy and busy-avg balancers which make balancing
>> decisions by looking at engine busyness via the i915 PMU.
> 
> "And thus are able to make decisions on the actual instantaneous load of
> the system, and not use metrics that lag behind by a batch or two. In
> doing so, each client should be able to greedily maximise their own
> usage of the system, leading to improved load balancing even in the face
> of other uncooperative clients. On the other hand, we are only using the
> instantaneous load without coupling in the predictive factor for
> dispatch and execution length."

Ok, thanks for the text.

> Hmm, do you not want to sum busy + queued? Or at least compare! :)

How to add apples and oranges? :) Queued * busy [0.0 - 1.0] ?

>> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
>> ---
>>   benchmarks/gem_wsim.c | 140 ++++++++++++++++++++++++++++++++++++++++++++++++++
>>   1 file changed, 140 insertions(+)
>>
>> diff --git a/benchmarks/gem_wsim.c b/benchmarks/gem_wsim.c
>> index 82fe6ba9ec5f..9ee91fabb220 100644
>> --- a/benchmarks/gem_wsim.c
>> +++ b/benchmarks/gem_wsim.c
>> @@ -50,6 +50,7 @@
>>   #include "intel_io.h"
>>   #include "igt_aux.h"
>>   #include "igt_rand.h"
>> +#include "igt_perf.h"
>>   #include "sw_sync.h"
>>   
>>   #include "ewma.h"
>> @@ -188,6 +189,16 @@ struct workload
>>                          uint32_t last[NUM_ENGINES];
>>                  } rt;
>>          };
>> +
>> +       struct busy_balancer {
>> +               int fd;
>> +               bool first;
>> +               unsigned int num_engines;
>> +               unsigned int engine_map[5];
>> +               uint64_t t_prev;
>> +               uint64_t prev[5];
>> +               double busy[5];
>> +       } busy_balancer;
>>   };
>>   
>>   static const unsigned int nop_calibration_us = 1000;
>> @@ -993,6 +1004,8 @@ struct workload_balancer {
>>          unsigned int flags;
>>          unsigned int min_gen;
>>   
>> +       int (*init)(const struct workload_balancer *balancer,
>> +                   struct workload *wrk);
>>          unsigned int (*get_qd)(const struct workload_balancer *balancer,
>>                                 struct workload *wrk,
>>                                 enum intel_engine_id engine);
>> @@ -1242,6 +1255,106 @@ context_balance(const struct workload_balancer *balancer,
>>          return get_vcs_engine(wrk->ctx_list[w->context].static_vcs);
>>   }
>>   
>> +static unsigned int
>> +get_engine_busy(const struct workload_balancer *balancer,
>> +               struct workload *wrk, enum intel_engine_id engine)
>> +{
>> +       struct busy_balancer *bb = &wrk->busy_balancer;
>> +
>> +       if (engine == VCS2 && (wrk->flags & VCS2REMAP))
>> +               engine = BCS;
>> +
>> +       return bb->busy[bb->engine_map[engine]];
>> +}
>> +
>> +static void get_stats(const struct workload_balancer *b, struct workload *wrk)
> 
> s/get_stats/get_pmu_stats/ ?

Ok.

> 
>> +{
>> +       struct busy_balancer *bb = &wrk->busy_balancer;
>> +       uint64_t val[7];
>> +       unsigned int i;
>> +
>> +       igt_assert_eq(read(bb->fd, val, sizeof(val)), sizeof(val));
>> +
>> +       if (!bb->first) {
>> +               for (i = 0; i < bb->num_engines; i++) {
>> +                       double d;
>> +
>> +                       d = (val[2 + i] - bb->prev[i]) * 100;
>> +                       d /= val[1] - bb->t_prev;
>> +                       bb->busy[i] = d;
>> +               }
>> +       }
>> +
>> +       for (i = 0; i < bb->num_engines; i++)
>> +               bb->prev[i] = val[2 + i];
>> +
>> +       bb->t_prev = val[1];
>> +       bb->first = false;
> 
> Ok.

Although normalizing to [0.0 - 100.0] is only helpful if one wants to 
look at the numbers.

> 
>> +}
>> +
>> +static enum intel_engine_id
>> +busy_avg_balance(const struct workload_balancer *balancer,
>> +                struct workload *wrk, struct w_step *w)
>> +{
>> +       get_stats(balancer, wrk);
>> +
>> +       return qdavg_balance(balancer, wrk, w);
>> +}
>> +
>> +static enum intel_engine_id
>> +busy_balance(const struct workload_balancer *balancer,
>> +            struct workload *wrk, struct w_step *w)
>> +{
>> +       get_stats(balancer, wrk);
>> +
>> +       return qd_balance(balancer, wrk, w);
>> +}
>> +
>> +static int
>> +busy_init(const struct workload_balancer *balancer, struct workload *wrk)
>> +{
>> +       struct busy_balancer *bb = &wrk->busy_balancer;
>> +       struct engine_desc {
>> +               unsigned class, inst;
>> +               enum intel_engine_id id;
>> +       } *d, engines[] = {
>> +               { I915_ENGINE_CLASS_RENDER, 0, RCS },
>> +               { I915_ENGINE_CLASS_COPY, 0, BCS },
>> +               { I915_ENGINE_CLASS_VIDEO, 0, VCS1 },
>> +               { I915_ENGINE_CLASS_VIDEO, 1, VCS2 },
>> +               { I915_ENGINE_CLASS_VIDEO_ENHANCE, 0, VECS },
>> +               { 0, 0, VCS }
>> +       };
>> +
>> +       bb->num_engines = 0;
>> +       bb->first = true;
>> +       bb->fd = -1;
>> +
>> +       for (d = &engines[0]; d->id != VCS; d++) {
>> +               int pfd;
>> +
>> +               pfd = perf_i915_open_group(I915_PMU_ENGINE_BUSY(d->class,
>> +                                                               d->inst),
>> +                                          bb->fd);
>> +               if (pfd < 0) {
>> +                       if (d->id != VCS2)
>> +                               return -(10 + bb->num_engines);
>> +                       else
>> +                               continue;
>> +               }
>> +
>> +               if (bb->num_engines == 0)
>> +                       bb->fd = pfd;
>> +
>> +               bb->engine_map[d->id] = bb->num_engines++;
>> +       }
>> +
>> +       if (bb->num_engines < 5 && !(wrk->flags & VCS2REMAP))
>> +               return -1;
> 
> Hmm, feels a little sloppy. Would be more concrete if we have a list of
> engines were going to use, and then we either created a perf event for
> each or died.

It is only a theoretical flaw, even though balancer init wo/ VCS2 would 
succeed and would be reporting invalid data for it, the tool would fail 
when trying to submit to VCS2 so no harm done.

> 
> Reviewed-by: Chris Wilson <chris at chris-wilson.co.uk>

Thanks, I'll see if I will find the time to beautify it wrt/ the above 
at some point.

Regards,

Tvrtko