[Intel-gfx] [PATCH i-g-t 6/7] gem_wsim: Busy stats balancers

Mon Sep 25 16:44:14 UTC 2017

Quoting Tvrtko Ursulin (2017-09-25 16:15:01)
> From: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
> 
> Add busy and busy-avg balancers which make balancing
> decisions by looking at engine busyness via the i915 PMU.

"And thus are able to make decisions on the actual instantaneous load of
the system, and not use metrics that lag behind by a batch or two. In
doing so, each client should be able to greedily maximise their own
usage of the system, leading to improved load balancing even in the face
of other uncooperative clients. On the other hand, we are only using the
instantaneous load without coupling in the predictive factor for
dispatch and execution length."

Hmm, do you not want to sum busy + queued? Or at least compare! :)

> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
> ---
>  benchmarks/gem_wsim.c | 140 ++++++++++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 140 insertions(+)
> 
> diff --git a/benchmarks/gem_wsim.c b/benchmarks/gem_wsim.c
> index 82fe6ba9ec5f..9ee91fabb220 100644
> --- a/benchmarks/gem_wsim.c
> +++ b/benchmarks/gem_wsim.c
> @@ -50,6 +50,7 @@
>  #include "intel_io.h"
>  #include "igt_aux.h"
>  #include "igt_rand.h"
> +#include "igt_perf.h"
>  #include "sw_sync.h"
>  
>  #include "ewma.h"
> @@ -188,6 +189,16 @@ struct workload
>                         uint32_t last[NUM_ENGINES];
>                 } rt;
>         };
> +
> +       struct busy_balancer {
> +               int fd;
> +               bool first;
> +               unsigned int num_engines;
> +               unsigned int engine_map[5];
> +               uint64_t t_prev;
> +               uint64_t prev[5];
> +               double busy[5];
> +       } busy_balancer;
>  };
>  
>  static const unsigned int nop_calibration_us = 1000;
> @@ -993,6 +1004,8 @@ struct workload_balancer {
>         unsigned int flags;
>         unsigned int min_gen;
>  
> +       int (*init)(const struct workload_balancer *balancer,
> +                   struct workload *wrk);
>         unsigned int (*get_qd)(const struct workload_balancer *balancer,
>                                struct workload *wrk,
>                                enum intel_engine_id engine);
> @@ -1242,6 +1255,106 @@ context_balance(const struct workload_balancer *balancer,
>         return get_vcs_engine(wrk->ctx_list[w->context].static_vcs);
>  }
>  
> +static unsigned int
> +get_engine_busy(const struct workload_balancer *balancer,
> +               struct workload *wrk, enum intel_engine_id engine)
> +{
> +       struct busy_balancer *bb = &wrk->busy_balancer;
> +
> +       if (engine == VCS2 && (wrk->flags & VCS2REMAP))
> +               engine = BCS;
> +
> +       return bb->busy[bb->engine_map[engine]];
> +}
> +
> +static void get_stats(const struct workload_balancer *b, struct workload *wrk)

s/get_stats/get_pmu_stats/ ?

> +{
> +       struct busy_balancer *bb = &wrk->busy_balancer;
> +       uint64_t val[7];
> +       unsigned int i;
> +
> +       igt_assert_eq(read(bb->fd, val, sizeof(val)), sizeof(val));
> +
> +       if (!bb->first) {
> +               for (i = 0; i < bb->num_engines; i++) {
> +                       double d;
> +
> +                       d = (val[2 + i] - bb->prev[i]) * 100;
> +                       d /= val[1] - bb->t_prev;
> +                       bb->busy[i] = d;
> +               }
> +       }
> +
> +       for (i = 0; i < bb->num_engines; i++)
> +               bb->prev[i] = val[2 + i];
> +
> +       bb->t_prev = val[1];
> +       bb->first = false;

Ok.

> +}
> +
> +static enum intel_engine_id
> +busy_avg_balance(const struct workload_balancer *balancer,
> +                struct workload *wrk, struct w_step *w)
> +{
> +       get_stats(balancer, wrk);
> +
> +       return qdavg_balance(balancer, wrk, w);
> +}
> +
> +static enum intel_engine_id
> +busy_balance(const struct workload_balancer *balancer,
> +            struct workload *wrk, struct w_step *w)
> +{
> +       get_stats(balancer, wrk);
> +
> +       return qd_balance(balancer, wrk, w);
> +}
> +
> +static int
> +busy_init(const struct workload_balancer *balancer, struct workload *wrk)
> +{
> +       struct busy_balancer *bb = &wrk->busy_balancer;
> +       struct engine_desc {
> +               unsigned class, inst;
> +               enum intel_engine_id id;
> +       } *d, engines[] = {
> +               { I915_ENGINE_CLASS_RENDER, 0, RCS },
> +               { I915_ENGINE_CLASS_COPY, 0, BCS },
> +               { I915_ENGINE_CLASS_VIDEO, 0, VCS1 },
> +               { I915_ENGINE_CLASS_VIDEO, 1, VCS2 },
> +               { I915_ENGINE_CLASS_VIDEO_ENHANCE, 0, VECS },
> +               { 0, 0, VCS }
> +       };
> +
> +       bb->num_engines = 0;
> +       bb->first = true;
> +       bb->fd = -1;
> +
> +       for (d = &engines[0]; d->id != VCS; d++) {
> +               int pfd;
> +
> +               pfd = perf_i915_open_group(I915_PMU_ENGINE_BUSY(d->class,
> +                                                               d->inst),
> +                                          bb->fd);
> +               if (pfd < 0) {
> +                       if (d->id != VCS2)
> +                               return -(10 + bb->num_engines);
> +                       else
> +                               continue;
> +               }
> +
> +               if (bb->num_engines == 0)
> +                       bb->fd = pfd;
> +
> +               bb->engine_map[d->id] = bb->num_engines++;
> +       }
> +
> +       if (bb->num_engines < 5 && !(wrk->flags & VCS2REMAP))
> +               return -1;

Hmm, feels a little sloppy. Would be more concrete if we have a list of
engines were going to use, and then we either created a perf event for
each or died.

Reviewed-by: Chris Wilson <chris at chris-wilson.co.uk>
-Chris