[Intel-gfx] [PATCH i-g-t 6/7] gem_wsim: Busy stats balancers
Tvrtko Ursulin
tvrtko.ursulin at linux.intel.com
Tue Sep 26 11:27:33 UTC 2017
On 25/09/2017 17:44, Chris Wilson wrote:
> Quoting Tvrtko Ursulin (2017-09-25 16:15:01)
>> From: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
>>
>> Add busy and busy-avg balancers which make balancing
>> decisions by looking at engine busyness via the i915 PMU.
>
> "And thus are able to make decisions on the actual instantaneous load of
> the system, and not use metrics that lag behind by a batch or two. In
> doing so, each client should be able to greedily maximise their own
> usage of the system, leading to improved load balancing even in the face
> of other uncooperative clients. On the other hand, we are only using the
> instantaneous load without coupling in the predictive factor for
> dispatch and execution length."
Ok, thanks for the text.
> Hmm, do you not want to sum busy + queued? Or at least compare! :)
How to add apples and oranges? :) Queued * busy [0.0 - 1.0] ?
>> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
>> ---
>> benchmarks/gem_wsim.c | 140 ++++++++++++++++++++++++++++++++++++++++++++++++++
>> 1 file changed, 140 insertions(+)
>>
>> diff --git a/benchmarks/gem_wsim.c b/benchmarks/gem_wsim.c
>> index 82fe6ba9ec5f..9ee91fabb220 100644
>> --- a/benchmarks/gem_wsim.c
>> +++ b/benchmarks/gem_wsim.c
>> @@ -50,6 +50,7 @@
>> #include "intel_io.h"
>> #include "igt_aux.h"
>> #include "igt_rand.h"
>> +#include "igt_perf.h"
>> #include "sw_sync.h"
>>
>> #include "ewma.h"
>> @@ -188,6 +189,16 @@ struct workload
>> uint32_t last[NUM_ENGINES];
>> } rt;
>> };
>> +
>> + struct busy_balancer {
>> + int fd;
>> + bool first;
>> + unsigned int num_engines;
>> + unsigned int engine_map[5];
>> + uint64_t t_prev;
>> + uint64_t prev[5];
>> + double busy[5];
>> + } busy_balancer;
>> };
>>
>> static const unsigned int nop_calibration_us = 1000;
>> @@ -993,6 +1004,8 @@ struct workload_balancer {
>> unsigned int flags;
>> unsigned int min_gen;
>>
>> + int (*init)(const struct workload_balancer *balancer,
>> + struct workload *wrk);
>> unsigned int (*get_qd)(const struct workload_balancer *balancer,
>> struct workload *wrk,
>> enum intel_engine_id engine);
>> @@ -1242,6 +1255,106 @@ context_balance(const struct workload_balancer *balancer,
>> return get_vcs_engine(wrk->ctx_list[w->context].static_vcs);
>> }
>>
>> +static unsigned int
>> +get_engine_busy(const struct workload_balancer *balancer,
>> + struct workload *wrk, enum intel_engine_id engine)
>> +{
>> + struct busy_balancer *bb = &wrk->busy_balancer;
>> +
>> + if (engine == VCS2 && (wrk->flags & VCS2REMAP))
>> + engine = BCS;
>> +
>> + return bb->busy[bb->engine_map[engine]];
>> +}
>> +
>> +static void get_stats(const struct workload_balancer *b, struct workload *wrk)
>
> s/get_stats/get_pmu_stats/ ?
Ok.
>
>> +{
>> + struct busy_balancer *bb = &wrk->busy_balancer;
>> + uint64_t val[7];
>> + unsigned int i;
>> +
>> + igt_assert_eq(read(bb->fd, val, sizeof(val)), sizeof(val));
>> +
>> + if (!bb->first) {
>> + for (i = 0; i < bb->num_engines; i++) {
>> + double d;
>> +
>> + d = (val[2 + i] - bb->prev[i]) * 100;
>> + d /= val[1] - bb->t_prev;
>> + bb->busy[i] = d;
>> + }
>> + }
>> +
>> + for (i = 0; i < bb->num_engines; i++)
>> + bb->prev[i] = val[2 + i];
>> +
>> + bb->t_prev = val[1];
>> + bb->first = false;
>
> Ok.
Although normalizing to [0.0 - 100.0] is only helpful if one wants to
look at the numbers.
>
>> +}
>> +
>> +static enum intel_engine_id
>> +busy_avg_balance(const struct workload_balancer *balancer,
>> + struct workload *wrk, struct w_step *w)
>> +{
>> + get_stats(balancer, wrk);
>> +
>> + return qdavg_balance(balancer, wrk, w);
>> +}
>> +
>> +static enum intel_engine_id
>> +busy_balance(const struct workload_balancer *balancer,
>> + struct workload *wrk, struct w_step *w)
>> +{
>> + get_stats(balancer, wrk);
>> +
>> + return qd_balance(balancer, wrk, w);
>> +}
>> +
>> +static int
>> +busy_init(const struct workload_balancer *balancer, struct workload *wrk)
>> +{
>> + struct busy_balancer *bb = &wrk->busy_balancer;
>> + struct engine_desc {
>> + unsigned class, inst;
>> + enum intel_engine_id id;
>> + } *d, engines[] = {
>> + { I915_ENGINE_CLASS_RENDER, 0, RCS },
>> + { I915_ENGINE_CLASS_COPY, 0, BCS },
>> + { I915_ENGINE_CLASS_VIDEO, 0, VCS1 },
>> + { I915_ENGINE_CLASS_VIDEO, 1, VCS2 },
>> + { I915_ENGINE_CLASS_VIDEO_ENHANCE, 0, VECS },
>> + { 0, 0, VCS }
>> + };
>> +
>> + bb->num_engines = 0;
>> + bb->first = true;
>> + bb->fd = -1;
>> +
>> + for (d = &engines[0]; d->id != VCS; d++) {
>> + int pfd;
>> +
>> + pfd = perf_i915_open_group(I915_PMU_ENGINE_BUSY(d->class,
>> + d->inst),
>> + bb->fd);
>> + if (pfd < 0) {
>> + if (d->id != VCS2)
>> + return -(10 + bb->num_engines);
>> + else
>> + continue;
>> + }
>> +
>> + if (bb->num_engines == 0)
>> + bb->fd = pfd;
>> +
>> + bb->engine_map[d->id] = bb->num_engines++;
>> + }
>> +
>> + if (bb->num_engines < 5 && !(wrk->flags & VCS2REMAP))
>> + return -1;
>
> Hmm, feels a little sloppy. Would be more concrete if we have a list of
> engines were going to use, and then we either created a perf event for
> each or died.
It is only a theoretical flaw, even though balancer init wo/ VCS2 would
succeed and would be reporting invalid data for it, the tool would fail
when trying to submit to VCS2 so no harm done.
>
> Reviewed-by: Chris Wilson <chris at chris-wilson.co.uk>
Thanks, I'll see if I will find the time to beautify it wrt/ the above
at some point.
Regards,
Tvrtko
More information about the Intel-gfx
mailing list