[Intel-gfx] [PATCH i-g-t] gem_wsim: Use CTX_TIMESTAMP for timed spinners
Tvrtko Ursulin
tvrtko.ursulin at linux.intel.com
Mon Nov 2 17:14:24 UTC 2020
On 02/11/2020 15:33, Chris Wilson wrote:
> Use MI_MATH and MI_COND_BBE we can construct a loop that runs for a
> precise number of clock cycles, as measured by the CTX_TIMESTAMP. We use
> the CTX_TIMESTAMP (as opposed to the CS_TIMESTAMP) so that the elapsed
> time is measured local to the context, and the length of the batch is
> unaffected by preemption. Since the clock ticks at a known frequency, we
> can directly translate the batch durations into cycles and so remove the
> requirement for nop calibration, and the often excessively large nop
> batches.
>
> The downside to this is that we need to use engine local registers, and
> before gen11 there is no support in the CS for relative mmio and so this
> technique does not support transparent load balancing on a virtual
> engine before Icelake.
I am enthusiastic, just that I don't have a local Gen11+ DUT but that's
secondary.
> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
> ---
> benchmarks/gem_wsim.c | 524 ++++++++++++++----------------------------
> 1 file changed, 169 insertions(+), 355 deletions(-)
>
> diff --git a/benchmarks/gem_wsim.c b/benchmarks/gem_wsim.c
> index dbb46b9aa..5d67468d1 100644
> --- a/benchmarks/gem_wsim.c
> +++ b/benchmarks/gem_wsim.c
> @@ -176,10 +176,9 @@ struct w_step
>
> struct drm_i915_gem_execbuffer2 eb;
> struct drm_i915_gem_exec_object2 *obj;
> - struct drm_i915_gem_relocation_entry reloc[1];
> - unsigned long bb_sz;
> + struct drm_i915_gem_relocation_entry reloc[3];
> uint32_t bb_handle;
> - uint32_t *recursive_bb_start;
> + uint32_t *bb_duration;
> };
>
> struct ctx {
> @@ -227,10 +226,6 @@ struct workload
> unsigned int nrequest[NUM_ENGINES];
> };
>
> -static const unsigned int nop_calibration_us = 1000;
> -static bool has_nop_calibration = false;
> -static bool sequential = true;
> -
> static unsigned int master_prng;
>
> static int verbose = 1;
> @@ -253,59 +248,67 @@ static const char *ring_str_map[NUM_ENGINES] = {
> [VECS] = "VECS",
> };
>
> -/* stores calibrations for particular engines */
> -static unsigned long engine_calib_map[NUM_ENGINES];
> -
> -static enum intel_engine_id
> -ci_to_engine_id(int class, int instance)
> -{
> - static const struct {
> - int class;
> - int instance;
> - unsigned int id;
> - } map[] = {
> - { I915_ENGINE_CLASS_RENDER, 0, RCS },
> - { I915_ENGINE_CLASS_COPY, 0, BCS },
> - { I915_ENGINE_CLASS_VIDEO, 0, VCS1 },
> - { I915_ENGINE_CLASS_VIDEO, 1, VCS2 },
> - { I915_ENGINE_CLASS_VIDEO, 2, VCS2 }, /* FIXME/ICL */
> - { I915_ENGINE_CLASS_VIDEO_ENHANCE, 0, VECS },
> +static int read_timestamp_frequency(int i915)
> +{
> + int value = 0;
> + drm_i915_getparam_t gp = {
> + .value = &value,
> + .param = I915_PARAM_CS_TIMESTAMP_FREQUENCY,
> };
> -
> - unsigned int i;
> -
> - for (i = 0; i < ARRAY_SIZE(map); i++) {
> - if (class == map[i].class && instance == map[i].instance)
> - return map[i].id;
> - }
> - return -1;
> + ioctl(i915, DRM_IOCTL_I915_GETPARAM, &gp);
> + return value;
> }
>
> -static void
> -apply_unset_calibrations(unsigned long raw_number)
> +static uint64_t div64_u64_round_up(uint64_t x, uint64_t y)
> {
> - for (int i = 0; i < NUM_ENGINES; i++)
> - engine_calib_map[i] += engine_calib_map[i] ? 0 : raw_number;
> + return (x + y - 1) / y;
> }
>
> -static void
> -print_engine_calibrations(void)
> +static uint64_t ns_to_ctx_ticks(uint64_t ns)
> {
> - bool first_entry = true;
> + static long f;
>
> - printf("Nop calibration for %uus delay is: ", nop_calibration_us);
> - for (int i = 0; i < NUM_ENGINES; i++) {
> - /* skip engines not present and DEFAULT and VCS */
> - if (i != DEFAULT && i != VCS && engine_calib_map[i]) {
> - if (first_entry) {
> - printf("%s=%lu", ring_str_map[i], engine_calib_map[i]);
> - first_entry = false;
> - } else {
> - printf(",%s=%lu", ring_str_map[i], engine_calib_map[i]);
> - }
> - }
> + if (!f) {
> + f = read_timestamp_frequency(fd);
> + if (intel_gen(intel_get_drm_devid(fd)) == 11)
> + f = 12500000; /* icl!!! are you feeling alright? */
What does the comment refer to?
Should there be an assert here if < gen11?
> }
> - printf("\n");
> +
> + return div64_u64_round_up(ns * f, NSEC_PER_SEC);
> +}
> +
> +#define MI_INSTR(opcode, flags) (((opcode) << 23) | (flags))
> +
> +#define MI_MATH(x) MI_INSTR(0x1a, (x) - 1)
> +#define MI_MATH_INSTR(opcode, op1, op2) ((opcode) << 20 | (op1) << 10 | (op2))
> +/* Opcodes for MI_MATH_INSTR */
> +#define MI_MATH_NOOP MI_MATH_INSTR(0x000, 0x0, 0x0)
> +#define MI_MATH_LOAD(op1, op2) MI_MATH_INSTR(0x080, op1, op2)
> +#define MI_MATH_LOADINV(op1, op2) MI_MATH_INSTR(0x480, op1, op2)
> +#define MI_MATH_LOAD0(op1) MI_MATH_INSTR(0x081, op1)
> +#define MI_MATH_LOAD1(op1) MI_MATH_INSTR(0x481, op1)
> +#define MI_MATH_ADD MI_MATH_INSTR(0x100, 0x0, 0x0)
> +#define MI_MATH_SUB MI_MATH_INSTR(0x101, 0x0, 0x0)
> +#define MI_MATH_AND MI_MATH_INSTR(0x102, 0x0, 0x0)
> +#define MI_MATH_OR MI_MATH_INSTR(0x103, 0x0, 0x0)
> +#define MI_MATH_XOR MI_MATH_INSTR(0x104, 0x0, 0x0)
> +#define MI_MATH_STORE(op1, op2) MI_MATH_INSTR(0x180, op1, op2)
> +#define MI_MATH_STOREINV(op1, op2) MI_MATH_INSTR(0x580, op1, op2)
> +/* Registers used as operands in MI_MATH_INSTR */
> +#define MI_MATH_REG(x) (x)
> +#define MI_MATH_REG_SRCA 0x20
> +#define MI_MATH_REG_SRCB 0x21
> +#define MI_MATH_REG_ACCU 0x31
> +#define MI_MATH_REG_ZF 0x32
> +#define MI_MATH_REG_CF 0x33
> +
> +#define MI_LOAD_REGISTER_REG MI_INSTR(0x2A, 1)
> +#define MI_CS_MMIO_DST BIT(19)
> +#define MI_CS_MMIO_SRC BIT(18)
> +
> +static unsigned int offset_in_page(void *addr)
> +{
> + return (uintptr_t)addr & 4095;
> }
>
> static void add_dep(struct deps *deps, struct dep_entry entry)
> @@ -1392,91 +1395,116 @@ __get_ctx(struct workload *wrk, const struct w_step *w)
> return &wrk->ctx_list[w->context];
> }
>
> -static unsigned long
> -__get_bb_sz(const struct w_step *w, unsigned int duration)
> -{
> - enum intel_engine_id engine = w->engine;
> - struct ctx *ctx = __get_ctx(w->wrk, w);
> - unsigned long d;
> -
> - if (ctx->engine_map && engine == DEFAULT)
> - /* Assume first engine calibration. */
> - engine = ctx->engine_map[0];
> -
> - igt_assert(engine_calib_map[engine]);
> - d = ALIGN(duration * engine_calib_map[engine] * sizeof(uint32_t) /
> - nop_calibration_us,
> - sizeof(uint32_t));
> -
> - return d;
> -}
> -
> -static unsigned long
> -get_bb_sz(const struct w_step *w, unsigned int duration)
> +static uint32_t mmio_base(int i915, enum intel_engine_id engine, int gen)
> {
> - unsigned long d = __get_bb_sz(w, duration);
> -
> - igt_assert(d);
> + const char *name;
>
> - return d;
> -}
> + if (gen >= 11)
> + return 0;
>
> -static void init_bb(struct w_step *w)
> -{
> - const unsigned int arb_period =
> - __get_bb_sz(w, w->preempt_us) / sizeof(uint32_t);
> - const unsigned int mmap_len = ALIGN(w->bb_sz, 4096);
> - unsigned int i;
> - uint32_t *ptr;
> + switch (engine) {
> + case NUM_ENGINES:
> + default:
> + return 0;
>
> - if (w->unbound_duration || !arb_period)
> - return;
> + case DEFAULT:
> + case RCS:
> + name = "rcs0";
> + break;
>
> - gem_set_domain(fd, w->bb_handle,
> - I915_GEM_DOMAIN_WC, I915_GEM_DOMAIN_WC);
> + case BCS:
> + name = "bcs0";
> + break;
>
> - ptr = gem_mmap__wc(fd, w->bb_handle, 0, mmap_len, PROT_WRITE);
> + case VCS:
> + case VCS1:
> + name = "vcs0";
> + break;
> + case VCS2:
> + name = "vcs1";
> + break;
>
> - for (i = arb_period; i < w->bb_sz / sizeof(uint32_t); i += arb_period)
> - ptr[i] = 0x5 << 23; /* MI_ARB_CHK */
> + case VECS:
> + name = "vecs0";
> + break;
> + }
>
> - munmap(ptr, mmap_len);
> + return gem_engine_mmio_base(i915, name);
Why is mmio base needed if relative addressing is used? Maybe I'll
figure it out after reading further.
> }
>
> -static unsigned int terminate_bb(struct w_step *w)
> +static unsigned int create_bb(struct w_step *w, int self)
> {
> - const uint32_t bbe = 0xa << 23;
> - unsigned long mmap_start, mmap_len;
> - unsigned long batch_start = w->bb_sz;
> + const int gen = intel_gen(intel_get_drm_devid(fd));
> + const uint32_t base = mmio_base(fd, w->engine, gen);
> +#define CS_GPR(x) (base + 0x600 + 8 * (x))
> +#define TIMESTAMP (base + 0x3a8)
> + const int use_64b = gen >= 8;
> + enum { START_TS, NOW_TS };
> + uint32_t *ptr, *cs, *jmp;
> unsigned int r = 0;
> - uint32_t *ptr, *cs;
> -
> - batch_start -= sizeof(uint32_t); /* bbend */
> -
> - if (w->unbound_duration)
> - batch_start -= 4 * sizeof(uint32_t); /* MI_ARB_CHK + MI_BATCH_BUFFER_START */
> -
> - mmap_start = rounddown(batch_start, PAGE_SIZE);
> - mmap_len = ALIGN(w->bb_sz - mmap_start, PAGE_SIZE);
>
> gem_set_domain(fd, w->bb_handle,
> I915_GEM_DOMAIN_WC, I915_GEM_DOMAIN_WC);
>
> - ptr = gem_mmap__wc(fd, w->bb_handle, mmap_start, mmap_len, PROT_WRITE);
> - cs = (uint32_t *)((char *)ptr + batch_start - mmap_start);
> + cs = ptr = gem_mmap__wc(fd, w->bb_handle, 0, 4096, PROT_WRITE);
>
> - if (w->unbound_duration) {
> - w->reloc[r++].offset = batch_start + 2 * sizeof(uint32_t);
> - batch_start += 4 * sizeof(uint32_t);
> + *cs++ = MI_LOAD_REGISTER_IMM | MI_CS_MMIO_DST;
> + *cs++ = CS_GPR(START_TS) + 4;
What is "+ 4"?
> + *cs++ = 0;
> + *cs++ = MI_LOAD_REGISTER_REG | MI_CS_MMIO_DST | MI_CS_MMIO_SRC;
> + *cs++ = TIMESTAMP;
> + *cs++ = CS_GPR(START_TS);
>
> - *cs++ = w->preempt_us ? 0x5 << 23 /* MI_ARB_CHK; */ : MI_NOOP;
> - w->recursive_bb_start = cs;
> - *cs++ = MI_BATCH_BUFFER_START | 1 << 8 | 1;
> + if (offset_in_page(cs) & 4)
> *cs++ = 0;
> + jmp = cs;
> +
> + if (w->preempt_us)
> + *cs++ = 0x5 << 23; /* MI_ARB_CHECK */
> +
> + *cs++ = MI_LOAD_REGISTER_IMM | MI_CS_MMIO_DST;
> + *cs++ = CS_GPR(NOW_TS) + 4;
> + *cs++ = 0;
> + *cs++ = MI_LOAD_REGISTER_REG | MI_CS_MMIO_DST | MI_CS_MMIO_SRC;
> + *cs++ = TIMESTAMP;
> + *cs++ = CS_GPR(NOW_TS);
> +
> + *cs++ = MI_MATH(4);
> + *cs++ = MI_MATH_LOAD(MI_MATH_REG_SRCA, MI_MATH_REG(NOW_TS));
> + *cs++ = MI_MATH_LOAD(MI_MATH_REG_SRCB, MI_MATH_REG(START_TS));
MI_MATH_REG is aliased to CS_GPR?
> + *cs++ = MI_MATH_SUB;
> + *cs++ = MI_MATH_STOREINV(MI_MATH_REG(NOW_TS), MI_MATH_REG_ACCU);
> +
> + *cs++ = 0x24 << 23 | (1 + use_64b) | MI_CS_MMIO_DST; /* SRM */
All others have nice defines but SRM, any special reason?
> + *cs++ = CS_GPR(NOW_TS);
> + w->reloc[r].target_handle = self;
> + w->reloc[r].offset = offset_in_page(cs);
> + *cs++ = w->reloc[r].delta = 4000;
> + *cs++ = 0;
> + r++;
> +
> + /* Delay between SRM and COND_BBE to post the writes */
> + for (int n = 0; n < 8; n++) {
> + *cs++ = MI_INSTR(0x21, 1);
> + *cs++ = 2048;
> *cs++ = 0;
Whats this instruction? Add a define so it is self-documenting?
> }
>
> - *cs = bbe;
> + *cs++ = MI_COND_BATCH_BUFFER_END | MI_DO_COMPARE | (1 + use_64b);
> + w->bb_duration = cs;
> + *cs++ = 0;
> + w->reloc[r].target_handle = self;
> + w->reloc[r].offset = offset_in_page(cs);
> + *cs++ = w->reloc[r].delta = 4000;
> + *cs++ = 0;
> + r++;
> +
> + *cs++ = MI_BATCH_BUFFER_START | 1 << 8 | use_64b;
> + w->reloc[r].target_handle = self;
> + w->reloc[r].offset = offset_in_page(cs);
> + *cs++ = w->reloc[r].delta = offset_in_page(jmp);
Presumably MI_MATH stuff relaxed the loop enough and we don't need any
extra noops?
I would appreaciate a banner style comment explaining the batch layout
mentioning the interesting offsets and high-level logic.
> + *cs++ = 0;
> + r++;
>
> return r;
> }
> @@ -1590,23 +1618,10 @@ alloc_step_batch(struct workload *wrk, struct w_step *w)
> igt_assert(j < nr_obj);
> }
>
> - if (w->unbound_duration)
> - /* nops + MI_ARB_CHK + MI_BATCH_BUFFER_START */
> - w->bb_sz = max(PAGE_SIZE, __get_bb_sz(w, w->preempt_us)) +
> - (1 + 3) * sizeof(uint32_t);
> - else
> - w->bb_sz = get_bb_sz(w, w->duration.max);
> -
> - w->bb_handle = w->obj[j].handle =
> - alloc_bo(fd, w->bb_sz + (w->unbound_duration ? 4096 : 0));
> - init_bb(w);
> - w->obj[j].relocation_count = terminate_bb(w);
> -
> - if (w->obj[j].relocation_count) {
> - igt_assert(w->unbound_duration);
> - w->obj[j].relocs_ptr = to_user_pointer(&w->reloc);
> - w->reloc[0].target_handle = j;
> - }
> + w->bb_handle = w->obj[j].handle = gem_create(fd, 4096);
> + w->obj[j].relocation_count = create_bb(w, j);
> + igt_assert(w->obj[j].relocation_count <= ARRAY_SIZE(w->reloc));
> + w->obj[j].relocs_ptr = to_user_pointer(&w->reloc);
>
> w->eb.buffers_ptr = to_user_pointer(w->obj);
> w->eb.buffer_count = j + 1;
> @@ -1617,8 +1632,8 @@ alloc_step_batch(struct workload *wrk, struct w_step *w)
> printf("%u: %u:|", w->idx, w->eb.buffer_count);
> for (i = 0; i <= j; i++)
> printf("%x|", w->obj[i].handle);
> - printf(" %10lu flags=%llx bb=%x[%u] ctx[%u]=%u\n",
> - w->bb_sz, w->eb.flags, w->bb_handle, j, w->context,
> + printf(" flags=%llx bb=%x[%u] ctx[%u]=%u\n",
> + w->eb.flags, w->bb_handle, j, w->context,
> get_ctxid(wrk, w));
> #endif
> }
> @@ -1803,7 +1818,7 @@ static void measure_active_set(struct workload *wrk)
> if (w->type != BATCH)
> continue;
>
> - batch_sizes += w->bb_sz;
> + batch_sizes += 4096;
>
> for (j = 0; j < w->data_deps.nr; j++) {
> struct dep_entry *dep = &w->data_deps.list[j];
> @@ -1904,6 +1919,10 @@ static int prepare_workload(unsigned int id, struct workload *wrk)
> wsim_err("Load balancing needs an engine map!\n");
> return 1;
> }
> + if (intel_gen(intel_get_drm_devid(fd)) < 11) {
> + wsim_err("Load balancing needs relative mmio support, gen11+!\n");
> + return 1;
> + }
> ctx->load_balance = w->load_balance;
> } else if (w->type == BOND) {
> if (!ctx->load_balance) {
> @@ -2163,15 +2182,15 @@ static int elapsed_us(const struct timespec *start, const struct timespec *end)
> }
>
> static void
> -update_bb_start(struct w_step *w)
> +update_bb_start(struct workload *wrk, struct w_step *w)
I think there is w->wrk if you find it easier but it's only one callsite
so it's probably even better like this.
> {
> - if (!w->unbound_duration)
> - return;
> + uint32_t ticks;
>
> - gem_set_domain(fd, w->bb_handle,
> - I915_GEM_DOMAIN_WC, I915_GEM_DOMAIN_WC);
> + ticks = 0;
> + if (!w->unbound_duration)
> + ticks = ~ns_to_ctx_ticks(1000 * get_duration(wrk, w));
Hm inverted ticks, why? And since it is not obvious I think it deserves
a comment.
>
> - *w->recursive_bb_start = MI_BATCH_BUFFER_START | (1 << 8) | 1;
> + *w->bb_duration = ticks;
> }
>
> static void w_sync_to(struct workload *wrk, struct w_step *w, int target)
> @@ -2198,13 +2217,7 @@ do_eb(struct workload *wrk, struct w_step *w, enum intel_engine_id engine)
> unsigned int i;
>
> eb_update_flags(wrk, w, engine);
> - update_bb_start(w);
> -
> - w->eb.batch_start_offset =
> - w->unbound_duration ?
> - 0 :
> - ALIGN(w->bb_sz - get_bb_sz(w, get_duration(wrk, w)),
> - 2 * sizeof(uint32_t));
> + update_bb_start(wrk, w);
>
> for (i = 0; i < w->fence_deps.nr; i++) {
> int tgt = w->idx + w->fence_deps.list[i].target;
> @@ -2353,8 +2366,7 @@ static void *run_workload(void *data)
> igt_assert(wrk->steps[t_idx].type == BATCH);
> igt_assert(wrk->steps[t_idx].unbound_duration);
>
> - *wrk->steps[t_idx].recursive_bb_start =
> - MI_BATCH_BUFFER_END;
> + *wrk->steps[t_idx].bb_duration = 0xffffffff;
> __sync_synchronize();
> continue;
> } else if (w->type == SSEU) {
> @@ -2467,131 +2479,15 @@ static void fini_workload(struct workload *wrk)
> free(wrk);
> }
>
> -static unsigned long calibrate_nop(unsigned int tolerance_pct, struct intel_execution_engine2 *engine)
> -{
> - const uint32_t bbe = 0xa << 23;
> - unsigned int loops = 17;
> - unsigned int usecs = nop_calibration_us;
> - struct drm_i915_gem_exec_object2 obj = {};
> - struct drm_i915_gem_execbuffer2 eb = {
> - .buffer_count = 1,
> - .buffers_ptr = (uintptr_t)&obj,
> - .flags = engine->flags
> - };
> - long size, last_size;
> - struct timespec t_0, t_end;
> -
> - clock_gettime(CLOCK_MONOTONIC, &t_0);
> -
> - size = 256 * 1024;
> - do {
> - struct timespec t_start;
> -
> - obj.handle = alloc_bo(fd, size);
> - gem_write(fd, obj.handle, size - sizeof(bbe), &bbe,
> - sizeof(bbe));
> - gem_execbuf(fd, &eb);
> - gem_sync(fd, obj.handle);
> -
> - clock_gettime(CLOCK_MONOTONIC, &t_start);
> - for (int loop = 0; loop < loops; loop++)
> - gem_execbuf(fd, &eb);
> - gem_sync(fd, obj.handle);
> - clock_gettime(CLOCK_MONOTONIC, &t_end);
> -
> - gem_close(fd, obj.handle);
> -
> - last_size = size;
> - size = loops * size / elapsed(&t_start, &t_end) / 1e6 * usecs;
> - size = ALIGN(size, sizeof(uint32_t));
> - } while (elapsed(&t_0, &t_end) < 5 ||
> - labs(size - last_size) > (size * tolerance_pct / 100));
> -
> - return size / sizeof(uint32_t);
> -}
> -
> -static void
> -calibrate_sequentially(void)
> -{
> - struct intel_execution_engine2 *engine;
> - enum intel_engine_id eng_id;
> -
> - __for_each_physical_engine(fd, engine) {
> - eng_id = ci_to_engine_id(engine->class, engine->instance);
> - igt_assert(eng_id >= 0);
> - engine_calib_map[eng_id] = calibrate_nop(fd, engine);
> - }
> -}
> -
> -struct thread_data {
> - struct intel_execution_engine2 *eng;
> - pthread_t thr;
> - unsigned long calib;
> -};
> -
> -static void *
> -engine_calibration_thread(void *data)
> -{
> - struct thread_data *thr_d = (struct thread_data *) data;
> -
> - thr_d->calib = calibrate_nop(fd, thr_d->eng);
> - return NULL;
> -}
> -
> -static void
> -calibrate_in_parallel(void)
> -{
> - struct thread_data *thr_d = malloc(NUM_ENGINES * sizeof(*thr_d));
> - struct intel_execution_engine2 *engine;
> - enum intel_engine_id id;
> - int ret;
> -
> - __for_each_physical_engine(fd, engine) {
> - id = ci_to_engine_id(engine->class, engine->instance);
> - thr_d[id].eng = engine;
> - ret = pthread_create(&thr_d[id].thr, NULL, engine_calibration_thread, &thr_d[id]);
> - igt_assert_eq(ret, 0);
> - }
> -
> - __for_each_physical_engine(fd, engine) {
> - id = ci_to_engine_id(engine->class, engine->instance);
> - igt_assert(id >= 0);
> -
> - ret = pthread_join(thr_d[id].thr, NULL);
> - igt_assert_eq(ret, 0);
> - engine_calib_map[id] = thr_d[id].calib;
> - }
> -
> - free(thr_d);
> -}
> -
> -static void
> -calibrate_engines(void)
> -{
> - if (sequential)
> - calibrate_sequentially();
> - else
> - calibrate_in_parallel();
> -}
> -
> static void print_help(void)
> {
> puts(
> "Usage: gem_wsim [OPTIONS]\n"
> "\n"
> "Runs a simulated workload on the GPU.\n"
> -"When ran without arguments performs a GPU calibration result of which needs to\n"
> -"be provided when running the simulation in subsequent invocations.\n"
> -"\n"
> "Options:\n"
> " -h This text.\n"
> " -q Be quiet - do not output anything to stdout.\n"
> -" -n <n | Nop calibration value - single value is set to all engines\n"
> -" e1=v1,e2=v2,n...> without specified value; you can also specify calibrations for\n"
> -" particular engines.\n"
> -" -t <n> Nop calibration tolerance percentage.\n"
> -" -T Disable sequential calibration and perform calibration in parallel.\n"
> -" Use when there is a difficulty obtaining calibration with the\n"
> " default settings.\n"
One more line to snip here.
> " -I <n> Initial randomness seed.\n"
> " -p <n> Context priority to use for the following workload on the\n"
> @@ -2671,17 +2567,12 @@ int main(int argc, char **argv)
> int master_workload = -1;
> char *append_workload_arg = NULL;
> struct w_arg *w_args = NULL;
> - unsigned int tolerance_pct = 1;
> int exitcode = EXIT_FAILURE;
> double scale_time = 1.0f;
> double scale_dur = 1.0f;
> int prio = 0;
> double t;
> - int i, c;
> - char *subopts, *value;
> - int raw_number = 0;
> - long calib_val;
> - int eng;
> + int i, c, ret;
>
> /*
> * Open the device via the low-level API so we can do the GPU quiesce
> @@ -2721,70 +2612,7 @@ int main(int argc, char **argv)
> case 'c':
> clients = strtol(optarg, NULL, 0);
> break;
> - case 't':
> - tolerance_pct = strtol(optarg, NULL, 0);
> - break;
> - case 'T':
> - sequential = false;
> - break;
> -
> - case 'n':
> - subopts = optarg;
> - while (*subopts != '\0') {
> - eng = getsubopt(&subopts, (char **)ring_str_map, &value);
> - if (!value) {
> - /* only engine name was given */
> - wsim_err("Missing calibration value for '%s'!\n",
> - ring_str_map[eng]);
> - goto err;
> - }
>
> - calib_val = atol(value);
> -
> - if (eng >= 0 && eng < NUM_ENGINES) {
> - /* engine name with some value were given */
> -
> - if (eng == DEFAULT || eng == VCS) {
> - wsim_err("'%s' not allowed in engine calibrations!\n",
> - ring_str_map[eng]);
> - goto err;
> - } else if (calib_val <= 0) {
> - wsim_err("Invalid calibration for engine '%s' - value "
> - "is either non-positive or is not a number!\n",
> - ring_str_map[eng]);
> - goto err;
> - } else if (engine_calib_map[eng]) {
> - wsim_err("Invalid repeated calibration of '%s'!\n",
> - ring_str_map[eng]);
> - goto err;
> - } else {
> - engine_calib_map[eng] = calib_val;
> - if (eng == RCS)
> - engine_calib_map[DEFAULT] = calib_val;
> - else if (eng == VCS1 || eng == VCS2)
> - engine_calib_map[VCS] = calib_val;
> - has_nop_calibration = true;
> - }
> - } else {
> - /* raw number was given */
> -
> - if (!calib_val) {
> - wsim_err("Invalid engine or zero calibration!\n");
> - goto err;
> - } else if (calib_val < 0) {
> - wsim_err("Invalid negative calibration!\n");
> - goto err;
> - } else if (raw_number) {
> - wsim_err("Default engine calibration provided more than once!\n");
> - goto err;
> - } else {
> - raw_number = calib_val;
> - apply_unset_calibrations(raw_number);
> - has_nop_calibration = true;
> - }
> - }
> - }
> - break;
> case 'r':
> repeat = strtol(optarg, NULL, 0);
> break;
> @@ -2812,6 +2640,9 @@ int main(int argc, char **argv)
> case 'F':
> scale_time = atof(optarg);
> break;
> + case 'n':
> + /* ignored; using HW timers */
> + break;
For what user? I deleted media-bench.pl but maybe you are using it locally?
> case 'h':
> print_help();
> goto out;
> @@ -2820,19 +2651,6 @@ int main(int argc, char **argv)
> }
> }
>
> - if (!has_nop_calibration) {
> - if (verbose > 1) {
> - printf("Calibrating nop delays with %u%% tolerance...\n",
> - tolerance_pct);
> - }
> -
> - calibrate_engines();
> -
> - if (verbose)
> - print_engine_calibrations();
> - goto out;
> - }
> -
> if (!nr_w_args) {
> wsim_err("No workload descriptor(s)!\n");
> goto err;
> @@ -2885,7 +2703,6 @@ int main(int argc, char **argv)
>
> if (verbose > 1) {
> printf("Random seed is %u.\n", master_prng);
> - print_engine_calibrations();
> printf("%u client%s.\n", clients, clients > 1 ? "s" : "");
> }
>
> @@ -2916,16 +2733,13 @@ int main(int argc, char **argv)
> clock_gettime(CLOCK_MONOTONIC, &t_start);
>
> for (i = 0; i < clients; i++) {
> - int ret;
> -
> ret = pthread_create(&w[i]->thread, NULL, run_workload, w[i]);
> igt_assert_eq(ret, 0);
> }
>
> if (master_workload >= 0) {
> - int ret = pthread_join(w[master_workload]->thread, NULL);
> -
> - igt_assert(ret == 0);
> + ret = pthread_join(w[master_workload]->thread, NULL);
> + igt_assert_eq(ret, 0);
>
> for (i = 0; i < clients; i++)
> w[i]->run = false;
> @@ -2933,8 +2747,8 @@ int main(int argc, char **argv)
>
> for (i = 0; i < clients; i++) {
> if (master_workload != i) {
> - int ret = pthread_join(w[i]->thread, NULL);
> - igt_assert(ret == 0);
> + ret = pthread_join(w[i]->thread, NULL);
> + igt_assert_eq(ret, 0);
> }
> }
>
>
Cool.
Regards,
Tvrtko
More information about the Intel-gfx
mailing list