[igt-dev] [PATCH i-g-t v4 10/11] tests/i915/vm_bind: Add gem_exec3_balancer test

Mon Oct 24 15:26:38 UTC 2022

On 21/10/2022 21:47, Niranjana Vishwanathapura wrote:
> On Fri, Oct 21, 2022 at 01:11:28PM -0700, Niranjana Vishwanathapura wrote:
>> On Fri, Oct 21, 2022 at 05:42:55PM +0100, Matthew Auld wrote:
>>> On 18/10/2022 08:17, Niranjana Vishwanathapura wrote:
>>>> To test parallel submissions support in execbuf3, port the subtest
>>>> gem_exec_balancer at parallel-ordering to a new gem_exec3_balancer test
>>>> and switch to execbuf3 ioctl.
>>>>
>>>> v2: use i915_vm_bind library functions
>>>>
>>>> Signed-off-by: Niranjana Vishwanathapura 
>>>> <niranjana.vishwanathapura at intel.com>
>>>> ---
>>>> tests/i915/gem_exec3_balancer.c | 473 ++++++++++++++++++++++++++++++++
>>>> tests/meson.build               |   7 +
>>>> 2 files changed, 480 insertions(+)
>>>> create mode 100644 tests/i915/gem_exec3_balancer.c
>>>>
>>>> diff --git a/tests/i915/gem_exec3_balancer.c 
>>>> b/tests/i915/gem_exec3_balancer.c
>>>> new file mode 100644
>>>> index 0000000000..34b2a6a0b3
>>>> --- /dev/null
>>>> +++ b/tests/i915/gem_exec3_balancer.c
>>>> @@ -0,0 +1,473 @@
>>>> +// SPDX-License-Identifier: MIT
>>>> +/*
>>>> + * Copyright © 2022 Intel Corporation
>>>> + */
>>>> +
>>>> +/** @file gem_exec3_balancer.c
>>>> + *
>>>> + * Load balancer tests with execbuf3.
>>>> + * Ported from gem_exec_balancer and made to work with
>>>> + * vm_bind and execbuf3.
>>>> + *
>>>> + */
>>>> +
>>>> +#include <poll.h>
>>>> +
>>>> +#include "i915/gem.h"
>>>> +#include "i915/i915_vm_bind.h"
>>>> +#include "i915/gem_engine_topology.h"
>>>> +#include "i915/gem_create.h"
>>>> +#include "i915/gem_vm.h"
>>>> +#include "igt.h"
>>>> +#include "igt_gt.h"
>>>> +#include "igt_perf.h"
>>>> +#include "igt_syncobj.h"
>>>> +
>>>> +IGT_TEST_DESCRIPTION("Exercise in-kernel load-balancing with 
>>>> execbuf3");
>>>> +
>>>> +#define INSTANCE_COUNT (1 << I915_PMU_SAMPLE_INSTANCE_BITS)
>>>> +
>>>> +static bool has_class_instance(int i915, uint16_t class, uint16_t 
>>>> instance)
>>>> +{
>>>> +    int fd;
>>>> +
>>>> +    fd = perf_i915_open(i915, I915_PMU_ENGINE_BUSY(class, instance));
>>>> +    if (fd >= 0) {
>>>> +        close(fd);
>>>> +        return true;
>>>> +    }
>>>> +
>>>> +    return false;
>>>> +}
>>>> +
>>>> +static struct i915_engine_class_instance *
>>>> +list_engines(int i915, uint32_t class_mask, unsigned int *out)
>>>> +{
>>>> +    unsigned int count = 0, size = 64;
>>>> +    struct i915_engine_class_instance *engines;
>>>> +
>>>> +    engines = malloc(size * sizeof(*engines));
>>>> +    igt_assert(engines);
>>>> +
>>>> +    for (enum drm_i915_gem_engine_class class = 
>>>> I915_ENGINE_CLASS_RENDER;
>>>> +         class_mask;
>>>> +         class++, class_mask >>= 1) {
>>>> +        if (!(class_mask & 1))
>>>> +            continue;
>>>> +
>>>> +        for (unsigned int instance = 0;
>>>> +             instance < INSTANCE_COUNT;
>>>> +             instance++) {
>>>> +            if (!has_class_instance(i915, class, instance))
>>>> +                continue;
>>>> +
>>>> +            if (count == size) {
>>>> +                size *= 2;
>>>> +                engines = realloc(engines,
>>>> +                          size * sizeof(*engines));
>>>> +                igt_assert(engines);
>>>> +            }
>>>> +
>>>> +            engines[count++] = (struct i915_engine_class_instance){
>>>> +                .engine_class = class,
>>>> +                .engine_instance = instance,
>>>> +            };
>>>> +        }
>>>> +    }
>>>> +
>>>> +    if (!count) {
>>>> +        free(engines);
>>>> +        engines = NULL;
>>>> +    }
>>>> +
>>>> +    *out = count;
>>>> +    return engines;
>>>> +}
>>>> +
>>>> +static bool has_perf_engines(int i915)
>>>> +{
>>>> +    return i915_perf_type_id(i915);
>>>> +}
>>>> +
>>>> +static intel_ctx_cfg_t
>>>> +ctx_cfg_for_engines(const struct i915_engine_class_instance *ci,
>>>> +            unsigned int count)
>>>> +{
>>>> +    intel_ctx_cfg_t cfg = { };
>>>> +    unsigned int i;
>>>> +
>>>> +    for (i = 0; i < count; i++)
>>>> +        cfg.engines[i] = ci[i];
>>>> +    cfg.num_engines = count;
>>>> +
>>>> +    return cfg;
>>>> +}
>>>> +
>>>> +static const intel_ctx_t *
>>>> +ctx_create_engines(int i915, const struct 
>>>> i915_engine_class_instance *ci,
>>>> +           unsigned int count)
>>>> +{
>>>> +    intel_ctx_cfg_t cfg = ctx_cfg_for_engines(ci, count);
>>>> +    return intel_ctx_create(i915, &cfg);
>>>> +}
>>>> +
>>>> +static void check_bo(int i915, uint32_t handle, unsigned int expected,
>>>> +             bool wait)
>>>> +{
>>>> +    uint32_t *map;
>>>> +
>>>> +    map = gem_mmap__cpu(i915, handle, 0, 4096, PROT_READ);
>>>> +    if (wait)
>>>> +        gem_set_domain(i915, handle, I915_GEM_DOMAIN_CPU,
>>>> +                   I915_GEM_DOMAIN_CPU);
>>>
>>> Hmm, does the wait still work as expected in set_domain() or does 
>>> that ignore BOOKKEEP, in which case this doesn't do anything with 
>>> eb3? Perhaps can be deleted? Also slightly worried about the dirty 
>>> tracking, and whether this test cares...
>>
>> I think we still need this. gem_set_domain() will correctly wait for
>> implicit dependency tracking only and hence will ignore BOOKKEEP. As
>> there is no implicit dependency tracking with VM_BIND, user has to
>> do an explicit wait. This test is properly waiting for execution fence
>> status before calling this function. Hence I think we should be good.

And in this test there should be no implicit dependencies, right?

>>
> 
> So, probably instead of naming it 'wait', we should call it 'flush'
> or something for calling get_set_domain()?

It doesn't look like it flushes anything either here, AFAICT. But 
looking again, it doesn't seem like the test cares about check_bo() 
performing a flush. So it should be fine to just drop the set_domain()?

> 
> Niranjana
> 
>> Regards,
>> Niranjana
>>
>>>
>>>> +    igt_assert_eq(map[0], expected);
>>>> +    munmap(map, 4096);
>>>> +}
>>>> +
>>>> +static struct drm_i915_query_engine_info *query_engine_info(int i915)
>>>> +{
>>>> +    struct drm_i915_query_engine_info *engines;
>>>> +
>>>> +#define QUERY_SIZE    0x4000
>>>> +    engines = malloc(QUERY_SIZE);
>>>> +    igt_assert(engines);
>>>> +    memset(engines, 0, QUERY_SIZE);
>>>> +    igt_assert(!__gem_query_engines(i915, engines, QUERY_SIZE));
>>>> +#undef QUERY_SIZE
>>>> +
>>>> +    return engines;
>>>> +}
>>>> +
>>>> +/* This function only works if siblings contains all instances of a 
>>>> class */
>>>> +static void logical_sort_siblings(int i915,
>>>> +                  struct i915_engine_class_instance *siblings,
>>>> +                  unsigned int count)
>>>> +{
>>>> +    struct i915_engine_class_instance *sorted;
>>>> +    struct drm_i915_query_engine_info *engines;
>>>> +    unsigned int i, j;
>>>> +
>>>> +    sorted = calloc(count, sizeof(*sorted));
>>>> +    igt_assert(sorted);
>>>> +
>>>> +    engines = query_engine_info(i915);
>>>> +
>>>> +    for (j = 0; j < count; ++j) {
>>>> +        for (i = 0; i < engines->num_engines; ++i) {
>>>> +            if (siblings[j].engine_class ==
>>>> +                engines->engines[i].engine.engine_class &&
>>>> +                siblings[j].engine_instance ==
>>>> +                engines->engines[i].engine.engine_instance) {
>>>> +                uint16_t logical_instance =
>>>> +                    engines->engines[i].logical_instance;
>>>> +
>>>> +                igt_assert(logical_instance < count);
>>>> +                igt_assert(!sorted[logical_instance].engine_class);
>>>> +                igt_assert(!sorted[logical_instance].engine_instance);
>>>> +
>>>> +                sorted[logical_instance] = siblings[j];
>>>> +                break;
>>>> +            }
>>>> +        }
>>>> +        igt_assert(i != engines->num_engines);
>>>> +    }
>>>> +
>>>> +    memcpy(siblings, sorted, sizeof(*sorted) * count);
>>>> +    free(sorted);
>>>> +    free(engines);
>>>> +}
>>>> +
>>>> +static bool fence_busy(int fence)
>>>> +{
>>>> +    return poll(&(struct pollfd){fence, POLLIN}, 1, 0) == 0;
>>>> +}
>>>> +
>>>> +/*
>>>> + * Always reading from engine instance 0, with GuC submission the 
>>>> values are the
>>>> + * same across all instances. Execlists they may differ but quite 
>>>> unlikely they
>>>> + * would be and if they are we can live with this.
>>>> + */
>>>> +static unsigned int get_timeslice(int i915,
>>>> +                  struct i915_engine_class_instance engine)
>>>> +{
>>>> +    unsigned int val;
>>>> +
>>>> +    switch (engine.engine_class) {
>>>> +    case I915_ENGINE_CLASS_RENDER:
>>>> +        gem_engine_property_scanf(i915, "rcs0", 
>>>> "timeslice_duration_ms",
>>>> +                      "%d", &val);
>>>> +        break;
>>>> +    case I915_ENGINE_CLASS_COPY:
>>>> +        gem_engine_property_scanf(i915, "bcs0", 
>>>> "timeslice_duration_ms",
>>>> +                      "%d", &val);
>>>> +        break;
>>>> +    case I915_ENGINE_CLASS_VIDEO:
>>>> +        gem_engine_property_scanf(i915, "vcs0", 
>>>> "timeslice_duration_ms",
>>>> +                      "%d", &val);
>>>> +        break;
>>>> +    case I915_ENGINE_CLASS_VIDEO_ENHANCE:
>>>> +        gem_engine_property_scanf(i915, "vecs0", 
>>>> "timeslice_duration_ms",
>>>> +                      "%d", &val);
>>>> +        break;
>>>> +    }
>>>> +
>>>> +    return val;
>>>> +}
>>>> +
>>>> +static uint64_t gettime_ns(void)
>>>> +{
>>>> +    struct timespec current;
>>>> +    clock_gettime(CLOCK_MONOTONIC, &current);
>>>> +    return (uint64_t)current.tv_sec * NSEC_PER_SEC + current.tv_nsec;
>>>> +}
>>>> +
>>>> +static bool syncobj_busy(int i915, uint32_t handle)
>>>> +{
>>>> +    bool result;
>>>> +    int sf;
>>>> +
>>>> +    sf = syncobj_handle_to_fd(i915, handle,
>>>> +                  DRM_SYNCOBJ_HANDLE_TO_FD_FLAGS_EXPORT_SYNC_FILE);
>>>> +    result = poll(&(struct pollfd){sf, POLLIN}, 1, 0) == 0;
>>>> +    close(sf);
>>>> +
>>>> +    return result;
>>>> +}
>>>> +
>>>> +/*
>>>> + * Ensure a parallel submit actually runs on HW in parallel by 
>>>> putting on a
>>>> + * spinner on 1 engine, doing a parallel submit, and parallel 
>>>> submit is blocked
>>>> + * behind spinner.
>>>> + */
>>>> +static void parallel_ordering(int i915, unsigned int flags)
>>>> +{
>>>> +    uint32_t vm_id;
>>>> +    int class;
>>>> +
>>>> +    vm_id = gem_vm_create_in_vm_bind_mode(i915);
>>>> +
>>>> +    for (class = 0; class < 32; class++) {
>>>> +        struct drm_i915_gem_timeline_fence exec_fence[32] = { };
>>>> +        const intel_ctx_t *ctx = NULL, *spin_ctx = NULL;
>>>> +        uint64_t fence_value = 0, batch_addr[32] = { };
>>>> +        struct i915_engine_class_instance *siblings;
>>>> +        uint32_t exec_syncobj, bind_syncobj[32];
>>>> +        struct drm_i915_gem_execbuffer3 execbuf;
>>>> +        uint32_t batch[16], obj[32];
>>>> +        intel_ctx_cfg_t cfg;
>>>> +        unsigned int count;
>>>> +        igt_spin_t *spin;
>>>> +        uint64_t ahnd;
>>>> +        int i = 0;
>>>> +
>>>> +        siblings = list_engines(i915, 1u << class, &count);
>>>> +        if (!siblings)
>>>> +            continue;
>>>> +
>>>> +        if (count < 2) {
>>>> +            free(siblings);
>>>> +            continue;
>>>> +        }
>>>> +
>>>> +        logical_sort_siblings(i915, siblings, count);
>>>> +
>>>> +        memset(&cfg, 0, sizeof(cfg));
>>>> +        cfg.parallel = true;
>>>> +        cfg.num_engines = 1;
>>>> +        cfg.width = count;
>>>> +        memcpy(cfg.engines, siblings, sizeof(*siblings) * count);
>>>> +
>>>> +        if (__intel_ctx_create(i915, &cfg, &ctx)) {
>>>> +            free(siblings);
>>>> +            continue;
>>>> +        }
>>>> +        gem_context_set_vm(i915, ctx->id, vm_id);
>>>> +
>>>> +        batch[i] = MI_ATOMIC | MI_ATOMIC_INC;
>>>> +#define TARGET_BO_OFFSET    (0x1 << 16)
>>>> +        batch[++i] = TARGET_BO_OFFSET;
>>>> +        batch[++i] = 0;
>>>> +        batch[++i] = MI_BATCH_BUFFER_END;
>>>> +
>>>> +        obj[0] = gem_create(i915, 4096);
>>>> +        bind_syncobj[0] = syncobj_create(i915, 0);
>>>> +        exec_fence[0].handle = bind_syncobj[0];
>>>> +        exec_fence[0].flags = I915_TIMELINE_FENCE_WAIT;
>>>> +        i915_vm_bind(i915, vm_id, TARGET_BO_OFFSET, obj[0], 0, 
>>>> 4096, bind_syncobj[0], 0);
>>>> +
>>>> +        for (i = 1; i < count + 1; ++i) {
>>>> +            obj[i] = gem_create(i915, 4096);
>>>> +            gem_write(i915, obj[i], 0, batch, sizeof(batch));
>>>> +
>>>> +            batch_addr[i - 1] = TARGET_BO_OFFSET * (i + 1);
>>>> +            bind_syncobj[i] = syncobj_create(i915, 0);
>>>> +            exec_fence[i].handle = bind_syncobj[i];
>>>> +            exec_fence[i].flags = I915_TIMELINE_FENCE_WAIT;
>>>> +            i915_vm_bind(i915, vm_id, batch_addr[i - 1], obj[i], 0, 
>>>> 4096, bind_syncobj[i], 0);
>>>> +        }
>>>> +
>>>> +        exec_syncobj = syncobj_create(i915, 0);
>>>> +        exec_fence[i].handle = exec_syncobj;
>>>> +        exec_fence[i].flags = I915_TIMELINE_FENCE_SIGNAL;
>>>> +
>>>> +        memset(&execbuf, 0, sizeof(execbuf));
>>>> +        execbuf.ctx_id = ctx->id,
>>>> +        execbuf.batch_address = to_user_pointer(batch_addr),
>>>> +        execbuf.fence_count = count + 2,
>>>> +        execbuf.timeline_fences = to_user_pointer(exec_fence),
>>>> +
>>>> +        /* Block parallel submission */
>>>> +        spin_ctx = ctx_create_engines(i915, siblings, count);
>>>> +        ahnd = get_simple_ahnd(i915, spin_ctx->id);
>>>> +        spin = __igt_spin_new(i915,
>>>> +                      .ahnd = ahnd,
>>>> +                      .ctx = spin_ctx,
>>>> +                      .engine = 0,
>>>> +                      .flags = IGT_SPIN_FENCE_OUT |
>>>> +                      IGT_SPIN_NO_PREEMPTION);
>>>> +
>>>> +        /* Wait for spinners to start */
>>>> +        usleep(5 * 10000);
>>>> +        igt_assert(fence_busy(spin->out_fence));
>>>> +
>>>> +        /* Submit parallel execbuf */
>>>> +        gem_execbuf3(i915, &execbuf);
>>>> +
>>>> +        /*
>>>> +         * Wait long enough for timeslcing to kick in but not
>>>> +         * preemption. Spinner + parallel execbuf should be
>>>> +         * active. Assuming default timeslice / preemption values, if
>>>> +         * these are changed it is possible for the test to fail.
>>>> +         */
>>>> +        usleep(get_timeslice(i915, siblings[0]) * 2);
>>>> +        igt_assert(fence_busy(spin->out_fence));
>>>> +        igt_assert(syncobj_busy(i915, exec_syncobj));
>>>> +        check_bo(i915, obj[0], 0, false);
>>>> +
>>>> +        /*
>>>> +         * End spinner and wait for spinner + parallel execbuf
>>>> +         * to compelte.
>>>> +         */
>>>> +        igt_spin_end(spin);
>>>> +        igt_assert(syncobj_timeline_wait(i915, &exec_syncobj, 
>>>> &fence_value, 1,
>>>> +                         gettime_ns() + (2 * NSEC_PER_SEC),
>>>> +                         DRM_SYNCOBJ_WAIT_FLAGS_WAIT_FOR_SUBMIT, 
>>>> NULL));
>>>> +        igt_assert(!syncobj_busy(i915, exec_syncobj));
>>>> +        syncobj_destroy(i915, exec_syncobj);
>>>> +        for (i = 0; i < count + 1; ++i)
>>>> +            syncobj_destroy(i915, bind_syncobj[i]);
>>>> +        check_bo(i915, obj[0], count, true);
>>>> +
>>>> +        /* Clean up */
>>>> +        intel_ctx_destroy(i915, ctx);
>>>> +        intel_ctx_destroy(i915, spin_ctx);
>>>> +        i915_vm_unbind(i915, vm_id, TARGET_BO_OFFSET, 4096);
>>>> +        for (i = 1; i < count + 1; ++i)
>>>> +            i915_vm_unbind(i915, vm_id, batch_addr[i - 1], 4096);
>>>> +
>>>> +        for (i = 0; i < count + 1; ++i)
>>>> +            gem_close(i915, obj[i]);
>>>> +        free(siblings);
>>>> +        igt_spin_free(i915, spin);
>>>> +        put_ahnd(ahnd);
>>>> +    }
>>>> +
>>>> +    gem_vm_destroy(i915, vm_id);
>>>> +}
>>>> +
>>>> +static bool has_load_balancer(int i915)
>>>> +{
>>>> +    const intel_ctx_cfg_t cfg = {
>>>> +        .load_balance = true,
>>>> +        .num_engines = 1,
>>>> +    };
>>>> +    const intel_ctx_t *ctx = NULL;
>>>> +    int err;
>>>> +
>>>> +    err = __intel_ctx_create(i915, &cfg, &ctx);
>>>> +    intel_ctx_destroy(i915, ctx);
>>>> +
>>>> +    return err == 0;
>>>> +}
>>>> +
>>>> +static bool has_logical_mapping(int i915)
>>>> +{
>>>> +    struct drm_i915_query_engine_info *engines;
>>>> +    unsigned int i;
>>>> +
>>>> +    engines = query_engine_info(i915);
>>>> +
>>>> +    for (i = 0; i < engines->num_engines; ++i)
>>>> +        if (!(engines->engines[i].flags &
>>>> +             I915_ENGINE_INFO_HAS_LOGICAL_INSTANCE)) {
>>>> +            free(engines);
>>>> +            return false;
>>>> +        }
>>>> +
>>>> +    free(engines);
>>>> +    return true;
>>>> +}
>>>> +
>>>> +static bool has_parallel_execbuf(int i915)
>>>> +{
>>>> +    intel_ctx_cfg_t cfg = {
>>>> +        .parallel = true,
>>>> +        .num_engines = 1,
>>>> +    };
>>>> +    const intel_ctx_t *ctx = NULL;
>>>> +    int err;
>>>> +
>>>> +    for (int class = 0; class < 32; class++) {
>>>> +        struct i915_engine_class_instance *siblings;
>>>> +        unsigned int count;
>>>> +
>>>> +        siblings = list_engines(i915, 1u << class, &count);
>>>> +        if (!siblings)
>>>> +            continue;
>>>> +
>>>> +        if (count < 2) {
>>>> +            free(siblings);
>>>> +            continue;
>>>> +        }
>>>> +
>>>> +        logical_sort_siblings(i915, siblings, count);
>>>> +
>>>> +        cfg.width = count;
>>>> +        memcpy(cfg.engines, siblings, sizeof(*siblings) * count);
>>>> +        free(siblings);
>>>> +
>>>> +        err = __intel_ctx_create(i915, &cfg, &ctx);
>>>> +        intel_ctx_destroy(i915, ctx);
>>>> +
>>>> +        return err == 0;
>>>> +    }
>>>> +
>>>> +    return false;
>>>> +}
>>>> +
>>>> +igt_main
>>>> +{
>>>> +    int i915 = -1;
>>>> +
>>>> +    igt_fixture {
>>>> +        i915 = drm_open_driver(DRIVER_INTEL);
>>>> +        igt_require_gem(i915);
>>>> +
>>>> +        gem_require_contexts(i915);
>>>> +        igt_require(i915_vm_bind_version(i915) == 1);
>>>> +        igt_require(gem_has_engine_topology(i915));
>>>> +        igt_require(has_load_balancer(i915));
>>>> +        igt_require(has_perf_engines(i915));
>>>> +    }
>>>> +
>>>> +    igt_subtest_group {
>>>> +        igt_fixture {
>>>> +            igt_require(has_logical_mapping(i915));
>>>> +            igt_require(has_parallel_execbuf(i915));
>>>> +        }
>>>> +
>>>> +        igt_describe("Ensure a parallel submit actually runs in 
>>>> parallel");
>>>> +        igt_subtest("parallel-ordering")
>>>> +            parallel_ordering(i915, 0);
>>>> +    }
>>>> +}
>>>> diff --git a/tests/meson.build b/tests/meson.build
>>>> index 50de8b7e89..6ca5a94282 100644
>>>> --- a/tests/meson.build
>>>> +++ b/tests/meson.build
>>>> @@ -375,6 +375,13 @@ test_executables += 
>>>> executable('gem_exec_balancer', 'i915/gem_exec_balancer.c',
>>>>        install : true)
>>>> test_list += 'gem_exec_balancer'
>>>> +test_executables += executable('gem_exec3_balancer', 
>>>> 'i915/gem_exec3_balancer.c',
>>>> +       dependencies : test_deps + [ lib_igt_perf ],
>>>> +       install_dir : libexecdir,
>>>> +       install_rpath : libexecdir_rpathdir,
>>>> +       install : true)
>>>> +test_list += 'gem_exec3_balancer'
>>>> +
>>>> test_executables += executable('gem_mmap_offset',
>>>>        join_paths('i915', 'gem_mmap_offset.c'),
>>>>        dependencies : test_deps + [ libatomic ],