[igt-dev] [PATCH i-g-t] tests/xe: Add a test for corner cases of eviction.

Thomas Hellström thomas.hellstrom at linux.intel.com
Fri Jun 2 10:09:40 UTC 2023


On 5/31/23 20:18, Maarten Lankhorst wrote:
> On 2023-05-31 11:53, Thomas Hellström wrote:
>> Hi,
>>
>>
>> On 5/30/23 16:55, Maarten Lankhorst wrote:
>>> Cc
>>>
>>> On 2023-05-29 14:48, Maarten Lankhorst wrote:
>>>> Gitlab issues 194 and 239 show some corner cases. In particular,
>>>> signal handling, pt eviction and losing track of external objects
>>>> on the list.
>>>>
>>>> The patch series to fix those was sent as
>>>> https://patchwork.freedesktop.org/series/118428/
>>>>
>>>> But because it's tricky to reproduce those issues, I had to add some
>>>> custom igt's to handle those.
>>>>
>>>> Signed-off-by: Maarten Lankhorst <maarten.lankhorst at linux.intel.com>
>> Looks good overall, but It would be great with some more documentation so that if a test fails, it's easier to understand what is failing, see below.
>>
>>>> ---
>>>>    tests/meson.build     |   1 +
>>>>    tests/xe/xe_evicted.c | 313 ++++++++++++++++++++++++++++++++++++++++++
>>>>    2 files changed, 314 insertions(+)
>>>>    create mode 100644 tests/xe/xe_evicted.c
>>>>
>>>> diff --git a/tests/meson.build b/tests/meson.build
>>>> index 6551194fe..166937b6c 100644
>>>> --- a/tests/meson.build
>>>> +++ b/tests/meson.build
>>>> @@ -248,6 +248,7 @@ xe_progs = [
>>>>        'xe_dma_buf_sync',
>>>>        'xe_debugfs',
>>>>        'xe_evict',
>>>> +    'xe_evicted',
>>>>        'xe_exec_balancer',
>>>>        'xe_exec_basic',
>>>>        'xe_exec_compute_mode',
>>>> diff --git a/tests/xe/xe_evicted.c b/tests/xe/xe_evicted.c
>>>> new file mode 100644
>>>> index 000000000..3f5e6a82c
>>>> --- /dev/null
>>>> +++ b/tests/xe/xe_evicted.c
>>>> @@ -0,0 +1,313 @@
>>>> +// SPDX-License-Identifier: MIT
>>>> +/*
>>>> + * Copyright © 2023 Intel Corporation
>>>> + */
>>>> +
>>>> +/**
>>>> + * TEST: Test various eviction and ENOMEM corner cases.
>>>> + * Category: Software building block
>>>> + * Sub-category: VM_BIND / eviction
>>>> + * Test category: functionality test
>>>> + */
>>>> +
>>>> +#include "igt.h"
>>>> +#include "lib/igt_syncobj.h"
>>>> +#include "lib/intel_reg.h"
>>>> +#include "xe_drm.h"
>>>> +
>>>> +#include "xe/xe_ioctl.h"
>>>> +#include "xe/xe_query.h"
>>>> +#include "xe/xe_spin.h"
>>>> +#include <string.h>
>>>> +
>>>> +static void evict_mem(struct xe_device *xe)
>>>> +{
>>>> +    if (xe->has_vram)
>>>> +        igt_debugfs_write(xe->fd, "vram0_evict", "1");
>>>> +    else
>>>> +        igt_debugfs_write(xe->fd, "gtt_evict", "1");
>>>> +}
>>>> +
>>>> +static void execstress(struct xe_device *xe, uint32_t engine, uint32_t addr)
>>>> +{
>>>> +    struct timespec start, end;
>>>> +
>>>> +    igt_fork(child, 2) {
>>>> +        struct drm_xe_sync sync = {
>>>> +            .flags = DRM_XE_SYNC_SYNCOBJ | DRM_XE_SYNC_SIGNAL,
>>>> +            .handle = syncobj_create(xe->fd, 0),
>>>> +        };
>>>> +
>>>> +        clock_gettime(CLOCK_MONOTONIC, &start);
>>>> +
>>>> +        do {
>>>> +            if (!child)
>>>> +                evict_mem(xe);
>>>> +            else
>>>> +                xe_exec_sync(xe->fd, engine, addr, &sync, 1);
>>>> +
>>>> +            clock_gettime(CLOCK_MONOTONIC, &end);
>>>> +        } while (end.tv_sec - start.tv_sec <= 3);
>>>> +
>>>> +        if (child)
>>>> +            igt_assert(syncobj_wait(xe->fd, &sync.handle, 1, INT64_MAX, 0, NULL));
>>>> +        syncobj_destroy(xe->fd, sync.handle);
>>>> +    }
>>>> +
>>>> +    igt_waitchildren();
>>>> +
>>>> +    /* Leave function in a known state */
>>>> +    evict_mem(xe);
>>>> +}
>>>> +
>>>> +/* See issue 194, unbinding first VMA may cause BO not to be locked correctly */
>>>> +static void test_multimap(struct xe_device *xe, bool external)
>>>> +{
>>>> +    uint64_t size = xe->default_alignment;
>>>> +    uint32_t vm = xe_vm_create(xe->fd, DRM_XE_VM_CREATE_SCRATCH_PAGE, 0);
>>>> +    uint32_t bo = xe_bo_create(xe->fd, 0, external ? 0 : vm, size);
>>>> +    uint32_t engine = xe_engine_create_class(xe->fd, vm, DRM_XE_ENGINE_CLASS_COPY);
>>>> +    uint32_t *map = xe_bo_map(xe->fd, bo, size);
>>>> +
>>>> +    *map = MI_BATCH_BUFFER_END;
>>>> +    munmap(map, size);
>>>> +
>>>> +    /* Create 2 mappings */
>>>> +    xe_vm_bind_sync(xe->fd, vm, bo, 0, 0, size);
>>>> +    xe_vm_bind_sync(xe->fd, vm, bo, 0, size, size);
>>>> +
>>>> +    execstress(xe, engine, size);
>>>> +
>>>> +    /* Unbind the first one, to possibly confuse stuff */
>>>> +    xe_vm_unbind_sync(xe->fd, vm, 0, 0, size);
>>>> +
>>>> +    execstress(xe, engine, size);
>>>> +
>>>> +    /* Teardown.. */
>>>> +    xe_engine_destroy(xe->fd, engine);
>>>> +    xe_vm_destroy(xe->fd, vm);
>>>> +    gem_close(xe->fd, bo);
>>>> +}
>>>> +
>>>> +static uint64_t avail_vram(struct xe_device **xe)
>>>> +{
>>>> +    int fd = (*xe)->fd;
>>>> +    int region_idx;
>>>> +
>>>> +    /* Refresh to get up-to-date values */
>>>> +    xe_device_put(fd);
>>>> +    *xe = xe_device_get(fd);
>>>> +
>>>> +    region_idx = ffs((*xe)->gts->gts[0].native_mem_regions) - 1;
>>>> +
>>>> +    return (*xe)->mem_usage->regions[region_idx].total_size -
>>>> +           (*xe)->mem_usage->regions[region_idx].used;
>>>> +}
>>>> +
>>>> +static void test_pt_eviction(struct xe_device *xe, bool external)
>> Could we have an overall description on what this test does and how it does it?
> I put a small description in the test comments, basically try to force a pt eviction.
>>>> +{
>>>> +    uint32_t bo, i;
>>>> +    uint64_t size, full_size, full_addr, addr, avail;
>>>> +    uint32_t vm;
>>>> +    uint32_t engine;
>>>> +    uint32_t *map;
>>>> +    struct drm_xe_exec exec = {};
>>>> +    igt_require(xe->has_vram);
>>>> +
>>>> +    vm = xe_vm_create(xe->fd, 0, 0);
>>>> +    engine = xe_engine_create_class(xe->fd, vm, DRM_XE_ENGINE_CLASS_COPY);
>>>> +
>>>> +    /* Refresh xe_device size after creating vm + engine */
>>>> +    evict_mem(xe);
>>>> +    full_size = (avail_vram(&xe) - (2 << 20ULL)) & ~((2 << 20ULL) - 1);
>> And what we are doing here and what the magical shifts come from?
>   Should probably add a define for 2 megabyte instead. 1 << 20 is 1 mb. :)
>>>> +
>>>> +    bo = xe_bo_create(xe->fd, 0, external ? 0 : vm, full_size);
>>>> +
>>>> +    /* Only map what we need.. */
>>>> +    map = xe_bo_map(xe->fd, bo, xe->default_alignment);
>>>> +    *map = MI_BATCH_BUFFER_END;
>>>> +    munmap(map, xe->default_alignment);
>>>> +
>>>> +    /* Stuff bo at the end, so we can use start for partial bo's */
>>>> +    full_addr = (1ULL << (uint64_t)xe_va_bits(xe->fd)) - full_size;
>>>> +    xe_vm_bind_sync(xe->fd, vm, bo, 0, full_addr, full_size);
>>>> +
>>>> +    exec.engine_id = engine;
>>>> +    exec.num_batch_buffer = 1;
>>>> +    exec.address = full_addr;
>>>> +
>>>> +    /* Now map partially at 0, and see how many pagetables we can still fill.. */
>>>> +    addr = 0;
>>>> +    size = xe->default_alignment;
>>>> +    xe_vm_bind_sync(xe->fd, vm, bo, 0, addr, size);
>>>> +
>>>> +    xe_exec_sync(xe->fd, engine, full_addr, NULL, 0);
>>>> +
>>>> +    avail = full_size >> 12;
>>>> +    while (1) {
>>>> +        uint64_t new_avail = avail_vram(&xe) >> 12;
>>>> +
>>>> +        if (new_avail >= avail)
>>>> +            igt_info("TTM delayed destroy may have freed some memory\n");
>>>> +        avail = new_avail;
>>>> +
>>>> +        igt_info("%"PRIu64" available 4K pages left\n", avail);
>>>> +        if (!avail)
>>>> +            break;
>>>> +
>>>> +        if (avail == 1) {
>>>> +            xe_vm_bind_sync(xe->fd, vm, bo, 0, 2 << 20ULL, size);
>>>> +            break;
>>>> +        }
>>>> +
>>>> +        addr += 1 << 30ULL;
>>>> +
>>>> +        /* Remove 1 for second level PT */
>>>> +        avail--;
>>>> +
>>>> +        for (i = 0; i < min((uint64_t)512, avail); i++) {
>>>> +            uint64_t this_addr = addr + (i << 21);
>>>> +
>>>> +            xe_vm_bind_sync(xe->fd, vm, bo, 0, this_addr, size);
>>>> +        }
>>>> +    }
>>>> +
>>>> +    /* Verify that with 0 memory available, exec still completes */
>>>> +    xe_exec_sync(xe->fd, engine, full_addr, NULL, 0);
>>>> +
>>>> +    /* Bind 1 more to go over the edge, this should never succeed.. */
>>>> +    if (__xe_vm_bind(xe->fd, vm, 0, bo, 0, 4 << 20ULL, size,
>>>> +             XE_VM_BIND_OP_MAP, NULL, 0, 0, 0) == 0) {
>>>> +
>>>> +        /* Boom! */
>>>> +        igt_assert(igt_ioctl(xe->fd, DRM_IOCTL_XE_EXEC, &exec) < 0 &&
>>>> +               errno == ENOMEM);
>>>> +
>>>> +        xe_vm_unbind_sync(xe->fd, vm, 0, 4 << 20ULL, size);
>>>> +    } else {
>>>> +        igt_assert(errno == ENOMEM);
>>>> +    }
>>>> +
>>>> +    /*
>>>> +     * After evicting, we may end up creating new page tables,
>>>> +     * so this should fail..
>>>> +     */
>>>> +    evict_mem(xe);
>>>> +    igt_assert(igt_ioctl(xe->fd, DRM_IOCTL_XE_EXEC, &exec) < 0 &&
>>>> +           errno == ENOMEM);
>>>> +
>>>> +    /* Cleanup */
>>>> +    xe_engine_destroy(xe->fd, engine);
>>>> +    xe_vm_destroy(xe->fd, vm);
>>>> +    gem_close(xe->fd, bo);
>>>> +}
>>>> +
>>>> +/* Test OOM with signal handler enabled, to mimic X.org (issue #239) */
>> Same thing here, what's done and how is it done? Are we actually getting a signal here, or just wait interruptible and may or may not get a signal?
> We attempt to cause a signal through igt_while_interruptible, it changes the igt_ioctl pointer from drmIoctl to sig_ioctl,
>
> which attempts to cause as many EINTR's as possible. That's why igt uses igt_ioctl internally and not drmIoctl.

Ah, ok. Thanks for the explanation.

/Thomas

>
>
>> Thanks,
>>
>> Thomas
> ~Maarten


More information about the igt-dev mailing list