[igt-dev] [PATCH i-g-t 3/3] i915/gem_exec_schedule: Exercise priority inversion from resource contention

Daniel Vetter daniel.vetter at ffwll.ch
Fri Nov 8 21:13:13 UTC 2019


On Fri, Nov 8, 2019 at 9:49 PM Chris Wilson <chris at chris-wilson.co.uk> wrote:
>
> One of the hardest priority inversion tasks to both handle and to
> simulate in testing is inversion due to resource contention. The
> challenge is that a high priority context should never be blocked by a
> low priority context, even if both are starving for resources --
> ideally, at least for some RT OSes, the higher priority context has
> first pick of the meagre resources so that it can be executed with
> minimum latency.
>
> userfaultfd allows us to handle a page fault in userspace, and so
> arbitrarily impose a delay on the fault handler, creating a situation
> where a low priority context is blocked waiting for the fault. This
> blocked context should not prevent a high priority context from being
> executed. While the userfault tries to emulate a slow fault (e.g. from a
> failing swap device), it is unfortunately limited to a single object
> type: the userptr. Hopefully, we will find ways to impose similar
> starvation conditions on other global resources.
>
> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
> Cc: Joonas Lahtinen <joonas.lahtinen at linux.intel.com>
> Cc: Tvrtko Ursulin <tvrtko.ursulin at intel.com>

So rt-ww_mutexes?

I don't think we want/should do that in the first round of rolling out
ww_mutex in i915. It should probably be handled within the ww_mutex &
cpu scheduler core code as an RT addition (i.e. if you want RT gpu
workloads, then you need to tell both the cpu and gpu scheduler to
treat you accordingly). But I do think it's possible. The rough idea
would be to not just have a global ticket queue, but to also take the
rt priority into account. Priority-inversion boosting should already
be possible with the normal rt-mutex code. I think even boosting for
rt-ww_mutexes should be fine:
- if a non-rt task gets boosted because it holds a non-ww_mutex lock,
then it either needs to start out with a new ww_acquire_ctx ticket
(which will already be boosted), or it can't take new ww_mutex locks
for an existing ticket, since that would be outside the non-ww_mutex
and create a locking inversion (ww_acquire_ctx -> non-ww_mutex ->
ww_mutex goes boom). Hence I don't think we need to retroactively
boost the ticket itself if the contended lock that causes the boosting
is a non-ww_mutex.
- if a non-rt task is holding a ww_mutex lock (which an rt task wants
to acquire), then boosting only needs to happen until it drops that
lock. And we can force that by returning -EDEADLK on its next
ww_mutex_lock (see the backoff sketch after this list), hence again we
don't need to be able to boost the existing ticket itself (since the
boosted task can be prevented from acquiring more ww_mutexes of the
same class), only the normal mutex boosting.
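
For reference, a minimal sketch of the usual ww_mutex acquire/backoff
dance this relies on (current kernel API; error handling and the full
retry loop are trimmed, and lock_pair/demo_ww_class are made up for
illustration). A wounded context gets -EDEADLK on its next
ww_mutex_lock and has to drop everything it holds, which is what
bounds how long a boosted holder can stand in the rt task's way:

#include <linux/ww_mutex.h>

static DEFINE_WW_CLASS(demo_ww_class);

static void lock_pair(struct ww_mutex *a, struct ww_mutex *b)
{
	struct ww_acquire_ctx ctx;

	ww_acquire_init(&ctx, &demo_ww_class);

	ww_mutex_lock(a, &ctx);
	if (ww_mutex_lock(b, &ctx) == -EDEADLK) {
		/* wounded: drop what we hold, then sleep on the contended lock */
		ww_mutex_unlock(a);
		ww_mutex_lock_slow(b, &ctx);
		/* reacquire the rest; a full version loops on -EDEADLK */
		ww_mutex_lock(a, &ctx);
	}
	ww_acquire_done(&ctx);

	/* ... critical section over both objects ... */

	ww_mutex_unlock(b);
	ww_mutex_unlock(a);
	ww_acquire_fini(&ctx);
}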

If this ad-hoc analysis is correct and we really don't need to boost
existing tickets, then the normal ww_mutex algo should keep working.
The only difference is that all rt-priority tasks will be able to jump
the queue. The simplest way to implement that would be to shift the rt
priority into the high bits of the ticket (and make sure we handle
wrap-around correctly, since that will happen quite regularly with
this approach, between a highest-rt-priority task and a non-rt task).
Roughly like the sketch below.
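
Purely illustrative (the field split and the names are made up, and
the cross-priority compare simply lets rt jump the queue rather than
solving the wrap-around caveat above in general):

#include <linux/types.h>
#include <linux/sched/prio.h>	/* MAX_RT_PRIO */

#define TICKET_PRIO_SHIFT	56	/* made up: 8bit prio, 56bit seqno */

static u64 make_ticket(u64 seqno, unsigned int rt_prio)
{
	/* higher rt priority -> numerically smaller, i.e. "older", ticket */
	return ((u64)(MAX_RT_PRIO - rt_prio) << TICKET_PRIO_SHIFT) |
	       (seqno & ((1ull << TICKET_PRIO_SHIFT) - 1));
}

static bool ticket_is_older(u64 a, u64 b)
{
	u64 pa = a >> TICKET_PRIO_SHIFT, pb = b >> TICKET_PRIO_SHIFT;

	if (pa != pb)
		return pa < pb;	/* rt always jumps the queue */

	/* wrap-safe seqno compare: shift the 56bit difference up so
	 * bit 55 lands in the sign bit, then test the sign */
	return (s64)((a - b) << (64 - TICKET_PRIO_SHIFT)) < 0;
}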

What I'm not sure about is this: if a task gets boosted while holding
a normal mutex, and before releasing that mutex grabs a ww_acquire_ctx
(with a boosted ticket), and then drops the normal mutex, can we also
drop the boosting of the ticket? (The ticket would always jump the
queue to its boosted position when we grab it, but then it needs to be
fixed up to make sure the ww backoff algo still works.) But that's a
rather unusual locking scheme, and of the 3 ww_mutex classes we have
in the kernel thus far (drm_modeset_lock, dma_resv and the clock tree
graph lock thing) none do something silly like this. So it can
probably be ignored.

Adding Maarten and Peter to this, so they can mentally prepare for the
possibility that we'll go totally crazy :-)

Cheers, Daniel

> ---
>  tests/i915/gem_exec_schedule.c | 155 +++++++++++++++++++++++++++++++++
>  1 file changed, 155 insertions(+)
>
> diff --git a/tests/i915/gem_exec_schedule.c b/tests/i915/gem_exec_schedule.c
> index 84581bffe..0af7b4c6d 100644
> --- a/tests/i915/gem_exec_schedule.c
> +++ b/tests/i915/gem_exec_schedule.c
> @@ -23,10 +23,16 @@
>
>  #include "config.h"
>
> +#include <linux/userfaultfd.h>
> +
> +#include <pthread.h>
>  #include <sys/poll.h>
>  #include <sys/ioctl.h>
> +#include <sys/mman.h>
> +#include <sys/syscall.h>
>  #include <sched.h>
>  #include <signal.h>
> +#include <unistd.h>
>
>  #include "igt.h"
>  #include "igt_rand.h"
> @@ -1625,6 +1631,152 @@ static void test_pi_ringfull(int fd, unsigned int engine, unsigned int flags)
>         munmap(result, 4096);
>  }
>
> +static int userfaultfd(int flags)
> +{
> +       return syscall(SYS_userfaultfd, flags);
> +}
> +
> +struct ufd_thread {
> +       pthread_t thread;
> +       uint32_t batch;
> +       uint32_t *page;
> +       unsigned int engine;
> +       int i915;
> +};
> +
> +static uint32_t create_userptr(int i915, void *page)
> +{
> +       uint32_t handle;
> +
> +       gem_userptr(i915, page, 4096, 0, 0, &handle);
> +       return handle;
> +}
> +
> +static void *ufd_thread(void *arg)
> +{
> +       struct ufd_thread *t = arg;
> +       struct drm_i915_gem_exec_object2 obj[2] = {
> +               { .handle = create_userptr(t->i915, t->page) },
> +               { .handle = t->batch },
> +       };
> +       struct drm_i915_gem_execbuffer2 eb = {
> +               .buffers_ptr = to_user_pointer(obj),
> +               .buffer_count = ARRAY_SIZE(obj),
> +               .flags = t->engine,
> +               .rsvd1 = gem_context_create(t->i915),
> +       };
> +       gem_context_set_priority(t->i915, eb.rsvd1, MIN_PRIO);
> +
> +       igt_debug("submitting fault\n");
> +       gem_execbuf(t->i915, &eb);
> +       gem_sync(t->i915, obj[0].handle);
> +       gem_close(t->i915, obj[0].handle);
> +
> +       gem_context_destroy(t->i915, eb.rsvd1);
> +
> +       t->i915 = -1;
> +       return NULL;
> +}
> +
> +static void test_pi_userfault(int i915, unsigned int engine)
> +{
> +       struct uffdio_api api = { .api = UFFD_API };
> +       struct uffdio_register reg;
> +       struct uffdio_copy copy;
> +       struct uffd_msg msg;
> +       struct ufd_thread t;
> +       char poison[4096];
> +       int ufd;
> +
> +       /*
> +        * Resource contention can easily lead to priority inversion problems
> +        * that we wish to avoid. Here, we simulate one simple form of resource
> +        * starvation by using an arbitrarily slow userspace fault handler to cause
> +        * the low priority context to block waiting for its resource. While it
> +        * is blocked, it should not prevent a higher priority context from
> +        * executing.
> +        *
> +        * This is only a very simple scenario, in more general tests we will
> +        * need to simulate contention on the shared resource such that both
> +        * low and high priority contexts are starving and must fight over
> +        * the meagre resources. One step at a time.
> +        */
> +
> +       ufd = userfaultfd(0);
> +       igt_require_f(ufd != -1, "kernel support for userfaultfd\n");
> +       igt_require_f(ioctl(ufd, UFFDIO_API, &api) == 0 && api.api == UFFD_API,
> +                     "userfaultfd API v%lld:%lld\n", UFFD_API, api.api);
> +
> +       t.i915 = i915;
> +       t.engine = engine;
> +
> +       t.page = mmap(NULL, 4096, PROT_WRITE, MAP_SHARED | MAP_ANON, -1, 0);
> +       igt_assert(t.page != MAP_FAILED);
> +
> +       t.batch = gem_create(i915, 4096);
> +       memset(poison, 0xff, sizeof(poison));
> +       gem_write(i915, t.batch, 0, poison, 4096);
> +
> +       /* Register our fault handler for t.page */
> +       memset(&reg, 0, sizeof(reg));
> +       reg.mode = UFFDIO_REGISTER_MODE_MISSING;
> +       reg.range.start = to_user_pointer(t.page);
> +       reg.range.len = 4096;
> +       do_ioctl(ufd, UFFDIO_REGISTER, &reg);
> +       igt_assert(reg.ioctls == UFFD_API_RANGE_IOCTLS);
> +
> +       /* Kick off the low priority submission */
> +       igt_assert(pthread_create(&t.thread, NULL, ufd_thread, &t) == 0);
> +
> +       /* Wait until the low priority thread is blocked on a fault */
> +       igt_assert_eq(read(ufd, &msg, sizeof(msg)), sizeof(msg));
> +       igt_assert_eq(msg.event, UFFD_EVENT_PAGEFAULT);
> +       igt_assert(from_user_pointer(msg.arg.pagefault.address) == t.page);
> +
> +       /* While the low priority context is blocked; execute a vip */
> +       if (1) {
> +               const uint32_t bbe = MI_BATCH_BUFFER_END;
> +               struct drm_i915_gem_exec_object2 obj = {
> +                       .handle = t.batch,
> +               };
> +               struct pollfd pfd;
> +               struct drm_i915_gem_execbuffer2 eb = {
> +                       .buffers_ptr = to_user_pointer(&obj),
> +                       .buffer_count = 1,
> +                       .flags = engine | I915_EXEC_FENCE_OUT,
> +                       .rsvd1 = gem_context_create(i915),
> +               };
> +               gem_context_set_priority(i915, eb.rsvd1, MAX_PRIO);
> +               gem_write(i915, obj.handle, 0, &bbe, sizeof(bbe));
> +               gem_execbuf_wr(i915, &eb);
> +
> +               memset(&pfd, 0, sizeof(pfd));
> +               pfd.fd = eb.rsvd2 >> 32;
> +               pfd.events = POLLIN;
> +               poll(&pfd, 1, -1);
> +               igt_assert_eq(sync_fence_status(pfd.fd), 1);
> +               close(pfd.fd);
> +
> +               gem_context_destroy(i915, eb.rsvd1);
> +       }
> +
> +       /* Confirm the low priority context is still waiting */
> +       igt_assert_eq(t.i915, i915);
> +
> +       /* Service the fault; releasing the low priority context */
> +       memset(&copy, 0, sizeof(copy));
> +       copy.dst = msg.arg.pagefault.address;
> +       copy.src = to_user_pointer(memset(poison, 0xc5, sizeof(poison)));
> +       copy.len = 4096;
> +       do_ioctl(ufd, UFFDIO_COPY, &copy);
> +
> +       pthread_join(t.thread, NULL);
> +
> +       gem_close(i915, t.batch);
> +       munmap(t.page, 4096);
> +       close(ufd);
> +}
> +
>  static void measure_semaphore_power(int i915)
>  {
>         struct rapl gpu, pkg;
> @@ -1864,6 +2016,9 @@ igt_main
>
>                                 igt_subtest_f("pi-common-%s", e->name)
>                                         test_pi_ringfull(fd, eb_ring(e), SHARED);
> +
> +                               igt_subtest_f("pi-userfault-%s", e->name)
> +                                       test_pi_userfault(fd, eb_ring(e));
>                         }
>                 }
>         }
> --
> 2.24.0
>



-- 
Daniel Vetter
Software Engineer, Intel Corporation
+41 (0) 79 365 57 48 - http://blog.ffwll.ch

