[Intel-gfx] [PATCH v2 2/2] drm/i915: add in-kernel blitter client
Chris Wilson
chris at chris-wilson.co.uk
Tue May 7 11:57:02 UTC 2019
Quoting Matthew Auld (2019-05-07 11:55:57)
> The plan is to use the blitter engine for async object clearing when
> using local memory, but before we can move the worker to get_pages() we
> have to first tame some more of our struct_mutex usage. With this in
> mind we should be able to upstream the object clearing as some
> selftests, which should serve as a guinea pig for the ongoing locking
> rework and upcoming async get_pages() framework.
>
> Signed-off-by: Matthew Auld <matthew.auld at intel.com>
> ---
> +struct clear_pages_work {
> + struct dma_fence dma;
> + struct dma_fence_cb cb;
> + struct i915_sw_fence wait;
> + struct work_struct work;
> + struct irq_work irq_work;
> + struct i915_sleeve *sleeve;
> + struct intel_context *ce;
> + u32 value;
> +};
> +
> +static const char *clear_pages_work_driver_name(struct dma_fence *fence)
> +{
> + return DRIVER_NAME;
> +}
> +
> +static const char *clear_pages_work_timeline_name(struct dma_fence *fence)
> +{
> + return "clear";
> +}
> +
> +static void clear_pages_work_release(struct dma_fence *fence)
> +{
> + struct clear_pages_work *w = container_of(fence, typeof(*w), dma);
> +
> + destroy_sleeve(w->sleeve);
> +
> + i915_sw_fence_fini(&w->wait);
> +
> + BUILD_BUG_ON(offsetof(typeof(*w), dma));
> + dma_fence_free(&w->dma);
> +}
> +
> +static const struct dma_fence_ops clear_pages_work_ops = {
> + .get_driver_name = clear_pages_work_driver_name,
> + .get_timeline_name = clear_pages_work_timeline_name,
> + .release = clear_pages_work_release,
> +};
> +
> +static void clear_pages_signal_irq_worker(struct irq_work *work)
> +{
> + struct clear_pages_work *w = container_of(work, typeof(*w), irq_work);
> +
> + dma_fence_signal(&w->dma);
> + dma_fence_put(&w->dma);
> +}
> +
> +static void clear_pages_dma_fence_cb(struct dma_fence *fence,
> + struct dma_fence_cb *cb)
> +{
> + struct clear_pages_work *w = container_of(cb, typeof(*w), cb);
> +
> + /*
> + * Push the signalling of the fence into yet another worker to avoid
> + * the nightmare locking around the fence spinlock.
> + */
> + irq_work_queue(&w->irq_work);
> +}
> +
> +static void clear_pages_worker(struct work_struct *work)
> +{
> + struct clear_pages_work *w = container_of(work, typeof(*w), work);
> + struct drm_i915_private *i915 = w->ce->gem_context->i915;
> + struct drm_i915_gem_object *obj = w->sleeve->obj;
> + struct i915_vma *vma = w->sleeve->vma;
> + struct i915_request *rq;
> + int err;
> +
> + if (w->dma.error)
> + goto out_signal;
> +
> + if (obj->cache_dirty) {
> + obj->write_domain = 0;
> + if (i915_gem_object_has_struct_page(obj))
> + drm_clflush_sg(w->sleeve->pages);
> + obj->cache_dirty = false;
Interesting. If we have no struct_page, can we be cache_dirty here?
That might be a useful thought exercise and worth verifying at odd
points.
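If the answer is no, a throwaway assert might be worth it (untested
sketch):

	GEM_BUG_ON(obj->cache_dirty &&
		   !i915_gem_object_has_struct_page(obj));

so we never silently clear cache_dirty without having done the flush.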
> + }
> +
> + mutex_lock(&i915->drm.struct_mutex);
This will become vm->mutex. But that's why we need this patch so that
we can trim down the locking with a working test.
> + err = i915_vma_pin(vma, 0, 0, PIN_USER);
> + if (unlikely(err)) {
> + mutex_unlock(&i915->drm.struct_mutex);
> + dma_fence_set_error(&w->dma, err);
> + goto out_signal;
> + }
> +
> + rq = i915_request_create(w->ce);
> + if (IS_ERR(rq)) {
> + err = PTR_ERR(rq);
> + goto out_unpin;
> + }
> +
> + err = intel_emit_vma_fill_blt(rq, vma, w->value);
> + if (unlikely(err))
> + goto out_request;
> +
> + err = i915_vma_move_to_active(vma, rq, EXEC_OBJECT_WRITE);
> +out_request:
> + if (unlikely(err))
> + i915_request_skip(rq, err);
> + else
> + i915_request_get(rq);
> +
> + i915_request_add(rq);
> +out_unpin:
> + i915_vma_unpin(vma);
> +
> + mutex_unlock(&i915->drm.struct_mutex);
> +
> + if (!err) {
> + err = dma_fence_add_callback(&rq->fence, &w->cb,
> + clear_pages_dma_fence_cb);
> + i915_request_put(rq);
> + if (!err)
> + return;
This should be rearranged such that after we have an rq allocated, we
always attach the callback and propagate via the callback, even on the
error path. That should tidy this up quite a bit. (It's pretty much the
point of why we always i915_request_add even when skipping: we always
have an intact timeline for conveying errors.)
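i.e. something along these lines (rough sketch only, not the final
shape):

	rq = i915_request_create(w->ce);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto out_unpin;
	}

	/* There's no way the fence can have signalled yet */
	if (dma_fence_add_callback(&rq->fence, &w->cb,
				   clear_pages_dma_fence_cb))
		GEM_BUG_ON(1);

	err = intel_emit_vma_fill_blt(rq, vma, w->value);
	if (!err)
		err = i915_vma_move_to_active(vma, rq, EXEC_OBJECT_WRITE);
	if (unlikely(err))
		i915_request_skip(rq, err);

	i915_request_add(rq);

with clear_pages_dma_fence_cb copying fence->error into w->dma via
dma_fence_set_error() before queuing the irq_work. No more juggling
i915_request_get/put around the callback.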
> + } else {
> + dma_fence_set_error(&w->dma, err);
> + }
> +out_signal:
> + dma_fence_signal(&w->dma);
> + dma_fence_put(&w->dma);
> +}
> +
> +static int __i915_sw_fence_call
> +clear_pages_work_notify(struct i915_sw_fence *fence,
> + enum i915_sw_fence_notify state)
> +{
> + struct clear_pages_work *w = container_of(fence, typeof(*w), wait);
> +
> + switch (state) {
> + case FENCE_COMPLETE:
> + schedule_work(&w->work);
> + break;
> +
> + case FENCE_FREE:
> + dma_fence_put(&w->dma);
> + break;
> + }
> +
> + return NOTIFY_DONE;
> +}
> +
> +static DEFINE_SPINLOCK(fence_lock);
> +
> +int i915_gem_schedule_fill_pages_blt(struct drm_i915_gem_object *obj,
Not sold on this name. Scheduling is inherent in the name GEM, and this
takes the i915_gem_object as its primary argument. I'd favour
i915_gem_object_fill_blt, though it's really part of the mman family. To
be resolved later.
> + struct intel_context *ce,
> + struct sg_table *pages,
> + struct i915_page_sizes *page_sizes,
> + u32 value)
> +{
> + struct drm_i915_private *i915 = to_i915(obj->base.dev);
> + struct i915_gem_context *ctx = ce->gem_context;
> + struct i915_address_space *vm;
> + struct clear_pages_work *work;
> + struct i915_sleeve *sleeve;
> + int err;
> +
> + vm = ctx->ppgtt ? &ctx->ppgtt->vm : &i915->ggtt.vm;
Remind me, this needs to be ce->vm.
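i.e. just

	vm = ce->vm;

once intel_context carries its own vm pointer (assuming that lands as
planned).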
> + sleeve = create_sleeve(vm, obj, pages, page_sizes);
> + if (IS_ERR(sleeve))
> + return PTR_ERR(sleeve);
> +
> + work = kmalloc(sizeof(*work), GFP_KERNEL);
> + if (work == NULL) {
> + destroy_sleeve(sleeve);
> + return -ENOMEM;
> + }
> +
> + work->value = value;
> + work->sleeve = sleeve;
> + work->ce = ce;
> +
> + INIT_WORK(&work->work, clear_pages_worker);
> +
> + init_irq_work(&work->irq_work, clear_pages_signal_irq_worker);
> +
> + dma_fence_init(&work->dma,
> + &clear_pages_work_ops,
> + &fence_lock,
> + i915->mm.unordered_timeline,
> + 0);
> + i915_sw_fence_init(&work->wait, clear_pages_work_notify);
> +
> + i915_gem_object_lock(obj);
> + err = i915_sw_fence_await_reservation(&work->wait,
> + obj->resv, NULL,
> + true, I915_FENCE_TIMEOUT,
> + I915_FENCE_GFP);
> + if (err < 0) {
> + dma_fence_set_error(&work->dma, err);
> + } else {
> + reservation_object_add_excl_fence(obj->resv, &work->dma);
> + err = 0;
> + }
> + i915_gem_object_unlock(obj);
> +
> + dma_fence_get(&work->dma);
> + i915_sw_fence_commit(&work->wait);
> +
> + return err;
> +}
> +
> +#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
> +#include "selftests/i915_gem_client_blt.c"
> +#endif
> diff --git a/drivers/gpu/drm/i915/i915_gem_client_blt.h b/drivers/gpu/drm/i915/i915_gem_client_blt.h
> new file mode 100644
> index 000000000000..a7080623e741
> --- /dev/null
> +++ b/drivers/gpu/drm/i915/i915_gem_client_blt.h
> @@ -0,0 +1,21 @@
> +/* SPDX-License-Identifier: MIT */
> +/*
> + * Copyright © 2019 Intel Corporation
> + */
> +#ifndef __I915_GEM_CLIENT_BLT_H__
> +#define __I915_GEM_CLIENT_BLT_H__
> +
> +#include <linux/types.h>
> +
> +struct drm_i915_gem_object;
> +struct intel_context;
> +struct i915_page_sizes;
> +struct sg_table;
> +
> +int i915_gem_schedule_fill_pages_blt(struct drm_i915_gem_object *obj,
> + struct intel_context *ce,
> + struct sg_table *pages,
> + struct i915_page_sizes *page_sizes,
> + u32 value);
> +
> +#endif
> diff --git a/drivers/gpu/drm/i915/i915_gem_object_blt.c b/drivers/gpu/drm/i915/i915_gem_object_blt.c
> new file mode 100644
> index 000000000000..3fda33e5dcf5
> --- /dev/null
> +++ b/drivers/gpu/drm/i915/i915_gem_object_blt.c
> @@ -0,0 +1,103 @@
> +// SPDX-License-Identifier: MIT
> +/*
> + * Copyright © 2019 Intel Corporation
> + */
> +
> +#include "i915_gem_object_blt.h"
> +
> +#include "i915_gem_clflush.h"
> +#include "intel_drv.h"
> +
> +int intel_emit_vma_fill_blt(struct i915_request *rq,
> + struct i915_vma *vma,
> + u32 value)
> +{
> + struct intel_context *ce = rq->hw_context;
> + u32 *cs;
> + int err;
> +
> + if (ce->engine->emit_init_breadcrumb) {
> + err = ce->engine->emit_init_breadcrumb(rq);
> + if (unlikely(err))
> + return err;
> + }
Though it may push some duplication into the callers, this is the
caller's duty.
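i.e. hoisted to the call site, something like (sketch only; both
callers already have an out_request label):

	if (ce->engine->emit_init_breadcrumb) {
		err = ce->engine->emit_init_breadcrumb(rq);
		if (unlikely(err))
			goto out_request;
	}

	err = intel_emit_vma_fill_blt(rq, vma, value);

leaving intel_emit_vma_fill_blt() to do nothing but fill the ring.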
> + cs = intel_ring_begin(rq, 8);
if (IS_ERR(cs))
	return PTR_ERR(cs);
> +
> + if (INTEL_GEN(rq->i915) >= 8) {
> + *cs++ = XY_COLOR_BLT_CMD | BLT_WRITE_RGBA | (7-2);
> + *cs++ = BLT_DEPTH_32 | BLT_ROP_COLOR_COPY | PAGE_SIZE;
> + *cs++ = 0;
> + *cs++ = vma->size >> PAGE_SHIFT << 16 | PAGE_SIZE / 4;
> + *cs++ = lower_32_bits(vma->node.start);
> + *cs++ = upper_32_bits(vma->node.start);
> + *cs++ = value;
> + *cs++ = MI_NOOP;
> + } else {
> + *cs++ = XY_COLOR_BLT_CMD | BLT_WRITE_RGBA | (6-2);
> + *cs++ = BLT_DEPTH_32 | BLT_ROP_COLOR_COPY | PAGE_SIZE;
> + *cs++ = 0;
> + *cs++ = vma->size >> PAGE_SHIFT << 16 | PAGE_SIZE / 4;
> + *cs++ = vma->node.start;
> + *cs++ = value;
> + *cs++ = MI_NOOP;
> + *cs++ = MI_NOOP;
> + }
> +
> + intel_ring_advance(rq, cs);
> +
> + return 0;
> +}
> +
> +int i915_gem_object_fill_blt(struct drm_i915_gem_object *obj,
> + struct intel_context *ce,
> + u32 value)
> +{
> + struct drm_i915_private *i915 = to_i915(obj->base.dev);
> + struct i915_gem_context *ctx = ce->gem_context;
> + struct i915_address_space *vm;
> + struct i915_request *rq;
> + struct i915_vma *vma;
> + int err;
> +
> + vm = ctx->ppgtt ? &ctx->ppgtt->vm : &i915->ggtt.vm;
> +
> + vma = i915_vma_instance(obj, vm, NULL);
> + if (IS_ERR(vma))
> + return PTR_ERR(vma);
> +
> + err = i915_vma_pin(vma, 0, 0, PIN_USER);
> + if (unlikely(err))
> + return err;
> +
> + if (obj->cache_dirty)
if (obj->cache_dirty & ~obj->cache_coherent)
> + i915_gem_clflush_object(obj, 0);
> +
> + rq = i915_request_create(ce);
> + if (IS_ERR(rq)) {
> + err = PTR_ERR(rq);
> + goto out_unpin;
> + }
> +
> + err = i915_request_await_object(rq, obj, true);
> + if (unlikely(err))
> + goto out_request;
> +
> + err = intel_emit_vma_fill_blt(rq, vma, value);
> + if (unlikely(err))
> + goto out_request;
> +
> + err = i915_vma_move_to_active(vma, rq, EXEC_OBJECT_WRITE);
> +out_request:
> + if (unlikely(err))
> + i915_request_skip(rq, err);
> +
> + i915_request_add(rq);
> +out_unpin:
> + i915_vma_unpin(vma);
> + return err;
Ok.
> +}
> +
> +#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
> +#include "selftests/i915_gem_object_blt.c"
> +#endif
> diff --git a/drivers/gpu/drm/i915/i915_gem_object_blt.h b/drivers/gpu/drm/i915/i915_gem_object_blt.h
> new file mode 100644
> index 000000000000..7ec7de6ac0c0
> --- /dev/null
> +++ b/drivers/gpu/drm/i915/i915_gem_object_blt.h
> @@ -0,0 +1,24 @@
> +/* SPDX-License-Identifier: MIT */
> +/*
> + * Copyright © 2019 Intel Corporation
> + */
> +
> +#ifndef __I915_GEM_OBJECT_BLT_H__
> +#define __I915_GEM_OBJECT_BLT_H__
> +
> +#include <linux/types.h>
> +
> +struct drm_i915_gem_object;
> +struct intel_context;
> +struct i915_request;
> +struct i915_vma;
> +
> +int intel_emit_vma_fill_blt(struct i915_request *rq,
> + struct i915_vma *vma,
> + u32 value);
> +
> +int i915_gem_object_fill_blt(struct drm_i915_gem_object *obj,
> + struct intel_context *ce,
> + u32 value);
> +
> +#endif
> diff --git a/drivers/gpu/drm/i915/selftests/i915_gem_client_blt.c b/drivers/gpu/drm/i915/selftests/i915_gem_client_blt.c
> new file mode 100644
> index 000000000000..54b15d22e310
> --- /dev/null
> +++ b/drivers/gpu/drm/i915/selftests/i915_gem_client_blt.c
> @@ -0,0 +1,131 @@
> +// SPDX-License-Identifier: MIT
> +/*
> + * Copyright © 2019 Intel Corporation
> + */
> +
> +#include "../i915_selftest.h"
> +
> +#include "igt_flush_test.h"
> +#include "mock_drm.h"
> +#include "mock_context.h"
> +
> +static int igt_client_fill(void *arg)
> +{
> + struct intel_context *ce = arg;
> + struct drm_i915_private *i915 = ce->gem_context->i915;
> + struct drm_i915_gem_object *obj;
> + struct rnd_state prng;
> + IGT_TIMEOUT(end);
> + u32 *vaddr;
> + int err = 0;
> +
> + prandom_seed_state(&prng, i915_selftest.random_seed);
> +
> + do {
> + u32 sz = prandom_u32_state(&prng) % SZ_32M;
> + u32 val = prandom_u32_state(&prng);
> + u32 i;
> +
> + sz = round_up(sz, PAGE_SIZE);
> +
> + pr_info("%s with sz=%x, val=%x\n", __func__, sz, val);
pr_debug? Won't this be quite frequent?
> + obj = i915_gem_object_create_internal(i915, sz);
> + if (IS_ERR(obj)) {
> + err = PTR_ERR(obj);
> + goto err_flush;
> + }
> +
> + vaddr = i915_gem_object_pin_map(obj, I915_MAP_WB);
> + if (IS_ERR(vaddr)) {
> + err = PTR_ERR(vaddr);
> + goto err_put;
> + }
> +
> + /*
> + * XXX: The goal is move this to get_pages, so try to dirty the
> + * CPU cache first to check that we do the required clflush
> + * before scheduling the blt for !llc platforms. This matches
> + * some version of reality where at get_pages the pages
> + * themselves may not yet be coherent with the GPU(swap-in). If
> + * we are missing the flush then we should see the stale cache
> + * values after we do the set_to_cpu_domain and pick it up as a
> + * test failure.
> + */
> + memset32(vaddr, val ^ 0xdeadbeaf, obj->base.size / sizeof(u32));
> +
> + if (!(obj->cache_coherent & I915_BO_CACHE_COHERENT_FOR_WRITE))
> + obj->cache_dirty = true;
> +
> + err = i915_gem_schedule_fill_pages_blt(obj, ce, obj->mm.pages,
> + &obj->mm.page_sizes,
> + val);
> + if (err)
> + goto err_unpin;
> +
> + /*
> + * XXX: For now do the wait without the BKL to ensure we don't
> + * deadlock.
> + */
> + err = i915_gem_object_wait(obj,
> + I915_WAIT_INTERRUPTIBLE |
> + I915_WAIT_ALL,
> + MAX_SCHEDULE_TIMEOUT);
> + if (err)
> + goto err_unpin;
> +
> + mutex_lock(&i915->drm.struct_mutex);
> + err = i915_gem_object_set_to_cpu_domain(obj, false);
> + mutex_unlock(&i915->drm.struct_mutex);
> + if (err)
> + goto err_unpin;
> +
> + for (i = 0; i < obj->base.size / sizeof(u32); ++i) {
> + if (vaddr[i] != val) {
> + pr_err("vaddr[%u]=%x, expected=%x\n", i,
> + vaddr[i], val);
> + err = -EINVAL;
> + goto err_unpin;
> + }
> + }
> +
> + i915_gem_object_unpin_map(obj);
> +
> + mutex_lock(&i915->drm.struct_mutex);
> + __i915_gem_object_release_unless_active(obj);
> + mutex_unlock(&i915->drm.struct_mutex);
> + } while (!time_after(jiffies, end));
> +
> + goto err_flush;
> +
> +err_unpin:
> + i915_gem_object_unpin_map(obj);
> +err_put:
> + mutex_lock(&i915->drm.struct_mutex);
> + __i915_gem_object_release_unless_active(obj);
> + mutex_unlock(&i915->drm.struct_mutex);
> +err_flush:
> + mutex_lock(&i915->drm.struct_mutex);
> + igt_flush_test(i915, I915_WAIT_LOCKED);
if (igt_flush_test(i915, I915_WAIT_LOCKED))
	err = -EIO;
When it fails, we are wedged, so promote the result to -EIO.
> + mutex_unlock(&i915->drm.struct_mutex);
> +
> + if (err == -ENOMEM)
> + err = 0;
> +
> + return err;
> +}
So much work to do to reduce lock coverage so that we can emit requests
from inside obj->mm.lock. This code is very much a WIP and not near
ready for actual use, but serves a very, very useful purpose in
providing a test bed for incremental improvements.
-Chris