[Intel-gfx] [PATCH 12/13] drm/i915: Add sync framework support to execbuff IOCTL
John Harrison
John.C.Harrison at Intel.com
Mon Dec 14 03:46:22 PST 2015
On 11/12/2015 15:29, Tvrtko Ursulin wrote:
>
>
> On 11/12/15 13:12, John.C.Harrison at Intel.com wrote:
>> From: John Harrison <John.C.Harrison at Intel.com>
>>
>> Various projects desire a mechanism for managing dependencies between
>> work items asynchronously. This can also include work items across
>> completely different and independent systems. For example, an
>> application wants to retrieve a frame from a video-in device, use it
>> for rendering on a GPU, then send it to the video-out device for
>> display, all without having to stall waiting for completion along the
>> way. The sync framework allows this. It encapsulates synchronisation
>> events in file descriptors. The application can request a sync point
>> for the completion of each piece of work. Drivers should also take
>> sync points in with each new work request and not schedule the work
>> to start until the sync has been signalled.
>>
>> This patch adds sync framework support to the exec buffer IOCTL. A
>> sync point can be passed in to stall execution of the batch buffer
>> until signalled. And a sync point can be returned after each batch
>> buffer submission which will be signalled upon that batch buffer's
>> completion.
>>
>> At present, the input sync point is simply waited on synchronously
>> inside the exec buffer IOCTL call. Once the GPU scheduler arrives,
>> this will be handled asynchronously inside the scheduler and the IOCTL
>> can return without having to wait.
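For reference, the intended user space flow looks roughly like the sketch
below. It is written against the uAPI added by this patch
(I915_EXEC_WAIT_FENCE, I915_EXEC_CREATE_FENCE and the fence fd in rsvd2);
the helper name is illustrative and the buffer/relocation setup is elided.

#include <stdint.h>
#include <sys/ioctl.h>
#include <drm/i915_drm.h>

/* Illustrative helper, not part of the patch. */
static int submit_with_fences(int drm_fd, int in_fence_fd,
                              struct drm_i915_gem_execbuffer2 *execbuf)
{
        execbuf->flags |= I915_EXEC_CREATE_FENCE;

        if (in_fence_fd >= 0) {
                /* The low 32 bits of rsvd2 carry the fence fd to wait on. */
                execbuf->rsvd2 = (uint32_t)in_fence_fd;
                execbuf->flags |= I915_EXEC_WAIT_FENCE;
        }

        if (ioctl(drm_fd, DRM_IOCTL_I915_GEM_EXECBUFFER2, execbuf))
                return -1;

        /* On success, the completion fence fd comes back in rsvd2. */
        return (int)execbuf->rsvd2;
}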
>>
>> Note also that the scheduler will re-order the execution of batch
>> buffers, e.g. because a batch buffer is stalled on a sync point and
>> cannot be submitted yet but other, independent, batch buffers are
>> being presented to the driver. This means that the timeline within the
>> sync points returned cannot be global to the engine. Instead they must
>> be kept per context per engine (the scheduler may not re-order batches
>> within a context). Hence the timeline cannot be based on the existing
>> seqno values but must be a new implementation.
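Purely to illustrate the idea (the names below are invented for the
example and are not the ones this series actually uses), such a
per-context, per-engine timeline might look roughly like:

/*
 * Illustration only: one possible shape for a per-context, per-engine
 * timeline. One instance exists per (context, engine) pair, so the
 * scheduler can reorder batches between contexts while seqnos within
 * any one timeline still complete in order.
 */
struct example_fence_timeline {
        char                    name[32];
        unsigned                fence_context;  /* e.g. from fence_context_alloc() */
        unsigned                next_seqno;     /* local to this timeline */
        spinlock_t              lock;           /* protects next_seqno */
        struct intel_context    *ctx;
        struct intel_engine_cs  *ring;
};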
>>
>> This patch is a port of work by several people that has been pulled
>> across from Android. It has been updated several times across several
>> patches. Rather than attempt to port each individual patch, this
>> version is the finished product as a single patch. The various
>> contributors/authors along the way (in addition to myself) were:
>> Satyanantha RamaGopal M <rama.gopal.m.satyanantha at intel.com>
>> Tvrtko Ursulin <tvrtko.ursulin at intel.com>
>> Michel Thierry <michel.thierry at intel.com>
>> Arun Siluvery <arun.siluvery at linux.intel.com>
>>
>> v2: New patch in series.
>>
>> v3: Updated to use the new 'sync_fence_is_signaled' API rather than
>> having to know about the internal meaning of the 'fence::status' field
>> (which recently got inverted!) [work by Peter Lawthers].
>>
>> Updated after review comments by Daniel Vetter. Removed '#ifdef
>> CONFIG_SYNC' and added 'select SYNC' to the Kconfig instead. Moved
>> the fd installation of fences to the end of the execbuff call in
>> order to remove the need to use 'sys_close' to clean up on failure.
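The rework referred to above is the usual reserve-the-fd-early,
install-the-file-late pattern: as long as no file has been installed,
put_unused_fd() is sufficient to back out. A generic sketch of the
pattern (not the patch's exact code):

static int example_export_fence(struct sync_fence *fence)
{
        int fd = get_unused_fd_flags(O_CLOEXEC);

        if (fd < 0)
                return fd;

        /*
         * Anything that can still fail happens before the install. On
         * failure, put_unused_fd(fd) is enough because no file has been
         * made visible to user space yet, so no sys_close() is needed.
         */

        sync_fence_install(fence, fd);  /* point of no return */
        return fd;
}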
>>
>> Updated after review comments by Tvrtko Ursulin. Removed the
>> 'fence_external' flag as redundant. Converted DRM_ERRORs to
>> DRM_DEBUGs. Changed the one second wait to a wait forever when
>> waiting on incoming fences.
>>
>> v4: Re-instated missing return of fd to user land that somehow got
>> lost in the anti-sys_close() re-factor.
>>
>> For: VIZ-5190
>> Signed-off-by: John Harrison <John.C.Harrison at Intel.com>
>> Signed-off-by: Peter Lawthers <peter.lawthers at intel.com>
>> Cc: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
>> Cc: Daniel Vetter <daniel at ffwll.ch>
>> ---
>> drivers/gpu/drm/i915/Kconfig               |  3 +
>> drivers/gpu/drm/i915/i915_drv.h            |  6 ++
>> drivers/gpu/drm/i915/i915_gem.c            | 89 +++++++++++++++++++++++++++-
>> drivers/gpu/drm/i915/i915_gem_execbuffer.c | 95 ++++++++++++++++++++++++++++--
>> include/uapi/drm/i915_drm.h                | 16 ++++-
>> 5 files changed, 200 insertions(+), 9 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/i915/Kconfig b/drivers/gpu/drm/i915/Kconfig
>> index 1d96fe1..cb5d5b2 100644
>> --- a/drivers/gpu/drm/i915/Kconfig
>> +++ b/drivers/gpu/drm/i915/Kconfig
>> @@ -22,6 +22,9 @@ config DRM_I915
>> select ACPI_VIDEO if ACPI
>> select ACPI_BUTTON if ACPI
>> select MMU_NOTIFIER
>
> select MMU_NOTIFIER is not upstream! :)
Oops. That's from a patch that is not being upstreamed. It is required
for OCL 2.0, which is something we are using to test the scheduler.
>
>> + # ANDROID is required for SYNC
>> + select ANDROID
>> + select SYNC
>> help
>> Choose this option if you have a system that has "Intel Graphics
>> Media Accelerator" or "HD Graphics" integrated graphics,
>> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
>> index d013c6d..194bca0 100644
>> --- a/drivers/gpu/drm/i915/i915_drv.h
>> +++ b/drivers/gpu/drm/i915/i915_drv.h
>> @@ -2278,6 +2278,12 @@ void i915_gem_request_notify(struct intel_engine_cs *ring, bool fence_locked);
>> int i915_create_fence_timeline(struct drm_device *dev,
>> struct intel_context *ctx,
>> struct intel_engine_cs *ring);
>> +struct sync_fence;
>> +int i915_create_sync_fence(struct drm_i915_gem_request *req,
>> + struct sync_fence **sync_fence, int *fence_fd);
>> +void i915_install_sync_fence_fd(struct drm_i915_gem_request *req,
>> + struct sync_fence *sync_fence, int fence_fd);
>> +bool i915_safe_to_ignore_fence(struct intel_engine_cs *ring, struct sync_fence *fence);
>>
>> static inline bool i915_gem_request_completed(struct drm_i915_gem_request *req)
>> {
>> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
>> index 4817015..279d79f 100644
>> --- a/drivers/gpu/drm/i915/i915_gem.c
>> +++ b/drivers/gpu/drm/i915/i915_gem.c
>> @@ -37,6 +37,7 @@
>> #include <linux/swap.h>
>> #include <linux/pci.h>
>> #include <linux/dma-buf.h>
>> +#include <../drivers/android/sync.h>
>>
>> #define RQ_BUG_ON(expr)
>>
>> @@ -2560,7 +2561,13 @@ void __i915_add_request(struct drm_i915_gem_request *request,
>>
>> /*
>> * Add the fence to the pending list before emitting the commands to
>> - * generate a seqno notification interrupt.
>> + * generate a seqno notification interrupt. This will also enable
>> + * interrupts if 'signal_requested' has been set.
>> + *
>> + * For example, if an exported sync point has been requested for this
>> + * request then it can be waited on without the driver's knowledge,
>> + * i.e. without calling __i915_wait_request(). Thus interrupts must
>> + * be enabled from the start rather than only on demand.
>> */
>> i915_gem_request_submit(request);
>>
>> @@ -2901,6 +2908,86 @@ static unsigned i915_fence_timeline_get_next_seqno(struct i915_fence_timeline *t
>> return seqno;
>> }
>>
>> +int i915_create_sync_fence(struct drm_i915_gem_request *req,
>> + struct sync_fence **sync_fence, int *fence_fd)
>> +{
>> + char ring_name[] = "i915_ring0";
>> + int fd;
>> +
>> + fd = get_unused_fd_flags(O_CLOEXEC);
>> + if (fd < 0) {
>> + DRM_DEBUG("No available file descriptors!\n");
>> + *fence_fd = -1;
>> + return fd;
>> + }
>> +
>> + ring_name[9] += req->ring->id;
>
> I think this will possibly blow up if CONFIG_DEBUG_RODATA is set,
> which is the case on most kernels.
>
> So I think you need to make a local copy with kstrdup and free it
> after calling sync_fence_create_dma.
What will blow up? The ring_name local is a stack array, not a pointer
into the data segment. Did you miss the '[]'?
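For anyone following along, the distinction being made is the one below
(illustration only, not driver code): an array initialised from a string
literal is a writable copy on the stack, whereas a plain pointer would
reference the literal itself, which may live in read-only memory.

void example(int id)
{
        char on_stack[] = "i915_ring0";   /* literal copied onto the stack */
        char *in_rodata = "i915_ring0";   /* points at the literal itself */

        on_stack[9] += id;      /* fine: modifies the writable local copy */
        /* in_rodata[9] += id;     would fault with CONFIG_DEBUG_RODATA */
        (void)in_rodata;
}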
>
>> + *sync_fence = sync_fence_create_dma(ring_name, &req->fence);
>> + if (!*sync_fence) {
>> + put_unused_fd(fd);
>> + *fence_fd = -1;
>> + return -ENOMEM;
>> + }
>> +
>> + *fence_fd = fd;
>> +
>> + return 0;
>> +}
>> +
>> +void i915_install_sync_fence_fd(struct drm_i915_gem_request *req,
>> + struct sync_fence *sync_fence, int fence_fd)
>> +{
>> + sync_fence_install(sync_fence, fence_fd);
>> +
>> + /*
>> + * NB: The corresponding put happens automatically on file close
>> + * from sync_fence_release() via the fops callback.
>> + */
>> + fence_get(&req->fence);
>> +
>> + /*
>> + * The sync framework adds a callback to the fence. The fence
>> + * framework calls 'enable_signalling' when a callback is added.
>> + * Thus this flag should have been set by now. If not then
>> + * 'enable_signalling' must be called explicitly because exporting
>> + * a fence to user land means it can be waited on asynchronously and
>> + * thus must be signalled asynchronously.
>> + */
>> + WARN_ON(!req->signal_requested);
>> +}
>> +
>> +bool i915_safe_to_ignore_fence(struct intel_engine_cs *ring, struct sync_fence *sync_fence)
>> +{
>> + struct fence *dma_fence;
>> + struct drm_i915_gem_request *req;
>> + int i;
>> +
>> + if (sync_fence_is_signaled(sync_fence))
>> + return true;
>> +
>> + for(i = 0; i < sync_fence->num_fences; i++) {
>> + dma_fence = sync_fence->cbs[i].sync_pt;
>> +
>> + /* No need to worry about dead points: */
>> + if (fence_is_signaled(dma_fence))
>> + continue;
>> +
>> + /* Can't ignore other people's points: */
>
> Maybe add "unsignaled" to qualify.
The test above filters out anything that is signalled (or errored).
Stating that again on each subsequent test seems unnecessarily verbose.
>
>> + if(dma_fence->ops != &i915_gem_request_fops)
>> + return false;
>> +
>> + req = container_of(dma_fence, typeof(*req), fence);
>> +
>> + /* Can't ignore points on other rings: */
>> + if (req->ring != ring)
>> + return false;
>> +
>> + /* Same ring means guaranteed to be in order so ignore it. */
>> + }
>> +
>> + return true;
>> +}
>> +
>> int i915_gem_request_alloc(struct intel_engine_cs *ring,
>> struct intel_context *ctx,
>> struct drm_i915_gem_request **req_out)
>> diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
>> index bfc4c17..5f629f8 100644
>> --- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
>> +++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
>> @@ -26,6 +26,7 @@
>> *
>> */
>>
>> +#include <linux/syscalls.h>
>> #include <drm/drmP.h>
>> #include <drm/i915_drm.h>
>> #include "i915_drv.h"
>> @@ -33,6 +34,7 @@
>> #include "intel_drv.h"
>> #include <linux/dma_remapping.h>
>> #include <linux/uaccess.h>
>> +#include <../drivers/android/sync.h>
>>
>> #define __EXEC_OBJECT_HAS_PIN (1<<31)
>> #define __EXEC_OBJECT_HAS_FENCE (1<<30)
>> @@ -1322,6 +1324,38 @@ eb_get_batch(struct eb_vmas *eb)
>> return vma->obj;
>> }
>>
>> +static int i915_early_fence_wait(struct intel_engine_cs *ring, int fence_fd)
>> +{
>> + struct sync_fence *fence;
>> + int ret = 0;
>> +
>> + if (fence_fd < 0) {
>> + DRM_DEBUG("Invalid wait fence fd %d on ring %d\n", fence_fd,
>> + (int) ring->id);
>> + return 1;
>
> Suggest adding kerneldoc describing return values from this function.
>
> It wasn't immediately clear to me what a return value of one means.
To be honest, I think the 'return 1' is left over from an earlier
iteration of the code which did things slightly differently. It could
probably just return zero or an error code.
> But I am also not sure that an invalid fd shouldn't be an outright
> error instead of allowing execbuf to continue.
Wasn't sure if it was possible for a fence to be invalidated behind the
back of an application. And you don't want to reject rendering from one
app just because another app went splat. I guess even if the underlying
fence has been destroyed, the fd will still be private to the current
app and hence valid. So maybe it should just return -EINVAL or some such
if the fd itself is toast.
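Something along these lines, perhaps. This is a hypothetical rework
based on the discussion above, not what the posted patch does; it also
drops the redundant signalled check that is pointed out further down.

static int i915_early_fence_wait(struct intel_engine_cs *ring, int fence_fd)
{
        struct sync_fence *fence;
        int ret = 0;

        if (fence_fd < 0) {
                DRM_DEBUG("Invalid wait fence fd %d on ring %d\n",
                          fence_fd, (int)ring->id);
                return -EINVAL;
        }

        fence = sync_fence_fdget(fence_fd);
        if (!fence) {
                DRM_DEBUG("Invalid wait fence %d on ring %d\n",
                          fence_fd, (int)ring->id);
                return -EINVAL;
        }

        /*
         * Waiting forever is safe here because the mutex has not yet
         * been acquired and sync_fence_wait() is interruptible.
         */
        if (!i915_safe_to_ignore_fence(ring, fence))
                ret = sync_fence_wait(fence, -1);

        sync_fence_put(fence);
        return ret;
}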
>
>> + }
>> +
>> + fence = sync_fence_fdget(fence_fd);
>> + if (fence == NULL) {
>> + DRM_DEBUG("Invalid wait fence %d on ring %d\n", fence_fd,
>> + (int) ring->id);
>> + return 1;
>> + }
>> +
>> + if (!sync_fence_is_signaled(fence)) {
>
> Minor comment, but i915_safe_to_ignore_fence checks this as well so
> you could remove it here.
>
>> + /*
>> + * Wait forever for the fence to be signalled. This is safe
>> + * because the mutex lock has not yet been acquired and
>> + * the wait is interruptible.
>> + */
>> + if (!i915_safe_to_ignore_fence(ring, fence))
>> + ret = sync_fence_wait(fence, -1);
>> + }
>> +
>> + sync_fence_put(fence);
>> + return ret;
>> +}
>> +
>> static int
>> i915_gem_do_execbuffer(struct drm_device *dev, void *data,
>> struct drm_file *file,
>> @@ -1341,6 +1375,17 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data,
>> u32 dispatch_flags;
>> int ret;
>> bool need_relocs;
>> + int fd_fence_complete = -1;
>> + int fd_fence_wait = lower_32_bits(args->rsvd2);
>> + struct sync_fence *sync_fence;
>> +
>> + /*
>> + * Make sure a broken fence handle is not returned no matter
>> + * how early an error might be hit. Note that rsvd2 has to be
>> + * saved away first because it is also an input parameter!
>> + */
>
> Instead of the 2nd sentence maybe say something like "Note that we
> have saved rsvd2 already for later use since it is also an input
> parameter!". As written, I was expecting the code following the
> comment to do that, and then was confused when it didn't. Or maybe my
> attention span is too short.
Will update the comment for those who can't remember what they read two
lines earlier...
>
>> + if (args->flags & I915_EXEC_CREATE_FENCE)
>> + args->rsvd2 = (__u64) -1;
>>
>> if (!i915_gem_check_execbuffer(args))
>> return -EINVAL;
>> @@ -1424,6 +1469,17 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data,
>> dispatch_flags |= I915_DISPATCH_RS;
>> }
>>
>> + /*
>> + * Without a GPU scheduler, any fence waits must be done up front.
>> + */
>> + if (args->flags & I915_EXEC_WAIT_FENCE) {
>> + ret = i915_early_fence_wait(ring, fd_fence_wait);
>> + if (ret < 0)
>> + return ret;
>> +
>> + args->flags &= ~I915_EXEC_WAIT_FENCE;
>> + }
>> +
>> intel_runtime_pm_get(dev_priv);
>>
>> ret = i915_mutex_lock_interruptible(dev);
>> @@ -1571,8 +1627,41 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data,
>> params->batch_obj = batch_obj;
>> params->ctx = ctx;
>>
>> + if (args->flags & I915_EXEC_CREATE_FENCE) {
>> + /*
>> + * Caller has requested a sync fence.
>> + * User interrupts will be enabled to make sure that
>> + * the timeline is signalled on completion.
>> + */
>
> Is it signaled or signalled? There is a lot of usage of both
> throughout the patches and I as a non-native speaker am
> amu^H^H^Hconfused. ;)
It depends which side of the Atlantic you are on. British English has a
double 'l', American English just a single one. So largely it depends
who wrote the code/comment and who (if anyone!) has reviewed it.
>
>> + ret = i915_create_sync_fence(params->request, &sync_fence,
>> + &fd_fence_complete);
>> + if (ret) {
>> + DRM_ERROR("Fence creation failed for ring %d, ctx %p\n",
>> + ring->id, ctx);
>> + goto err_batch_unpin;
>> + }
>> + }
>> +
>> ret = dev_priv->gt.execbuf_submit(params, args, &eb->vmas);
>>
>> + if (fd_fence_complete != -1) {
>> + if (ret) {
>> + sync_fence_put(sync_fence);
>> + put_unused_fd(fd_fence_complete);
>> + } else {
>> + /*
>> + * Install the fence into the pre-allocated file
>> + * descriptor to the fence object so that user land
>> + * can wait on it...
>> + */
>> + i915_install_sync_fence_fd(params->request,
>> + sync_fence, fd_fence_complete);
>> +
>> + /* Return the fence through the rsvd2 field */
>> + args->rsvd2 = (__u64) fd_fence_complete;
>> + }
>> + }
>> +
>> err_batch_unpin:
>> /*
>> * FIXME: We crucially rely upon the active tracking for the (ppgtt)
>> @@ -1602,6 +1691,7 @@ pre_mutex_err:
>> /* intel_gpu_busy should also get a ref, so it will free when the device
>> * is really idle. */
>> intel_runtime_pm_put(dev_priv);
>> +
>> return ret;
>> }
>>
>> @@ -1707,11 +1797,6 @@ i915_gem_execbuffer2(struct drm_device *dev, void *data,
>> return -EINVAL;
>> }
>>
>> - if (args->rsvd2 != 0) {
>> - DRM_DEBUG("dirty rvsd2 field\n");
>> - return -EINVAL;
>> - }
>> -
>> exec2_list = kmalloc(sizeof(*exec2_list)*args->buffer_count,
>> GFP_TEMPORARY | __GFP_NOWARN | __GFP_NORETRY);
>> if (exec2_list == NULL)
>> diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
>> index 67cebe6..86f7921 100644
>> --- a/include/uapi/drm/i915_drm.h
>> +++ b/include/uapi/drm/i915_drm.h
>> @@ -250,7 +250,7 @@ typedef struct _drm_i915_sarea {
>> #define DRM_IOCTL_I915_HWS_ADDR DRM_IOW(DRM_COMMAND_BASE + DRM_I915_HWS_ADDR, struct drm_i915_gem_init)
>> #define DRM_IOCTL_I915_GEM_INIT DRM_IOW(DRM_COMMAND_BASE + DRM_I915_GEM_INIT, struct drm_i915_gem_init)
>> #define DRM_IOCTL_I915_GEM_EXECBUFFER DRM_IOW(DRM_COMMAND_BASE + DRM_I915_GEM_EXECBUFFER, struct drm_i915_gem_execbuffer)
>> -#define DRM_IOCTL_I915_GEM_EXECBUFFER2 DRM_IOW(DRM_COMMAND_BASE + DRM_I915_GEM_EXECBUFFER2, struct drm_i915_gem_execbuffer2)
>> +#define DRM_IOCTL_I915_GEM_EXECBUFFER2 DRM_IOWR(DRM_COMMAND_BASE + DRM_I915_GEM_EXECBUFFER2, struct drm_i915_gem_execbuffer2)
>> #define DRM_IOCTL_I915_GEM_PIN DRM_IOWR(DRM_COMMAND_BASE + DRM_I915_GEM_PIN, struct drm_i915_gem_pin)
>> #define DRM_IOCTL_I915_GEM_UNPIN DRM_IOW(DRM_COMMAND_BASE + DRM_I915_GEM_UNPIN, struct drm_i915_gem_unpin)
>> #define DRM_IOCTL_I915_GEM_BUSY DRM_IOWR(DRM_COMMAND_BASE + DRM_I915_GEM_BUSY, struct drm_i915_gem_busy)
>> @@ -695,7 +695,7 @@ struct drm_i915_gem_exec_object2 {
>> __u64 flags;
>>
>> __u64 rsvd1;
>> - __u64 rsvd2;
>> + __u64 rsvd2; /* Used for fence fd */
>> };
>>
>> struct drm_i915_gem_execbuffer2 {
>> @@ -776,7 +776,17 @@ struct drm_i915_gem_execbuffer2 {
>> */
>> #define I915_EXEC_RESOURCE_STREAMER (1<<15)
>>
>> -#define __I915_EXEC_UNKNOWN_FLAGS -(I915_EXEC_RESOURCE_STREAMER<<1)
>> +/** Caller supplies a sync fence fd in the rsvd2 field.
>> + * Wait for it to be signalled before starting the work
>> + */
>> +#define I915_EXEC_WAIT_FENCE (1<<16)
>> +
>> +/** Caller wants a sync fence fd for this execbuffer.
>> + * It will be returned in rsvd2
>> + */
>> +#define I915_EXEC_CREATE_FENCE (1<<17)
>> +
>> +#define __I915_EXEC_UNKNOWN_FLAGS -(I915_EXEC_CREATE_FENCE<<1)
>>
>> #define I915_EXEC_CONTEXT_ID_MASK (0xffffffff)
>> #define i915_execbuffer2_set_context_id(eb2, context) \
>>
>
> Regards,
>
> Tvrtko
>