[Intel-gfx] [RFC 5/9] drm/i915: Add per context timelines to fence object
John Harrison
John.C.Harrison at Intel.com
Wed Oct 28 05:59:31 PDT 2015
Have finally had some time to come back to this and respond
to/incorporate the comments made some while ago...
On 23/07/2015 14:50, Tvrtko Ursulin wrote:
> Hi,
>
> On 07/17/2015 03:31 PM, John.C.Harrison at Intel.com wrote:
>> From: John Harrison <John.C.Harrison at Intel.com>
>>
>> The fence object used inside the request structure requires a
>> sequence number.
>> Although this is not used by the i915 driver itself, it could
>> potentially be
>> used by non-i915 code if the fence is passed outside of the driver.
>> This is the
>> intention as it allows external kernel drivers and user applications
>> to wait on
>> batch buffer completion asynchronously via the dma-buf fence API.
>>
>> To ensure that such external users are not confused by strange things
>> happening
>> with the seqno, this patch adds in a per context timeline that can
>> provide a
>> guaranteed in-order seqno value for the fence. This is safe because the
>> scheduler will not re-order batch buffers within a context - they are
>> considered
>> to be mutually dependent.
>>
>> [new patch in series]
>>
>> For: VIZ-5190
>> Signed-off-by: John Harrison <John.C.Harrison at Intel.com>
>> ---
>> drivers/gpu/drm/i915/i915_drv.h | 25 ++++++++----
>> drivers/gpu/drm/i915/i915_gem.c | 69
>> ++++++++++++++++++++++++++++++---
>> drivers/gpu/drm/i915/i915_gem_context.c | 15 ++++++-
>> drivers/gpu/drm/i915/intel_lrc.c | 8 ++++
>> drivers/gpu/drm/i915/intel_ringbuffer.h | 1 -
>> 5 files changed, 103 insertions(+), 15 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/i915/i915_drv.h
>> b/drivers/gpu/drm/i915/i915_drv.h
>> index 0c7df46..88a4746 100644
>> --- a/drivers/gpu/drm/i915/i915_drv.h
>> +++ b/drivers/gpu/drm/i915/i915_drv.h
>> @@ -840,6 +840,15 @@ struct i915_ctx_hang_stats {
>> bool banned;
>> };
>>
>> +struct i915_fence_timeline {
>> + unsigned fence_context;
>> + uint32_t context;
>
> Unused field?
>
>> + uint32_t next;
>
> fence.h defines seqnos as 'unsigned', which matches this in practice,
> but maybe it would be nicer to use the same type name.
>
>> +
>> + struct intel_context *ctx;
>> + struct intel_engine_cs *ring;
>> +};
>> +
>> /* This must match up with the value previously used for
>> execbuf2.rsvd1. */
>> #define DEFAULT_CONTEXT_HANDLE 0
>>
>> @@ -885,6 +894,7 @@ struct intel_context {
>> struct drm_i915_gem_object *state;
>> struct intel_ringbuffer *ringbuf;
>> int pin_count;
>> + struct i915_fence_timeline fence_timeline;
>> } engine[I915_NUM_RINGS];
>>
>> struct list_head link;
>> @@ -2153,13 +2163,10 @@ void i915_gem_track_fb(struct
>> drm_i915_gem_object *old,
>> struct drm_i915_gem_request {
>> /**
>> * Underlying object for implementing the signal/wait stuff.
>> - * NB: Never call fence_later() or return this fence object to user
>> - * land! Due to lazy allocation, scheduler re-ordering,
>> pre-emption,
>> - * etc., there is no guarantee at all about the validity or
>> - * sequentiality of the fence's seqno! It is also unsafe to let
>> - * anything outside of the i915 driver get hold of the fence object
>> - * as the clean up when decrementing the reference count requires
>> - * holding the driver mutex lock.
>> + * NB: Never return this fence object to user land! It is unsafe to
>> + * let anything outside of the i915 driver get hold of the fence
>> + * object as the clean up when decrementing the reference count
>> + * requires holding the driver mutex lock.
>> */
>> struct fence fence;
>>
>> @@ -2239,6 +2246,10 @@ int i915_gem_request_alloc(struct
>> intel_engine_cs *ring,
>> struct drm_i915_gem_request **req_out);
>> void i915_gem_request_cancel(struct drm_i915_gem_request *req);
>>
>> +int i915_create_fence_timeline(struct drm_device *dev,
>> + struct intel_context *ctx,
>> + struct intel_engine_cs *ring);
>> +
>> static inline bool i915_gem_request_completed(struct
>> drm_i915_gem_request *req)
>> {
>> return fence_is_signaled(&req->fence);
>> diff --git a/drivers/gpu/drm/i915/i915_gem.c
>> b/drivers/gpu/drm/i915/i915_gem.c
>> index 3970250..af79716 100644
>> --- a/drivers/gpu/drm/i915/i915_gem.c
>> +++ b/drivers/gpu/drm/i915/i915_gem.c
>> @@ -2671,6 +2671,25 @@ static bool
>> i915_gem_request_is_completed(struct fence *req_fence)
>> return i915_seqno_passed(seqno, req->seqno);
>> }
>>
>> +static void i915_fence_timeline_value_str(struct fence *fence, char
>> *str, int size)
>> +{
>> + struct drm_i915_gem_request *req;
>> +
>> + req = container_of(fence, typeof(*req), fence);
>> +
>> + /* Last signalled timeline value ??? */
>> + snprintf(str, size, "? [%d]"/*, tl->value*/,
>> req->ring->get_seqno(req->ring, true));
>> +}
>
> If timelines are per context now maybe we should update
> i915_gem_request_get_timeline_name to be per context instead of per
> engine as well? Like this we have a name space overlap / seqno
> collisions from userspace point of view.
>
>> +static void i915_fence_value_str(struct fence *fence, char *str, int
>> size)
>> +{
>> + struct drm_i915_gem_request *req;
>> +
>> + req = container_of(fence, typeof(*req), fence);
>> +
>> + snprintf(str, size, "%d [%d]", req->fence.seqno, req->seqno);
>> +}
>> +
>> static const struct fence_ops i915_gem_request_fops = {
>> .get_driver_name = i915_gem_request_get_driver_name,
>> .get_timeline_name = i915_gem_request_get_timeline_name,
>> @@ -2678,8 +2697,48 @@ static const struct fence_ops
>> i915_gem_request_fops = {
>> .signaled = i915_gem_request_is_completed,
>> .wait = fence_default_wait,
>> .release = i915_gem_request_free,
>> + .fence_value_str = i915_fence_value_str,
>> + .timeline_value_str = i915_fence_timeline_value_str,
>> };
>>
>> +int i915_create_fence_timeline(struct drm_device *dev,
>> + struct intel_context *ctx,
>> + struct intel_engine_cs *ring)
>> +{
>> + struct i915_fence_timeline *timeline;
>> +
>> + timeline = &ctx->engine[ring->id].fence_timeline;
>> +
>> + if (timeline->ring)
>> + return 0;
>> +
>> + timeline->fence_context = fence_context_alloc(1);
>> +
>> + /*
>> + * Start the timeline from seqno 0 as this is a special value
>> + * that is reserved for invalid sync points.
>> + */
>> + timeline->next = 1;
>> + timeline->ctx = ctx;
>> + timeline->ring = ring;
>> +
>> + return 0;
>> +}
>> +
>> +static uint32_t i915_fence_timeline_get_next_seqno(struct
>> i915_fence_timeline *timeline)
>> +{
>> + uint32_t seqno;
>> +
>> + seqno = timeline->next;
>> +
>> + /* Reserve zero for invalid */
>> + if (++timeline->next == 0 ) {
>> + timeline->next = 1;
>> + }
>> +
>> + return seqno;
>> +}
>> +
>> int i915_gem_request_alloc(struct intel_engine_cs *ring,
>> struct intel_context *ctx,
>> struct drm_i915_gem_request **req_out)
>> @@ -2715,7 +2774,9 @@ int i915_gem_request_alloc(struct
>> intel_engine_cs *ring,
>> goto err;
>> }
>>
>> - fence_init(&req->fence, &i915_gem_request_fops,
>> &ring->fence_lock, ring->fence_context, req->seqno);
>> + fence_init(&req->fence, &i915_gem_request_fops, &ring->fence_lock,
>> + ctx->engine[ring->id].fence_timeline.fence_context,
>> +
>> i915_fence_timeline_get_next_seqno(&ctx->engine[ring->id].fence_timeline));
>
> I suppose for debugging it could be useful to add this new seqno in
> i915_gem_request_info to have visibility at both sides. To map
> userspace seqnos to driver state.
>
>> /*
>> * Reserve space in the ring buffer for all the commands
>> required to
>> @@ -5065,7 +5126,7 @@ i915_gem_init_hw(struct drm_device *dev)
>> {
>> struct drm_i915_private *dev_priv = dev->dev_private;
>> struct intel_engine_cs *ring;
>> - int ret, i, j, fence_base;
>> + int ret, i, j;
>>
>> if (INTEL_INFO(dev)->gen < 6 && !intel_enable_gtt())
>> return -EIO;
>> @@ -5117,16 +5178,12 @@ i915_gem_init_hw(struct drm_device *dev)
>> goto out;
>> }
>>
>> - fence_base = fence_context_alloc(I915_NUM_RINGS);
>> -
>> /* Now it is safe to go back round and do everything else: */
>> for_each_ring(ring, dev_priv, i) {
>> struct drm_i915_gem_request *req;
>>
>> WARN_ON(!ring->default_context);
>>
>> - ring->fence_context = fence_base + i;
>> -
>> ret = i915_gem_request_alloc(ring, ring->default_context,
>> &req);
>> if (ret) {
>> i915_gem_cleanup_ringbuffer(dev);
>> diff --git a/drivers/gpu/drm/i915/i915_gem_context.c
>> b/drivers/gpu/drm/i915/i915_gem_context.c
>> index b77a8f7..7eb8694 100644
>> --- a/drivers/gpu/drm/i915/i915_gem_context.c
>> +++ b/drivers/gpu/drm/i915/i915_gem_context.c
>> @@ -242,7 +242,7 @@ i915_gem_create_context(struct drm_device *dev,
>> {
>> const bool is_global_default_ctx = file_priv == NULL;
>> struct intel_context *ctx;
>> - int ret = 0;
>> + int i, ret = 0;
>>
>> BUG_ON(!mutex_is_locked(&dev->struct_mutex));
>>
>> @@ -250,6 +250,19 @@ i915_gem_create_context(struct drm_device *dev,
>> if (IS_ERR(ctx))
>> return ctx;
>>
>> + if (!i915.enable_execlists) {
>> + struct intel_engine_cs *ring;
>> +
>> + /* Create a per context timeline for fences */
>> + for_each_ring(ring, to_i915(dev), i) {
>> + ret = i915_create_fence_timeline(dev, ctx, ring);
>> + if (ret) {
>> + DRM_ERROR("Fence timeline creation failed for legacy
>> %s: %p\n", ring->name, ctx);
>> + goto err_destroy;
>> + }
>> + }
>> + }
>> +
>> if (is_global_default_ctx && ctx->legacy_hw_ctx.rcs_state) {
>> /* We may need to do things with the shrinker which
>> * require us to immediately switch back to the default
>> diff --git a/drivers/gpu/drm/i915/intel_lrc.c
>> b/drivers/gpu/drm/i915/intel_lrc.c
>> index ee4aecd..8f255de 100644
>> --- a/drivers/gpu/drm/i915/intel_lrc.c
>> +++ b/drivers/gpu/drm/i915/intel_lrc.c
>> @@ -2376,6 +2376,14 @@ int intel_lr_context_deferred_create(struct
>> intel_context *ctx,
>> goto error;
>> }
>>
>> + /* Create a per context timeline for fences */
>> + ret = i915_create_fence_timeline(dev, ctx, ring);
>> + if (ret) {
>> + DRM_ERROR("Fence timeline creation failed for ring %s, ctx
>> %p\n",
>> + ring->name, ctx);
>> + goto error;
>> + }
>> +
>
> We must be 100% sure userspace cannot provoke context creation failure
> by accident or deliberately. Otherwise we would leak fence contexts
> until overflow which would be bad.
>
> Perhaps matching fence_context_release for existing
> fence_context_alloc should be added?
Note that there is no fence_context_release. The fence_context_alloc
code is simply 'return static_count++;'. There is no overflow checking.
There is no anti-reuse checking. When roughly four billion (2^32) contexts have been allocated,
the old ones will get re-allocated and if they are still in use then
tough. It's a really bad API! On the other hand, the context is not
actually used for anything. So it doesn't really matter.
>
> Regards,
>
> Tvrtko
More information about the Intel-gfx
mailing list