[Intel-gfx] [PATCH] drm/i915: Hide one invalid cancellation bug in i915_switch_context()

Mon Dec 21 04:59:01 PST 2015

On Mon, Dec 21, 2015 at 01:28:17PM +0100, Daniel Vetter wrote:
> On Thu, Dec 17, 2015 at 06:18:05PM +0000, Chris Wilson wrote:
> > As we add the VMA to the request early, it may be cancelled during
> > execbuf reservation. This will leave the context object pointing to a
> > dangling request; i915_wait_request() simply skips the wait and so we
> > may unbind the object whilst it is still active.
> > 
> > We can partially prevent such atrocity by doing the RCS context
> > initialisation earlier. This ensures that one callsite from blowing up
> > (and for igt this is a frequent culprit due to how the stressful batches
> > are submitted).
> > 
> > Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
> > Cc: Daniel Vetter <daniel.vetter at ffwll.ch>
> > Cc: stable at vger.kernel.org
> > ---
> >  drivers/gpu/drm/i915/i915_gem_context.c | 29 +++++++++++++++++------------
> >  1 file changed, 17 insertions(+), 12 deletions(-)
> > 
> > diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
> > index 900ffd044db8..657686e6492f 100644
> > --- a/drivers/gpu/drm/i915/i915_gem_context.c
> > +++ b/drivers/gpu/drm/i915/i915_gem_context.c
> > @@ -657,7 +657,6 @@ static int do_switch(struct drm_i915_gem_request *req)
> >  	struct drm_i915_private *dev_priv = ring->dev->dev_private;
> >  	struct intel_context *from = ring->last_context;
> >  	u32 hw_flags = 0;
> > -	bool uninitialized = false;
> >  	int ret, i;
> >  
> >  	if (from != NULL && ring == &dev_priv->ring[RCS]) {
> > @@ -764,6 +763,15 @@ static int do_switch(struct drm_i915_gem_request *req)
> >  			to->remap_slice &= ~(1<<i);
> >  	}
> >  
> > +	if (!to->legacy_hw_ctx.initialized) {
> > +		if (ring->init_context) {
> > +			ret = ring->init_context(req);
> > +			if (ret)
> > +				goto unpin_out;
> > +		}
> > +		to->legacy_hw_ctx.initialized = true;
> > +	}
> > +
> >  	/* The backing object for the context is done after switching to the
> >  	 * *next* context. Therefore we cannot retire the previous context until
> >  	 * the next context has already started running. In fact, the below code
> > @@ -772,6 +780,14 @@ static int do_switch(struct drm_i915_gem_request *req)
> >  	 */
> >  	if (from != NULL) {
> >  		from->legacy_hw_ctx.rcs_state->base.read_domains = I915_GEM_DOMAIN_INSTRUCTION;
> > +		/* XXX Note very well this is dangerous!
> > +		 * We are pinning this object using this request as our
> > +		 * active reference. However, this request may yet be cancelled
> > +		 * during the execbuf dispatch, leaving us waiting on a
> > +		 * dangling request. Waiting upon this dangling request is
> > +		 * ignored, which means that we may unbind the context whilst
> > +		 * the GPU is still writing to the backing storage.
> > +		 */
> >  		i915_vma_move_to_active(i915_gem_obj_to_ggtt(from->legacy_hw_ctx.rcs_state), req);
> 
> Hm, since this is just a partial fix, what about instead marking any
> request that has been put to use already in move_to_active. And then when
> we try to cancel it from execbuf notice that and not cancel it? Fixes both
> bugs and makes the entire thing a bit more robust since it allows us to
> throw stuff at a request without ordering constraints.

Hmm, it leaks a bit furter than execbuffer. For example, we need to
submit the request to maintain our state tracking correctly for the
semaphores and whatnot for incomplete CS flips.

>From inspection, we need:

diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 730a6d2f5163..c33dd6f59064 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -2708,12 +2708,9 @@ int i915_gpu_idle(struct drm_device *dev)
                                return PTR_ERR(req);
 
                        ret = i915_switch_context(req);
-                       if (ret) {
-                               i915_gem_request_cancel(req);
-                               return ret;
-                       }
-
                        i915_add_request_no_flush(req);
+                       if (ret)
+                               return ret;
                }
 
                ret = intel_engine_idle(ring);
diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index be2302f8a0cf..2496dc0948e1 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -1306,7 +1306,6 @@ execbuf_submit(struct i915_execbuffer_params *params,
        trace_i915_gem_ring_dispatch(params->request, params->dispatch_flags);
 
        i915_gem_execbuffer_move_to_active(vmas, params->request);
-       i915_gem_execbuffer_retire_commands(params);
 
        return 0;
 }
@@ -1603,8 +1602,10 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data,
        }
 
        ret = i915_gem_request_add_to_client(params->request, file);
-       if (ret)
-               goto err_request;
+       if (ret) {
+               i915_gem_request_cancel(params->request);
+               goto err_batch_unpin;
+       }
 
        /*
         * Save assorted stuff away to pass through to *_submission().
@@ -1620,15 +1621,7 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data,
        params->ctx                     = ctx;
 
        ret = execbuf_submit(params, args, &eb->vmas);
-
-err_request:
-       /*
-        * If the request was created but not successfully submitted then it
-        * must be freed again. If it was submitted then it is being tracked
-        * on the active request list and no clean up is required here.
-        */
-       if (ret)
-               i915_gem_request_cancel(params->request);
+       i915_gem_execbuffer_retire_commands(params);
 
 err_batch_unpin:
        /*
diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c
index f73c06387ac3..c01765dea0c9 100644
--- a/drivers/gpu/drm/i915/intel_display.c
+++ b/drivers/gpu/drm/i915/intel_display.c
@@ -11671,7 +11671,7 @@ cleanup_unpin:
        intel_unpin_fb_obj(fb, crtc->primary->state);
 cleanup_request:
        if (request)
-               i915_gem_request_cancel(request);
+               i915_add_request_no_flush(request);
 cleanup_pending:
        atomic_dec(&intel_crtc->unpin_work_count);
        mutex_unlock(&dev->struct_mutex);

And then everytime we need to consider, can I actually cancel this request?
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre