[Intel-gfx] [PATCH 06/10] drm/i915: Add extra add_request calls

Wed Dec 10 02:55:44 PST 2014

On Tue, Dec 09, 2014 at 12:59:09PM +0000, John.C.Harrison at Intel.com wrote:
> From: John Harrison <John.C.Harrison at Intel.com>
> 
> The scheduler needs to track batch buffers by request without extra, non-batch
> buffer work being attached to the same request. This means that anywhere which
> adds work to the ring should explicitly call i915_add_request() when it has
> finished writing to the ring.
> 
> The add_request() function does extra work, such as flushing caches, that does
> not necessarily want to be done everywhere. Instead, a new
> i915_add_request_no_flush() function has been added which skips the cache flush
> and just tidies up the request structure.
> 
> Note, much of this patch was implemented by Naresh Kumar Kachhi for pending
> power management improvements. However, it is also directly applicable to the
> scheduler work as noted above.
> 
> Change-Id: I66a6861118ee8e7ad7ca6c80c71a3b256db92e18
> For: VIZ-1587
> Signed-off-by: John Harrison <John.C.Harrison at Intel.com>

For easier review I think it'd be better to split this into the prep patch
to rework add_request and then patches to roll it out. Some of the places
you're adding an add_request to look questionable to me. More comments
below.

After reading through the patch (see below for comments) I think there are
only two places where we need to add requests:
- at the end of pageflips, unconditionally (to catch any semaphores
  already emitted if something fails)
- after ring init in init_hw (but perhaps with some of the restructuring
  in-flight applied to make it less messy).

Then a WARN_ON in gpu_idle to make sure no request ever escaped. Should
all be separate patches I think (so that we have a nice empty space in the
commit message for explanation why this works).

This patch here just sprinkles lots of add_requests all over, which in my
experience tends to blow up sooner or later since it's much harder to
understand and leaves no room for selfchecks in the code to catch bugs.

Cheers, Daniel

> ---
>  drivers/gpu/drm/i915/i915_drv.h              |    9 ++++--
>  drivers/gpu/drm/i915/i915_gem.c              |   45 ++++++++++++++++----------
>  drivers/gpu/drm/i915/i915_gem_context.c      |    9 ++++++
>  drivers/gpu/drm/i915/i915_gem_execbuffer.c   |    4 +--
>  drivers/gpu/drm/i915/i915_gem_gtt.c          |    9 ++++++
>  drivers/gpu/drm/i915/i915_gem_render_state.c |    2 +-
>  drivers/gpu/drm/i915/intel_display.c         |   23 +++++++++----
>  drivers/gpu/drm/i915/intel_lrc.c             |    4 +--
>  8 files changed, 73 insertions(+), 32 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index 11e85cb..0e280c4 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -2553,7 +2553,7 @@ static inline void i915_gem_object_unpin_pages(struct drm_i915_gem_object *obj)
>  
>  int __must_check i915_mutex_lock_interruptible(struct drm_device *dev);
>  int i915_gem_object_sync(struct drm_i915_gem_object *obj,
> -			 struct intel_engine_cs *to);
> +			 struct intel_engine_cs *to, bool add_request);
>  void i915_vma_move_to_active(struct i915_vma *vma,
>  			     struct intel_engine_cs *ring);
>  int i915_gem_dumb_create(struct drm_file *file_priv,
> @@ -2641,9 +2641,12 @@ int __must_check i915_gpu_idle(struct drm_device *dev);
>  int __must_check i915_gem_suspend(struct drm_device *dev);
>  int __i915_add_request(struct intel_engine_cs *ring,
>  		       struct drm_file *file,
> -		       struct drm_i915_gem_object *batch_obj);
> +		       struct drm_i915_gem_object *batch_obj,
> +		       bool flush_caches);
>  #define i915_add_request(ring) \
> -	__i915_add_request(ring, NULL, NULL)
> +	__i915_add_request(ring, NULL, NULL, true)
> +#define i915_add_request_no_flush(ring) \
> +	__i915_add_request(ring, NULL, NULL, false)
>  int __i915_wait_request(struct drm_i915_gem_request *req,
>  			unsigned reset_counter,
>  			bool interruptible,
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index de241eb..b022a2d 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -2419,7 +2419,8 @@ i915_gem_get_seqno(struct drm_device *dev, u32 *seqno)
>  
>  int __i915_add_request(struct intel_engine_cs *ring,
>  		       struct drm_file *file,
> -		       struct drm_i915_gem_object *obj)
> +		       struct drm_i915_gem_object *obj,
> +		       bool flush_caches)
>  {
>  	struct drm_i915_private *dev_priv = ring->dev->dev_private;
>  	struct drm_i915_gem_request *request;
> @@ -2445,12 +2446,11 @@ int __i915_add_request(struct intel_engine_cs *ring,
>  	 * is that the flush _must_ happen before the next request, no matter
>  	 * what.
>  	 */
> -	if (i915.enable_execlists) {
> -		ret = logical_ring_flush_all_caches(ringbuf);
> -		if (ret)
> -			return ret;
> -	} else {
> -		ret = intel_ring_flush_all_caches(ring);
> +	if (flush_caches) {
> +		if (i915.enable_execlists)
> +			ret = logical_ring_flush_all_caches(ringbuf);
> +		else
> +			ret = intel_ring_flush_all_caches(ring);
>  		if (ret)
>  			return ret;
>  	}
> @@ -2462,15 +2462,12 @@ int __i915_add_request(struct intel_engine_cs *ring,
>  	 */
>  	request_ring_position = intel_ring_get_tail(ringbuf);
>  
> -	if (i915.enable_execlists) {
> +	if (i915.enable_execlists)
>  		ret = ring->emit_request(ringbuf);
> -		if (ret)
> -			return ret;
> -	} else {
> +	else
>  		ret = ring->add_request(ring);
> -		if (ret)
> -			return ret;
> -	}
> +	if (ret)
> +		return ret;
>  
>  	request->head = request_start;
>  	request->tail = request_ring_position;
> @@ -2974,6 +2971,8 @@ out:
>   *
>   * @obj: object which may be in use on another ring.
>   * @to: ring we wish to use the object on. May be NULL.
> + * @add_request: do we need to add a request to track operations
> + *    submitted on ring with sync_to function
>   *
>   * This code is meant to abstract object synchronization with the GPU.
>   * Calling with NULL implies synchronizing the object with the CPU
> @@ -2983,7 +2982,7 @@ out:
>   */
>  int
>  i915_gem_object_sync(struct drm_i915_gem_object *obj,
> -		     struct intel_engine_cs *to)
> +		     struct intel_engine_cs *to, bool add_request)
>  {
>  	struct intel_engine_cs *from;
>  	u32 seqno;
> @@ -3011,13 +3010,16 @@ i915_gem_object_sync(struct drm_i915_gem_object *obj,
>  
>  	trace_i915_gem_ring_sync_to(from, to, obj->last_read_req);
>  	ret = to->semaphore.sync_to(to, from, seqno);
> -	if (!ret)
> +	if (!ret) {
>  		/* We use last_read_req because sync_to()
>  		 * might have just caused seqno wrap under
>  		 * the radar.
>  		 */
>  		from->semaphore.sync_seqno[idx] =
>  				i915_gem_request_get_seqno(obj->last_read_req);
> +		if (add_request)
> +			i915_add_request_no_flush(to);
> +	}

Is the scheduler really still using hw semaphores for inter-ring sync?
I thought all the dependency tracking gets pushed into software? And even
then I expect that we'll insert semaphores really late and should treat
them as part of the subsequent batch or pageflip.

>  
>  	return ret;
>  }
> @@ -3126,6 +3128,15 @@ int i915_gpu_idle(struct drm_device *dev)
>  				return ret;
>  		}
>  
> +		/* Make sure the context switch (if one actually happened)
> +		 * gets wrapped up and finished rather than hanging around
> +		 * and confusing things later. */
> +		if (ring->outstanding_lazy_request) {
> +			ret = i915_add_request(ring);
> +			if (ret)
> +				return ret;
> +		}

I expect bad interactions with the ilk iommu w/a (see do_idling in
i915_gem_gtt.c) due to the recursion this will cause in the execbuf code.

Except for this case we shouldn't ever leak a request with your code
anywhere any more, so I think instead this should be a WARN_ON (ofc only
when do_idle_maps is false).

> +
>  		ret = intel_ring_idle(ring);
>  		if (ret)
>  			return ret;
> @@ -3946,7 +3957,7 @@ i915_gem_object_pin_to_display_plane(struct drm_i915_gem_object *obj,
>  	int ret;
>  
>  	if (pipelined != i915_gem_request_get_ring(obj->last_read_req)) {
> -		ret = i915_gem_object_sync(obj, pipelined);
> +		ret = i915_gem_object_sync(obj, pipelined, true);

Async flips should be caught by the flip requests added below. Essentially
same comment as above with object_sync.

>  		if (ret)
>  			return ret;
>  	}
> diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
> index 5cd2b97..b2f1296 100644
> --- a/drivers/gpu/drm/i915/i915_gem_context.c
> +++ b/drivers/gpu/drm/i915/i915_gem_context.c
> @@ -426,6 +426,15 @@ int i915_gem_context_enable(struct drm_i915_private *dev_priv)
>  			ret = i915_switch_context(ring, ring->default_context);
>  			if (ret)
>  				return ret;
> +
> +			/* Make sure the context switch (if one actually happened)
> +			 * gets wrapped up and finished rather than hanging around
> +			 * and confusing things later. */
> +			if (ring->outstanding_lazy_request) {
> +				ret = i915_add_request_no_flush(ring);

We have a lot more random stuff all over when initializing rings (pde
loads, l3c remapping, context, golden context, w/a). Imo we should have
one central add request for all of them. We have the recently reworked ring
init plus some of the patches from Chris (to preload the ring before
enabling/submitting the ring) that should be fairly straightforward to
implement.

> +				if (ret)
> +					return ret;
> +			}
>  		}
>  
>  	return 0;
> diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> index aa4566e..1268e89 100644
> --- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> +++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> @@ -832,7 +832,7 @@ i915_gem_execbuffer_move_to_gpu(struct intel_engine_cs *ring,
>  
>  	list_for_each_entry(vma, vmas, exec_list) {
>  		struct drm_i915_gem_object *obj = vma->obj;
> -		ret = i915_gem_object_sync(obj, ring);
> +		ret = i915_gem_object_sync(obj, ring, false);
>  		if (ret)
>  			return ret;
>  
> @@ -993,7 +993,7 @@ i915_gem_execbuffer_retire_commands(struct drm_device *dev,
>  	ring->gpu_caches_dirty = true;
>  
>  	/* Add a breadcrumb for the completion of the batch buffer */
> -	(void)__i915_add_request(ring, file, obj);
> +	(void)__i915_add_request(ring, file, obj, true);
>  }
>  
>  static int
> diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c
> index ac03a38..7eead93 100644
> --- a/drivers/gpu/drm/i915/i915_gem_gtt.c
> +++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
> @@ -1153,6 +1153,15 @@ int i915_ppgtt_init_hw(struct drm_device *dev)
>  			ret = ppgtt->switch_mm(ppgtt, ring);
>  			if (ret != 0)
>  				return ret;
> +
> +			/* Make sure the context switch (if one actually happened)
> +			 * gets wrapped up and finished rather than hanging around
> +			 * and confusing things later. */
> +			if (ring->outstanding_lazy_request) {
> +				ret = i915_add_request_no_flush(ring);
> +				if (ret)
> +					return ret;
> +			}
>  		}
>  	}
>  
> diff --git a/drivers/gpu/drm/i915/i915_gem_render_state.c b/drivers/gpu/drm/i915/i915_gem_render_state.c
> index 521548a..aba39c3 100644
> --- a/drivers/gpu/drm/i915/i915_gem_render_state.c
> +++ b/drivers/gpu/drm/i915/i915_gem_render_state.c
> @@ -173,7 +173,7 @@ int i915_gem_render_state_init(struct intel_engine_cs *ring)
>  
>  	i915_vma_move_to_active(i915_gem_obj_to_ggtt(so.obj), ring);
>  
> -	ret = __i915_add_request(ring, NULL, so.obj);
> +	ret = __i915_add_request(ring, NULL, so.obj, true);
>  	/* __i915_add_request moves object to inactive if it fails */
>  out:
>  	i915_gem_render_state_fini(&so);
> diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c
> index a9bdc12..5b8d4f9 100644
> --- a/drivers/gpu/drm/i915/intel_display.c
> +++ b/drivers/gpu/drm/i915/intel_display.c
> @@ -9116,7 +9116,7 @@ static int intel_gen2_queue_flip(struct drm_device *dev,
>  	intel_ring_emit(ring, 0); /* aux display base address, unused */
>  
>  	intel_mark_page_flip_active(intel_crtc);
> -	__intel_ring_advance(ring);
> +	i915_add_request_no_flush(ring);
>  	return 0;
>  }
>  
> @@ -9148,7 +9148,7 @@ static int intel_gen3_queue_flip(struct drm_device *dev,
>  	intel_ring_emit(ring, MI_NOOP);
>  
>  	intel_mark_page_flip_active(intel_crtc);
> -	__intel_ring_advance(ring);
> +	i915_add_request_no_flush(ring);
>  	return 0;
>  }
>  
> @@ -9187,7 +9187,7 @@ static int intel_gen4_queue_flip(struct drm_device *dev,
>  	intel_ring_emit(ring, pf | pipesrc);
>  
>  	intel_mark_page_flip_active(intel_crtc);
> -	__intel_ring_advance(ring);
> +	i915_add_request_no_flush(ring);
>  	return 0;
>  }
>  
> @@ -9223,7 +9223,7 @@ static int intel_gen6_queue_flip(struct drm_device *dev,
>  	intel_ring_emit(ring, pf | pipesrc);
>  
>  	intel_mark_page_flip_active(intel_crtc);
> -	__intel_ring_advance(ring);
> +	i915_add_request_no_flush(ring);
>  	return 0;
>  }
>  
> @@ -9318,7 +9318,7 @@ static int intel_gen7_queue_flip(struct drm_device *dev,
>  	intel_ring_emit(ring, (MI_NOOP));
>  
>  	intel_mark_page_flip_active(intel_crtc);
> -	__intel_ring_advance(ring);
> +	i915_add_request_no_flush(ring);
>  	return 0;
>  }
>  
> @@ -9528,7 +9528,7 @@ static int intel_gen9_queue_flip(struct drm_device *dev,
>  	intel_ring_emit(ring, intel_crtc->unpin_work->gtt_offset);
>  
>  	intel_mark_page_flip_active(intel_crtc);
> -	__intel_ring_advance(ring);
> +	i915_add_request_no_flush(ring);
>  
>  	return 0;
>  }
> @@ -9734,8 +9734,17 @@ static int intel_crtc_page_flip(struct drm_crtc *crtc,
>  		if (ret)
>  			goto cleanup_unpin;
>  
> +		/* Borked: need to get the seqno for the request submitted in
> +		 * 'queue_flip()' above. However, either the request has been
> +		 * posted already and the seqno is gone (q_f calls add_request),
> +		 * or the request never gets posted and is merged into whatever
> +		 * render comes along next (q_f calls ring_advance).
> +		 *
> +		 * On the other hand, seqnos are going away soon anyway! So
> +		 * hopefully the problem will disappear...
> +		 */
>  		i915_gem_request_assign(&work->flip_queued_req,
> -					intel_ring_get_request(ring));
> +					ring->outstanding_lazy_request ? intel_ring_get_request(ring) : NULL);
>  	}
>  
>  	work->flip_queued_vblank = drm_vblank_count(dev, intel_crtc->pipe);
> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> index 82a47e3..643a56a 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/intel_lrc.c
> @@ -617,7 +617,7 @@ static int execlists_move_to_gpu(struct intel_ringbuffer *ringbuf,
>  	list_for_each_entry(vma, vmas, exec_list) {
>  		struct drm_i915_gem_object *obj = vma->obj;
>  
> -		ret = i915_gem_object_sync(obj, ring);
> +		ret = i915_gem_object_sync(obj, ring, true);
>  		if (ret)
>  			return ret;
>  
> @@ -1630,7 +1630,7 @@ int intel_lr_context_render_state_init(struct intel_engine_cs *ring,
>  
>  	i915_vma_move_to_active(i915_gem_obj_to_ggtt(so.obj), ring);
>  
> -	ret = __i915_add_request(ring, file, so.obj);
> +	ret = __i915_add_request(ring, file, so.obj, true);
>  	/* intel_logical_ring_add_request moves object to inactive if it
>  	 * fails */
>  out:
> -- 
> 1.7.9.5
> 
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/intel-gfx

-- 
Daniel Vetter
Software Engineer, Intel Corporation
+41 (0) 79 365 57 48 - http://blog.ffwll.ch