[Intel-gfx] [PATCH 26/53] drm/i915/bdw: New logical ring submission mechanism

Mon Jun 23 15:09:37 CEST 2014

> -----Original Message-----
> From: Volkin, Bradley D
> Sent: Friday, June 20, 2014 10:01 PM
> To: Mateo Lozano, Oscar
> Cc: intel-gfx at lists.freedesktop.org
> Subject: Re: [Intel-gfx] [PATCH 26/53] drm/i915/bdw: New logical ring
> submission mechanism
> 
> On Fri, Jun 13, 2014 at 08:37:44AM -0700, oscar.mateo at intel.com wrote:
> > From: Oscar Mateo <oscar.mateo at intel.com>
> >
> > Well, new-ish: if all this code looks familiar, that's because it's a
> > clone of the existing submission mechanism (with some modifications
> > here and there to adapt it to LRCs and Execlists).
> >
> > And why did we do this? Execlists offer several advantages, like
> > control over when the GPU is done with a given workload, that can help
> > simplify the submission mechanism, no doubt, but I am interested in
> > getting Execlists to work first and foremost. As we are creating a
> > parallel submission mechanism (even if itñś just a clone), we can now
> > start improving it without the fear of breaking old gens.
> >
> > Signed-off-by: Oscar Mateo <oscar.mateo at intel.com>
> > ---
> >  drivers/gpu/drm/i915/intel_lrc.c | 214
> > +++++++++++++++++++++++++++++++++++++++
> >  drivers/gpu/drm/i915/intel_lrc.h |  18 ++++
> >  2 files changed, 232 insertions(+)
> >
> > diff --git a/drivers/gpu/drm/i915/intel_lrc.c
> > b/drivers/gpu/drm/i915/intel_lrc.c
> > index 02fc3d0..89aed7a 100644
> > --- a/drivers/gpu/drm/i915/intel_lrc.c
> > +++ b/drivers/gpu/drm/i915/intel_lrc.c
> > @@ -86,6 +86,220 @@ bool intel_enable_execlists(struct drm_device
> *dev)
> >  	return HAS_LOGICAL_RING_CONTEXTS(dev) && USES_PPGTT(dev);  }
> >
> > +static inline struct intel_ringbuffer * logical_ringbuf_get(struct
> > +intel_engine_cs *ring, struct intel_context *ctx) {
> > +	return ctx->engine[ring->id].ringbuf; }
> > +
> > +void intel_logical_ring_advance_and_submit(struct intel_engine_cs *ring,
> > +					   struct intel_context *ctx)
> > +{
> > +	struct intel_ringbuffer *ringbuf = logical_ringbuf_get(ring, ctx);
> > +
> > +	intel_logical_ring_advance(ringbuf);
> > +
> > +	if (intel_ring_stopped(ring))
> > +		return;
> > +
> > +	ring->submit_ctx(ring, ctx, ringbuf->tail); }
> > +
> > +static int logical_ring_alloc_seqno(struct intel_engine_cs *ring,
> > +				    struct intel_context *ctx)
> > +{
> > +	if (ring->outstanding_lazy_seqno)
> > +		return 0;
> > +
> > +	if (ring->preallocated_lazy_request == NULL) {
> > +		struct drm_i915_gem_request *request;
> > +
> > +		request = kmalloc(sizeof(*request), GFP_KERNEL);
> > +		if (request == NULL)
> > +			return -ENOMEM;
> > +
> > +		ring->preallocated_lazy_request = request;
> > +	}
> > +
> > +	return i915_gem_get_seqno(ring->dev, &ring-
> >outstanding_lazy_seqno);
> > +}
> > +
> > +static int logical_ring_wait_request(struct intel_engine_cs *ring,
> > +				     struct intel_ringbuffer *ringbuf,
> > +				     struct intel_context *ctx,
> > +				     int bytes)
> > +{
> > +	struct drm_i915_gem_request *request;
> > +	u32 seqno = 0;
> > +	int ret;
> > +
> > +	if (ringbuf->last_retired_head != -1) {
> > +		ringbuf->head = ringbuf->last_retired_head;
> > +		ringbuf->last_retired_head = -1;
> > +
> > +		ringbuf->space = intel_ring_space(ringbuf);
> > +		if (ringbuf->space >= bytes)
> > +			return 0;
> > +	}
> > +
> > +	list_for_each_entry(request, &ring->request_list, list) {
> > +		if (__intel_ring_space(request->tail, ringbuf->tail,
> > +				ringbuf->size) >= bytes) {
> > +			seqno = request->seqno;
> > +			break;
> > +		}
> > +	}
> > +
> > +	if (seqno == 0)
> > +		return -ENOSPC;
> > +
> > +	ret = i915_wait_seqno(ring, seqno);
> > +	if (ret)
> > +		return ret;
> > +
> > +	/* TODO: make sure we update the right ringbuffer's
> last_retired_head
> > +	 * when retiring requests */
> > +	i915_gem_retire_requests_ring(ring);
> > +	ringbuf->head = ringbuf->last_retired_head;
> > +	ringbuf->last_retired_head = -1;
> > +
> > +	ringbuf->space = intel_ring_space(ringbuf);
> > +	return 0;
> > +}
> > +
> > +static int logical_ring_wait_for_space(struct intel_engine_cs *ring,
> > +						   struct intel_ringbuffer
> *ringbuf,
> > +						   struct intel_context *ctx,
> > +						   int bytes)
> > +{
> > +	struct drm_device *dev = ring->dev;
> > +	struct drm_i915_private *dev_priv = dev->dev_private;
> > +	unsigned long end;
> > +	int ret;
> > +
> > +	ret = logical_ring_wait_request(ring, ringbuf, ctx, bytes);
> > +	if (ret != -ENOSPC)
> > +		return ret;
> > +
> > +	/* Force the context submission in case we have been skipping it */
> > +	intel_logical_ring_advance_and_submit(ring, ctx);
> > +
> > +	/* With GEM the hangcheck timer should kick us out of the loop,
> > +	 * leaving it early runs the risk of corrupting GEM state (due
> > +	 * to running on almost untested codepaths). But on resume
> > +	 * timers don't work yet, so prevent a complete hang in that
> > +	 * case by choosing an insanely large timeout. */
> > +	end = jiffies + 60 * HZ;
> > +
> 
> In the legacy ringbuffer version, there are tracepoints around the do loop.
> Should we keep those? Or add lrc specific equivalents?
> 
> > +	do {
> > +		ringbuf->head = I915_READ_HEAD(ring);
> > +		ringbuf->space = intel_ring_space(ringbuf);
> > +		if (ringbuf->space >= bytes) {
> > +			ret = 0;
> > +			break;
> > +		}
> > +
> > +		if (!drm_core_check_feature(dev, DRIVER_MODESET) &&
> > +		    dev->primary->master) {
> > +			struct drm_i915_master_private *master_priv = dev-
> >primary->master->driver_priv;
> > +			if (master_priv->sarea_priv)
> > +				master_priv->sarea_priv->perf_boxes |=
> I915_BOX_WAIT;
> > +		}
> > +
> > +		msleep(1);
> > +
> > +		if (dev_priv->mm.interruptible && signal_pending(current)) {
> > +			ret = -ERESTARTSYS;
> > +			break;
> > +		}
> > +
> > +		ret = i915_gem_check_wedge(&dev_priv->gpu_error,
> > +					   dev_priv->mm.interruptible);
> > +		if (ret)
> > +			break;
> > +
> > +		if (time_after(jiffies, end)) {
> > +			ret = -EBUSY;
> > +			break;
> > +		}
> > +	} while (1);
> > +
> > +	return ret;
> > +}
> > +
> > +static int logical_ring_wrap_buffer(struct intel_engine_cs *ring,
> > +						struct intel_ringbuffer
> *ringbuf,
> > +						struct intel_context *ctx)
> > +{
> > +	uint32_t __iomem *virt;
> > +	int rem = ringbuf->size - ringbuf->tail;
> > +
> > +	if (ringbuf->space < rem) {
> > +		int ret = logical_ring_wait_for_space(ring, ringbuf, ctx, rem);
> > +		if (ret)
> > +			return ret;
> > +	}
> > +
> > +	virt = ringbuf->virtual_start + ringbuf->tail;
> > +	rem /= 4;
> > +	while (rem--)
> > +		iowrite32(MI_NOOP, virt++);
> > +
> > +	ringbuf->tail = 0;
> > +	ringbuf->space = intel_ring_space(ringbuf);
> > +
> > +	return 0;
> > +}
> > +
> > +static int logical_ring_prepare(struct intel_engine_cs *ring,
> > +				struct intel_ringbuffer *ringbuf,
> > +				struct intel_context *ctx,
> > +				int bytes)
> > +{
> > +	int ret;
> > +
> > +	if (unlikely(ringbuf->tail + bytes > ringbuf->effective_size)) {
> > +		ret = logical_ring_wrap_buffer(ring, ringbuf, ctx);
> > +		if (unlikely(ret))
> > +			return ret;
> > +	}
> > +
> > +	if (unlikely(ringbuf->space < bytes)) {
> > +		ret = logical_ring_wait_for_space(ring, ringbuf, ctx, bytes);
> > +		if (unlikely(ret))
> > +			return ret;
> > +	}
> > +
> > +	return 0;
> > +}
> > +
> > +int intel_logical_ring_begin(struct intel_engine_cs *ring,
> > +			     struct intel_context *ctx,
> > +			     int num_dwords)
> > +{
> > +	struct drm_i915_private *dev_priv = ring->dev->dev_private;
> > +	struct intel_ringbuffer *ringbuf = logical_ringbuf_get(ring, ctx);
> > +	int ret;
> > +
> > +	ret = i915_gem_check_wedge(&dev_priv->gpu_error,
> > +				   dev_priv->mm.interruptible);
> > +	if (ret)
> > +		return ret;
> > +
> > +	ret = logical_ring_prepare(ring, ringbuf, ctx,
> > +			num_dwords * sizeof(uint32_t));
> > +	if (ret)
> > +		return ret;
> > +
> > +	/* Preallocate the olr before touching the ring */
> > +	ret = logical_ring_alloc_seqno(ring, ctx);
> > +	if (ret)
> > +		return ret;
> > +
> > +	ringbuf->space -= num_dwords * sizeof(uint32_t);
> > +	return 0;
> > +}
> > +
> >  static int gen8_init_common_ring(struct intel_engine_cs *ring)  {
> >  	struct drm_device *dev = ring->dev;
> > diff --git a/drivers/gpu/drm/i915/intel_lrc.h
> > b/drivers/gpu/drm/i915/intel_lrc.h
> > index 26b0949..686ebf5 100644
> > --- a/drivers/gpu/drm/i915/intel_lrc.h
> > +++ b/drivers/gpu/drm/i915/intel_lrc.h
> > @@ -5,6 +5,24 @@
> >  void intel_logical_ring_cleanup(struct intel_engine_cs *ring);  int
> > intel_logical_rings_init(struct drm_device *dev);
> >
> > +void intel_logical_ring_advance_and_submit(struct intel_engine_cs *ring,
> > +					   struct intel_context *ctx);
> > +
> > +static inline void intel_logical_ring_advance(struct intel_ringbuffer
> > +*ringbuf) {
> > +	ringbuf->tail &= ringbuf->size - 1;
> > +}
> > +
> > +static inline void intel_logical_ring_emit(struct intel_ringbuffer
> > +*ringbuf, u32 data) {
> > +	iowrite32(data, ringbuf->virtual_start + ringbuf->tail);
> > +	ringbuf->tail += 4;
> > +}
> > +
> > +int intel_logical_ring_begin(struct intel_engine_cs *ring,
> > +			     struct intel_context *ctx,
> > +			     int num_dwords);
> > +
> 
> I think all of these are only used in intel_lrc.c, so don't need to be in the
> header and could all be static. Right?
> 
> Brad

So far, yes, but that´s only because I artificially made intel_lrc.c self-contained, as Daniel requested. What if we need to execute commands from somewhere else, like in intel_gen7_queue_flip()?

And this takes me to another discussion: this logical ring vs legacy ring split is probably a good idea (time will tell), but we should provide a way of sending commands for execution without knowing if Execlists are enabled or not. In the early series that was easy because we reused the ring_begin, ring_emit & ring_advance functions, but this is not the case anymore. And without this, sooner or later somebody will break legacy or execlists (this already happened last week, when somebody here was implementing native sync without knowing about Execlists).

So, the questions is: how do you feel about a dev_priv.gt vfunc that takes a context, a ring, an array of DWORDS and a BB length and does the intel_(logical)_ring_begin/emit/advance based on i915.enable_execlists?

-- Oscar