[Intel-gfx] [PATCH v3] drm/i915: Replaced Blitter ring based flips with MMIO flips for VLV

Fri May 9 13:59:42 CEST 2014

On Sun, Mar 23, 2014 at 02:31:05PM +0530, sourab.gupta at intel.com wrote:
> From: Sourab Gupta <sourab.gupta at intel.com>
> 
> Using MMIO based flips on VLV for Media power well residency optimization.
> The blitter ring is currently being used just for command streamer based
> flip calls. For pure 3D workloads, with MMIO flips, there will be no use
> of blitter ring and this will ensure the 100% residency for Media well.

Sorry for dragging my feet with reviewing this. I'm hoping this is the
latest version...

> 
> v2: The MMIO flips now use the interrupt driven mechanism for issuing the
> flips when target seqno is reached. (Incorporating Ville's idea)
> 
> v3: Rebasing on latest code. Code restructuring after incorporating
> Damien's comments
> 
> Signed-off-by: Sourab Gupta <sourab.gupta at intel.com>
> Signed-off-by: Akash Goel <akash.goel at intel.com>
> ---
>  drivers/gpu/drm/i915/i915_dma.c      |   1 +
>  drivers/gpu/drm/i915/i915_drv.h      |   7 ++
>  drivers/gpu/drm/i915/i915_irq.c      |   2 +
>  drivers/gpu/drm/i915/intel_display.c | 124 +++++++++++++++++++++++++++++++++++
>  drivers/gpu/drm/i915/intel_drv.h     |   7 ++
>  5 files changed, 141 insertions(+)
> 
> diff --git a/drivers/gpu/drm/i915/i915_dma.c b/drivers/gpu/drm/i915/i915_dma.c
> index 4e0a26a..bca3c5a 100644
> --- a/drivers/gpu/drm/i915/i915_dma.c
> +++ b/drivers/gpu/drm/i915/i915_dma.c
> @@ -1569,6 +1569,7 @@ int i915_driver_load(struct drm_device *dev, unsigned long flags)
>  	spin_lock_init(&dev_priv->backlight_lock);
>  	spin_lock_init(&dev_priv->uncore.lock);
>  	spin_lock_init(&dev_priv->mm.object_stat_lock);
> +	spin_lock_init(&dev_priv->mmio_flip_lock);
>  	mutex_init(&dev_priv->dpio_lock);
>  	mutex_init(&dev_priv->modeset_restore_lock);
>  
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index 3f62be0..678d31d 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -1621,6 +1621,10 @@ typedef struct drm_i915_private {
>  	struct i915_dri1_state dri1;
>  	/* Old ums support infrastructure, same warning applies. */
>  	struct i915_ums_state ums;
> +
> +	/* protects the mmio flip data */
> +	spinlock_t mmio_flip_lock;
> +
>  } drm_i915_private_t;
>  
>  static inline struct drm_i915_private *to_i915(const struct drm_device *dev)
> @@ -2657,6 +2661,9 @@ int i915_reg_read_ioctl(struct drm_device *dev, void *data,
>  int i915_get_reset_stats_ioctl(struct drm_device *dev, void *data,
>  			       struct drm_file *file);
>  
> +void intel_notify_mmio_flip(struct drm_device *dev,
> +			struct intel_ring_buffer *ring);
> +
>  /* overlay */
>  extern struct intel_overlay_error_state *intel_overlay_capture_error_state(struct drm_device *dev);
>  extern void intel_overlay_print_error_state(struct drm_i915_error_state_buf *e,
> diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
> index 4b4aeb3..ad26abe 100644
> --- a/drivers/gpu/drm/i915/i915_irq.c
> +++ b/drivers/gpu/drm/i915/i915_irq.c
> @@ -1080,6 +1080,8 @@ static void notify_ring(struct drm_device *dev,
>  
>  	trace_i915_gem_request_complete(ring);
>  
> +	intel_notify_mmio_flip(dev, ring);
> +
>  	wake_up_all(&ring->irq_queue);
>  	i915_queue_hangcheck(dev);
>  }
> diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c
> index 7e4ea8d..19004bf 100644
> --- a/drivers/gpu/drm/i915/intel_display.c
> +++ b/drivers/gpu/drm/i915/intel_display.c
> @@ -8782,6 +8782,125 @@ err:
>  	return ret;
>  }
>  
> +static int intel_do_mmio_flip(struct drm_device *dev,
> +			struct drm_crtc *crtc)
> +{
> +	struct drm_i915_private *dev_priv = dev->dev_private;
> +	struct intel_crtc *intel_crtc;
> +
> +	intel_crtc = to_intel_crtc(crtc);

nit: could be part of intel_crtc declaration

> +
> +	intel_mark_page_flip_active(intel_crtc);
> +	return dev_priv->display.update_primary_plane(crtc, crtc->fb, 0, 0);

Needs to pass crtc->{x,y} instead of 0,0.

I was a bit worried crtc->fb might be changed already at this point, but
after thinking a bit it should be fine since the presense of unpin_work
will keep intel_crtc_page_flip() from frobbing with it and we always
call intel_crtc_wait_for_pending_flips() before set_base.

Just need to update to use crtc->primary->fb now.

I'm thinking we also have a small race here with a flip done interrupt
from a previous set_base. Probably we need to sort it out using the 
SURFLIVE and/or flip counter like I did for the mmio vs. cs flip
race. But I need to think on this a bit more. Perhaps you want to also
look at those patches a bit.

> +}
> +
> +static bool intel_postpone_flip(struct drm_i915_gem_object *obj)
> +{
> +	int ret;

nit: needs an empty line between declarations and code.

> +	if (!obj->ring)
> +		return false;
> +
> +	if (i915_seqno_passed(obj->ring->get_seqno(obj->ring, false),
> +			      obj->last_write_seqno))

Maybe this should just do a very light check using lazy_coherency?

> +		return false;
> +
> +	if (obj->last_write_seqno == obj->ring->outstanding_lazy_seqno) {
> +		ret = i915_add_request(obj->ring, NULL);
> +		if (WARN_ON(ret))
> +			return false;
> +	}

Looks like i915_gem_check_olr(), so maybe make it non-static and use
here.

> +
> +	if (WARN_ON(!obj->ring->irq_get(obj->ring)))
> +		return false;

Maybe do this before the checking outstanding_lazy_seqno. Hmm, or
perhaps not. It won't actaully eliminate the race entirely so you
still have to do the manual check later. I guess having it in this
order keeps the error paths simpler.

> +
> +	return true;
> +}
> +
> +void intel_notify_mmio_flip(struct drm_device *dev,
> +			struct intel_ring_buffer *ring)
> +{
> +	struct drm_i915_private *dev_priv = dev->dev_private;
> +	struct drm_crtc *crtc;
> +	struct intel_crtc *intel_crtc;
> +	struct intel_mmio_flip *mmio_flip_data;
> +	unsigned long irq_flags;
> +	u32 seqno;
> +	enum pipe pipe;
> +
> +	BUG_ON(!ring);

No need for the BUG. It'll explode on the next line anyway if
ring==NULL.

> +
> +	seqno = ring->get_seqno(ring, false);
> +`
> +	spin_lock_irqsave(&dev_priv->mmio_flip_lock, irq_flags);
> +	for_each_pipe(pipe) {

list_for_each_entry() seems more appropriate since you don't use the
pipe variable for anything else besides looking up the crtc.

You could also move some of the variable declarations inside the loop
since they're not needed on the outside.

> +		crtc = dev_priv->pipe_to_crtc_mapping[pipe];
> +		intel_crtc = to_intel_crtc(crtc);
> +		mmio_flip_data = &intel_crtc->mmio_flip_data;
> +		if ((mmio_flip_data->seqno != 0) &&
> +				(ring->id == mmio_flip_data->ring_id) &&
> +				(seqno >= mmio_flip_data->seqno)) {

Weird indentation and a bit too many parens for my taste.

Should also use i915_seqno_passed() here.

Special casing seqno 0 this way seems safe enough since we apparently
skip seqno 0 always.

> +			intel_do_mmio_flip(dev, crtc);
> +			mmio_flip_data->seqno = 0;
> +			ring->irq_put(ring);
> +		}
> +	}
> +
> +	spin_unlock_irqrestore(&dev_priv->mmio_flip_lock, irq_flags);
> +}
> +
> +/* Using MMIO based flips starting from VLV, for Media power well
> + * residency optimization. The other alternative of having Render
> + * ring based flip calls is not being used, as the performance
> + * (FPS) of certain 3D Apps was getting severly affected.
> + */
> +static int intel_gen7_queue_mmio_flip(struct drm_device *dev,
> +			struct drm_crtc *crtc,
> +			struct drm_framebuffer *fb,
> +			struct drm_i915_gem_object *obj,
> +			uint32_t flags)

There's nothing gen7 specific here. So you could just rename the
function to eg. intel_queue_mmio_flip(). Maybe also move the
comment about VLV to where you set up the function pointer.

> +{
> +	struct drm_i915_private *dev_priv = dev->dev_private;
> +	struct intel_crtc *intel_crtc = to_intel_crtc(crtc);
> +	unsigned long irq_flags;
> +	int ret;
> +
> +	ret = intel_pin_and_fence_fb_obj(dev, obj, obj->ring);
> +	if (ret)
> +		goto err;
> +
> +	switch (intel_crtc->plane) {
> +	case PLANE_A:
> +	case PLANE_B:
> +	case PLANE_C:
> +	break;
> +	default:
> +		WARN_ONCE(1, "unknown plane in flip command\n");
> +		ret = -ENODEV;
> +		goto err_unpin;
> +	}

I think you can drop this plane check. We should hopefully catch such
bugs elsewhere already.

> +
> +	if (!intel_postpone_flip(obj)) {
> +		ret = intel_do_mmio_flip(dev, crtc);
> +		return ret;

Just 'return intel_do_mmio_flip(...'

Actually I think you can just make intel_do_mmio_flip() void since
.update_primary_plane() can't really fail. I think Daniel even has a
patch lined up to make it void as well.

> +	}
> +
> +	spin_lock_irqsave(&dev_priv->mmio_flip_lock, irq_flags);
> +	intel_crtc->mmio_flip_data.seqno = obj->last_write_seqno;
> +	intel_crtc->mmio_flip_data.ring_id = obj->ring->id;
> +	spin_unlock_irqrestore(&dev_priv->mmio_flip_lock, irq_flags);
> +
> +	/* Double check to catch cases where irq fired before
> +	 * mmio flip data was ready
> +	 */
> +	intel_notify_mmio_flip(dev, obj->ring);
> +	return 0;
> +
> +err_unpin:
> +	intel_unpin_fb_obj(obj);
> +err:
> +	return ret;
> +}
> +
>  static int intel_gen7_queue_flip(struct drm_device *dev,
>  				 struct drm_crtc *crtc,
>  				 struct drm_framebuffer *fb,
> @@ -10577,6 +10696,8 @@ static void intel_crtc_init(struct drm_device *dev, int pipe)
>  		DRM_DEBUG_KMS("swapping pipes & planes for FBC\n");
>  		intel_crtc->plane = !pipe;
>  	}
> +	if (IS_VALLEYVIEW(dev))
> +			intel_crtc->mmio_flip_data.seqno = 0;

Not needed. It's kzalloced so everything starts zeroed.

>  
>  	BUG_ON(pipe >= ARRAY_SIZE(dev_priv->plane_to_crtc_mapping) ||
>  	       dev_priv->plane_to_crtc_mapping[intel_crtc->plane] != NULL);
> @@ -11107,6 +11228,9 @@ static void intel_init_display(struct drm_device *dev)
>  	}
>  
>  	intel_panel_init_backlight_funcs(dev);
> +
> +	if (IS_VALLEYVIEW(dev))
> +		dev_priv->display.queue_flip = intel_gen7_queue_mmio_flip;

Would look a bit cleaner to do this before
intel_panel_init_backlight_funcs(). That will keep the .queue_flip
assignments closer together.

I'm also thinking that maybe we want a module parameter to select
between CS vs. mmio flips. It could default to mmio flips on VLV
and CS flips on other platforms if it's generally accepted that mmio
flips are better on VLV. That can at least get us a bit more testing
coverage if people don't need a specific platform to try it out.

On the whole I think it looks fairly good, and it should be fairly easy
to extend it more for nuclear flips.

>  }
>  
>  /*
> diff --git a/drivers/gpu/drm/i915/intel_drv.h b/drivers/gpu/drm/i915/intel_drv.h
> index fa99104..f0b26a1 100644
> --- a/drivers/gpu/drm/i915/intel_drv.h
> +++ b/drivers/gpu/drm/i915/intel_drv.h
> @@ -344,6 +344,11 @@ struct intel_pipe_wm {
>  	bool fbc_wm_enabled;
>  };
>  
> +struct intel_mmio_flip {
> +	u32 seqno;
> +	u32 ring_id;
> +};
> +
>  struct intel_crtc {
>  	struct drm_crtc base;
>  	enum pipe pipe;
> @@ -395,6 +400,8 @@ struct intel_crtc {
>  		/* watermarks currently being used  */
>  		struct intel_pipe_wm active;
>  	} wm;
> +
> +	struct intel_mmio_flip	mmio_flip_data;
>  };
>  
>  struct intel_plane_wm_parameters {
> -- 
> 1.8.5.1

-- 
Ville Syrjälä
Intel OTC