[Intel-gfx] [PATCH v4] drm/i915: Replaced Blitter ring based flips with MMIO flips for VLV

Thu May 15 14:27:46 CEST 2014

On Thu, May 15, 2014 at 11:47:37AM +0530, sourab.gupta at intel.com wrote:
> From: Sourab Gupta <sourab.gupta at intel.com>
> 
> Using MMIO based flips on Gen5+ for Media power well residency optimization.
> The blitter ring is currently being used just for command streamer based
> flip calls. For pure 3D workloads, with MMIO flips, there will be no use
> of blitter ring and this will ensure the 100% residency for Media well.
> 
> In a subsequent patch, we can make the selection between CS vs MMIO flip
> based on a module parameter to give more testing coverage.
> 
> v2: The MMIO flips now use the interrupt driven mechanism for issuing the
> flips when target seqno is reached. (Incorporating Ville's idea)
> 
> v3: Rebasing on latest code. Code restructuring after incorporating
> Damien's comments
> 
> v4: Addressing Ville's review comments
>     -general cleanup
>     -updating only base addr instead of calling update_primary_plane
>     -extending patch for gen5+ platforms
> 
> Signed-off-by: Sourab Gupta <sourab.gupta at intel.com>
> Signed-off-by: Akash Goel <akash.goel at intel.com>
> ---
>  drivers/gpu/drm/i915/i915_dma.c      |   1 +
>  drivers/gpu/drm/i915/i915_drv.h      |   6 ++
>  drivers/gpu/drm/i915/i915_gem.c      |   2 +-
>  drivers/gpu/drm/i915/i915_irq.c      |   2 +
>  drivers/gpu/drm/i915/intel_display.c | 115 +++++++++++++++++++++++++++++++++++
>  drivers/gpu/drm/i915/intel_drv.h     |   6 ++
>  6 files changed, 131 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_dma.c b/drivers/gpu/drm/i915/i915_dma.c
> index 46f1dec..672c28f 100644
> --- a/drivers/gpu/drm/i915/i915_dma.c
> +++ b/drivers/gpu/drm/i915/i915_dma.c
> @@ -1570,6 +1570,7 @@ int i915_driver_load(struct drm_device *dev, unsigned long flags)
>  	spin_lock_init(&dev_priv->backlight_lock);
>  	spin_lock_init(&dev_priv->uncore.lock);
>  	spin_lock_init(&dev_priv->mm.object_stat_lock);
> +	spin_lock_init(&dev_priv->mmio_flip_lock);
>  	dev_priv->ring_index = 0;
>  	mutex_init(&dev_priv->dpio_lock);
>  	mutex_init(&dev_priv->modeset_restore_lock);
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index 4006dfe..38c0820 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -1543,6 +1543,8 @@ struct drm_i915_private {
>  	struct i915_ums_state ums;
>  	/* the indicator for dispatch video commands on two BSD rings */
>  	int ring_index;
> +	/* protects the mmio flip data */
> +	spinlock_t mmio_flip_lock;
>  };
>  
>  static inline struct drm_i915_private *to_i915(const struct drm_device *dev)
> @@ -2209,6 +2211,7 @@ bool i915_gem_retire_requests(struct drm_device *dev);
>  void i915_gem_retire_requests_ring(struct intel_ring_buffer *ring);
>  int __must_check i915_gem_check_wedge(struct i915_gpu_error *error,
>  				      bool interruptible);
> +int i915_gem_check_olr(struct intel_ring_buffer *ring, u32 seqno);
>  static inline bool i915_reset_in_progress(struct i915_gpu_error *error)
>  {
>  	return unlikely(atomic_read(&error->reset_counter)
> @@ -2580,6 +2583,9 @@ int i915_reg_read_ioctl(struct drm_device *dev, void *data,
>  int i915_get_reset_stats_ioctl(struct drm_device *dev, void *data,
>  			       struct drm_file *file);
>  
> +void intel_notify_mmio_flip(struct drm_device *dev,
> +			struct intel_ring_buffer *ring);
> +
>  /* overlay */
>  extern struct intel_overlay_error_state *intel_overlay_capture_error_state(struct drm_device *dev);
>  extern void intel_overlay_print_error_state(struct drm_i915_error_state_buf *e,
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index fa5b5ab..5b4e953 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -975,7 +975,7 @@ i915_gem_check_wedge(struct i915_gpu_error *error,
>   * Compare seqno against outstanding lazy request. Emit a request if they are
>   * equal.
>   */
> -static int
> +int
>  i915_gem_check_olr(struct intel_ring_buffer *ring, u32 seqno)
>  {
>  	int ret;
> diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
> index b10fbde..a353693 100644
> --- a/drivers/gpu/drm/i915/i915_irq.c
> +++ b/drivers/gpu/drm/i915/i915_irq.c
> @@ -1084,6 +1084,8 @@ static void notify_ring(struct drm_device *dev,
>  
>  	trace_i915_gem_request_complete(ring);
>  
> +	intel_notify_mmio_flip(dev, ring);
> +

Hmm. How badly is this going to explode with UMS?

>  	wake_up_all(&ring->irq_queue);
>  	i915_queue_hangcheck(dev);
>  }
> diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c
> index 0f8f9bc..9d190c2 100644
> --- a/drivers/gpu/drm/i915/intel_display.c
> +++ b/drivers/gpu/drm/i915/intel_display.c
> @@ -9037,6 +9037,110 @@ err:
>  	return ret;
>  }
>  
> +static void intel_do_mmio_flip(struct drm_device *dev,
> +			struct drm_crtc *crtc)

nit: no need to pass dev and crtc. You can dig out dev_priv from
the crtc in the function.

Passing intel_crtc instead of drm_crtc could also avoid some extra
variables.

> +{
> +	struct drm_i915_private *dev_priv = dev->dev_private;
> +	struct drm_i915_gem_object *obj;
> +	struct intel_framebuffer *intel_fb;
> +	struct intel_crtc *intel_crtc = to_intel_crtc(crtc);
> +	int plane = intel_crtc->plane;
> +
> +	intel_mark_page_flip_active(intel_crtc);
> +
> +	intel_fb = to_intel_framebuffer(crtc->primary->fb);
> +	obj = intel_fb->obj;

nit: could be done as part of the declaration

> +
> +	/* Update the base address reg for the plane */

Useless comment.

> +	I915_WRITE(DSPSURF(plane), i915_gem_obj_ggtt_offset(obj) +
> +			intel_crtc->dspaddr_offset);

Should probably have a POSTING_READ() here.

> +}
> +
> +static bool intel_postpone_flip(struct drm_i915_gem_object *obj)
> +{
> +	if (!obj->ring)
> +		return false;
> +
> +	if (i915_seqno_passed(obj->ring->get_seqno(obj->ring, false),

Still not using lazy_coherency.

> +				obj->last_write_seqno))
> +		return false;
> +
> +	i915_gem_check_olr(obj->ring, obj->last_write_seqno);

Error handling is missing.

In fact if we get an error from this I guess we should fail
.queue_flip(). A bool return isn't sufficient to convey both
errors and whether we need to wait for the GPU or not. I guess
you could encode that into an int with >0,==0,<0 meaning
different things, or you may just need to inline this stuff
into intel_queue_mmio_flip().

> +
> +	if (WARN_ON(!obj->ring->irq_get(obj->ring)))
> +		return false;
> +
> +	return true;
> +}
> +
> +void intel_notify_mmio_flip(struct drm_device *dev,
> +			struct intel_ring_buffer *ring)

Again could just pass ring and dig out dev from it.

> +{
> +	struct drm_i915_private *dev_priv = dev->dev_private;
> +	struct drm_crtc *crtc;
> +	unsigned long irq_flags;
> +	u32 seqno;
> +
> +	seqno = ring->get_seqno(ring, false);
> +
> +	spin_lock_irqsave(&dev_priv->mmio_flip_lock, irq_flags);
> +	for_each_crtc(dev, crtc) {
> +		struct intel_crtc *intel_crtc;

Could use for_each_intel_crtc() to avoid some extra variables.

> +		struct intel_mmio_flip *mmio_flip_data;
> +
> +		intel_crtc = to_intel_crtc(crtc);
> +		mmio_flip_data = &intel_crtc->mmio_flip_data;
> +
> +		if (mmio_flip_data->seqno == 0)
> +			continue;
> +		if (ring->id != mmio_flip_data->ring_id)
> +			continue;
> +
> +		if (i915_seqno_passed(seqno, mmio_flip_data->seqno)) {
> +			intel_do_mmio_flip(dev, crtc);
> +			mmio_flip_data->seqno = 0;
> +			ring->irq_put(ring);
> +		}
> +	}
> +
> +	spin_unlock_irqrestore(&dev_priv->mmio_flip_lock, irq_flags);
> +}
> +
> +static int intel_queue_mmio_flip(struct drm_device *dev,
> +			struct drm_crtc *crtc,
> +			struct drm_framebuffer *fb,
> +			struct drm_i915_gem_object *obj,
> +			uint32_t flags)
> +{
> +	struct drm_i915_private *dev_priv = dev->dev_private;
> +	struct intel_crtc *intel_crtc = to_intel_crtc(crtc);
> +	unsigned long irq_flags;
> +	int ret;
> +
> +	ret = intel_pin_and_fence_fb_obj(dev, obj, obj->ring);
> +	if (ret)
> +		goto err;
> +
> +	if (!intel_postpone_flip(obj)) {
> +		intel_do_mmio_flip(dev, crtc);
> +		return 0;
> +	}
> +
> +	spin_lock_irqsave(&dev_priv->mmio_flip_lock, irq_flags);
> +	intel_crtc->mmio_flip_data.seqno = obj->last_write_seqno;
> +	intel_crtc->mmio_flip_data.ring_id = obj->ring->id;
> +	spin_unlock_irqrestore(&dev_priv->mmio_flip_lock, irq_flags);
> +
> +	/* Double check to catch cases where irq fired before
> +	 * mmio flip data was ready
> +	 */
> +	intel_notify_mmio_flip(dev, obj->ring);
> +	return 0;
> +
> +err:
> +	return ret;
> +}
> +
>  static int intel_gen7_queue_flip(struct drm_device *dev,
>  				 struct drm_crtc *crtc,
>  				 struct drm_framebuffer *fb,
> @@ -11377,7 +11481,18 @@ static void intel_init_display(struct drm_device *dev)
>  		break;
>  	}
>  
> +	/* Using MMIO based flips starting from Gen5, for Media power well
> +	 * residency optimization. This is not currently being used for
> +	 * older platforms because of non-availability of flip done interrupt.
> +	 * The other alternative of having Render ring based flip calls is
> +	 * not being used, as the performance(FPS) of certain 3D Apps gets
> +	 * severly affected.
> +	 */
> +	if (INTEL_INFO(dev)->gen >= 5)
> +		dev_priv->display.queue_flip = intel_queue_mmio_flip;

I still think we need a module param for this, and default to CS for
now (except maybe for VLV).

> +
>  	intel_panel_init_backlight_funcs(dev);
> +
>  }
>  
>  /*
> diff --git a/drivers/gpu/drm/i915/intel_drv.h b/drivers/gpu/drm/i915/intel_drv.h
> index 32a74e1..7953ed6 100644
> --- a/drivers/gpu/drm/i915/intel_drv.h
> +++ b/drivers/gpu/drm/i915/intel_drv.h
> @@ -351,6 +351,11 @@ struct intel_pipe_wm {
>  	bool sprites_scaled;
>  };
>  
> +struct intel_mmio_flip {
> +	u32 seqno;
> +	u32 ring_id;
> +};
> +
>  struct intel_crtc {
>  	struct drm_crtc base;
>  	enum pipe pipe;
> @@ -403,6 +408,7 @@ struct intel_crtc {
>  	} wm;
>  
>  	wait_queue_head_t vbl_wait;
> +	struct intel_mmio_flip	mmio_flip_data;
                              ^

Should be a space, not tab.

>  };
>  
>  struct intel_plane_wm_parameters {
> -- 
> 1.8.5.1

-- 
Ville Syrjälä
Intel OTC