[Intel-gfx] [PATCH v3 5/5] drm/i915/execlists: Read the context-status HEAD from the HWSP

Michel Thierry michel.thierry at intel.com
Thu Jul 13 18:12:34 UTC 2017


On 13/07/17 03:27, Chris Wilson wrote:
> The engine also provides a mirror of the CSB write pointer in the HWSP,
> but not of our read pointer. To take advantage of this we need to
> remember where we read up to on the last interrupt and continue off from
> there. This poses a problem following a reset, as we don't know where
> the hw will start writing from, and due to the use of power contexts we
> cannot perform that query during the reset itself. So we continue the
> current modus operandi of delaying the first read of the context-status
> read/write pointers until after the first interrupt. With this we should
> now have eliminated all uncached mmio reads in handling the
> context-status interrupt, though we still have the uncached mmio writes
> for submitting new work, and many uncached mmio reads in the global
> interrupt handler itself. Still a step in the right direction towards
> reducing our resubmit latency, although it appears lost in the noise!
> 
> v2: Cannonlake moved the CSB write index
> v3: Include the sw/hwsp state in debugfs/i915_engine_info
> v4: Also revert to using CSB mmio for GVT-g
> 
> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
> Cc: Michel Thierry <michel.thierry at intel.com>
> Cc: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
> Cc: Mika Kuoppala <mika.kuoppala at intel.com>
> Cc: Daniele Ceraolo Spurio <daniele.ceraolospurio at intel.com>
> Cc: Zhenyu Wang <zhenyuw at linux.intel.com>
> Cc: Zhi Wang <zhi.a.wang at intel.com>
> ---
>   drivers/gpu/drm/i915/i915_debugfs.c     |  6 ++++--
>   drivers/gpu/drm/i915/i915_drv.h         |  8 ++++++++
>   drivers/gpu/drm/i915/intel_lrc.c        | 25 ++++++++++++++++++++-----
>   drivers/gpu/drm/i915/intel_ringbuffer.h |  3 +++
>   4 files changed, 35 insertions(+), 7 deletions(-)
> 
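
A condensed view of the new flow in the irq handler, as I read it (a rough
sketch only, reusing the names from the patch below):

	/* After a reset we don't know where the hw will start writing,
	 * so the first pass still goes through the uncached mmio pointer
	 * register; from then on the tail comes from the HWSP mirror and
	 * the head from our cached copy.
	 */
	if (engine->csb_head == -1) {		/* first irq after reset */
		u32 ptr = readl(csb_mmio);
		head = GEN8_CSB_READ_PTR(ptr);
		tail = GEN8_CSB_WRITE_PTR(ptr);
	} else {
		head = engine->csb_head;
		tail = buf[intel_hws_csb_write_index(dev_priv) -
			   I915_HWS_CSB_BUF0_INDEX];
	}
	/* ... process entries, then cache and write back head if it moved */
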
> diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
> index 5fd01c14a3ec..552aef61b47b 100644
> --- a/drivers/gpu/drm/i915/i915_debugfs.c
> +++ b/drivers/gpu/drm/i915/i915_debugfs.c
> @@ -3395,8 +3395,10 @@ static int i915_engine_info(struct seq_file *m, void *unused)
>   			ptr = I915_READ(RING_CONTEXT_STATUS_PTR(engine));
>   			read = GEN8_CSB_READ_PTR(ptr);
>   			write = GEN8_CSB_WRITE_PTR(ptr);
> -			seq_printf(m, "\tExeclist CSB read %d, write %d\n",
> -				   read, write);
> +			seq_printf(m, "\tExeclist CSB read %d [%d cached], write %d [%d from hws]\n",
> +				   read, engine->csb_head,
> +				   write,
> +				   intel_read_status_page(engine, intel_hws_csb_write_index(engine->i915)));
>   			if (read >= GEN8_CSB_ENTRIES)
>   				read = 0;
>   			if (write >= GEN8_CSB_ENTRIES)
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index 81cd21ecfa7d..f62c9db8a9a8 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -4228,4 +4228,12 @@ static inline bool i915_gem_object_is_coherent(struct drm_i915_gem_object *obj)
>   		HAS_LLC(to_i915(obj->base.dev)));
>   }
>   
> +static inline int intel_hws_csb_write_index(struct drm_i915_private *i915)
> +{
> +	if (INTEL_GEN(i915) >= 10)
> +		return CNL_HWS_CSB_WRITE_INDEX;
> +	else
> +		return I915_HWS_CSB_WRITE_INDEX;
> +}
> +
>   #endif
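
Small note for readers: the helper returns a dword index into the per-engine
HWSP, so (as the debugfs hunk above already does) the mirrored write pointer
can be read with the usual status-page accessor, e.g.:

	/* read the hw's CSB write pointer mirror out of the HWSP */
	u32 write = intel_read_status_page(engine,
					   intel_hws_csb_write_index(engine->i915));
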
> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> index 5b721f65d232..7c3dce27e504 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/intel_lrc.c
> @@ -556,6 +556,7 @@ static void intel_lrc_irq_handler(unsigned long data)
>   		if (unlikely(intel_vgpu_active(dev_priv))) {
>   			buf = (u32 * __force)
>   				(dev_priv->regs + i915_mmio_reg_offset(RING_CONTEXT_STATUS_PTR(engine)));
> +			engine->csb_head = -1;

I would add a comment explaining why you have to change csb_head here, something like:

			engine->csb_head = -1; /* force CSB via mmio */

so it is obvious it is intentional and not a leftover.
>   		}
>   
>   		/* The write will be ordered by the uncached read (itself

Looks good to me and it works on SKL. The news about BXT is promising.

Since this, and the patch before it, modify the behaviour for all GEN8+, 
another set of eyes (or more) would be good.

Acked-by: Michel Thierry <michel.thierry at intel.com>

> @@ -569,9 +570,19 @@ static void intel_lrc_irq_handler(unsigned long data)
>   		 * is set and we do a new loop.
>   		 */
>   		__clear_bit(ENGINE_IRQ_EXECLIST, &engine->irq_posted);
> -		head = readl(csb_mmio);
> -		tail = GEN8_CSB_WRITE_PTR(head);
> -		head = GEN8_CSB_READ_PTR(head);
> +		if (unlikely(engine->csb_head == -1)) { /* following a reset */
> +			head = readl(csb_mmio);
> +			tail = GEN8_CSB_WRITE_PTR(head);
> +			head = GEN8_CSB_READ_PTR(head);
> +			engine->csb_head = head;
> +		} else {
> +			const int write_idx =
> +				intel_hws_csb_write_index(dev_priv) -
> +				I915_HWS_CSB_BUF0_INDEX;
> +
> +			head = engine->csb_head;
> +			tail = buf[write_idx];
> +		}
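
The relative index took me a second: after the previous patch, buf should
already point at the first CSB entry in the HWSP (I915_HWS_CSB_BUF0_INDEX),
so the subtraction just turns the absolute HWSP dword index into an offset
from buf. Spelling it out for gen8/9 with the defines added further down:

	/* HWSP dword 0x1f holds the write pointer mirror and the CSB
	 * buffer itself starts at dword 0x10, so relative to buf[]:
	 */
	const int write_idx = 0x1f - 0x10;	/* == 0xf */
	tail = buf[write_idx];

(CNL moves the mirror to 0x2f, hence the helper.)
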
>   		while (head != tail) {
>   			struct drm_i915_gem_request *rq;
>   			unsigned int status;
> @@ -625,8 +636,11 @@ static void intel_lrc_irq_handler(unsigned long data)
>   				   !(status & GEN8_CTX_STATUS_ACTIVE_IDLE));
>   		}
>   
> -		writel(_MASKED_FIELD(GEN8_CSB_READ_PTR_MASK, head << 8),
> -		       csb_mmio);
> +		if (head != engine->csb_head) {
> +			engine->csb_head = head;
> +			writel(_MASKED_FIELD(GEN8_CSB_READ_PTR_MASK, head << 8),
> +			       csb_mmio);
> +		}
>   	}
>   
>   	if (execlists_elsp_ready(engine))
> @@ -1253,6 +1267,7 @@ static int gen8_init_common_ring(struct intel_engine_cs *engine)
>   
>   	/* After a GPU reset, we may have requests to replay */
>   	clear_bit(ENGINE_IRQ_EXECLIST, &engine->irq_posted);
> +	engine->csb_head = -1;
>   
>   	submit = false;
>   	for (n = 0; n < ARRAY_SIZE(engine->execlist_port); n++) {
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
> index 2c55cfa14fb5..a182da7eb9a9 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.h
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
> @@ -391,6 +391,7 @@ struct intel_engine_cs {
>   	struct rb_root execlist_queue;
>   	struct rb_node *execlist_first;
>   	unsigned int fw_domains;
> +	unsigned int csb_head;
>   
>   	/* Contexts are pinned whilst they are active on the GPU. The last
>   	 * context executed remains active whilst the GPU is idle - the
> @@ -497,6 +498,8 @@ intel_write_status_page(struct intel_engine_cs *engine, int reg, u32 value)
>   #define I915_GEM_HWS_SCRATCH_ADDR (I915_GEM_HWS_SCRATCH_INDEX << MI_STORE_DWORD_INDEX_SHIFT)
>   
>   #define I915_HWS_CSB_BUF0_INDEX		0x10
> +#define I915_HWS_CSB_WRITE_INDEX	0x1f
> +#define CNL_HWS_CSB_WRITE_INDEX		0x2f
>   
>   struct intel_ring *
>   intel_engine_create_ring(struct intel_engine_cs *engine, int size);
> 

