[Intel-gfx] [PATCH 4/6] drm/i915/ringbuffer: Move irq seqno barrier to the GPU for gen7
Tvrtko Ursulin
tvrtko.ursulin at linux.intel.com
Mon Dec 31 10:43:07 UTC 2018
On 28/12/2018 17:16, Chris Wilson wrote:
> The irq_seqno_barrier is a tradeoff between doing work on every request
> (on the GPU) and doing work after every interrupt (on the CPU). We
> presume we have many more requests than interrupts! However, the current
> w/a for Ivybridge is an implicit delay that currently fails sporadically,
> and fails consistently if we move the w/a into the irq handler itself. This
> makes the CPU barrier untenable for upcoming interrupt handler changes
> and so we need to replace it with a delay on the GPU before we send the
> MI_USER_INTERRUPT. As it turns out, that delay is 32x MI_STORE_DWORD_IMM,
> or about 0.6us per request! Quite nasty, but the lesser of two evils when
> looking to the future.
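(For reference, 0.6us spread over the 32 dummy stores works out to roughly
19ns per store, so the cost scales linearly with GEN7_XCS_WA should the
magic count ever need tuning.)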
>
> Testcase: igt/gem_sync
> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
> ---
> drivers/gpu/drm/i915/intel_ringbuffer.c | 80 ++++++++++++++-----------
> 1 file changed, 44 insertions(+), 36 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
> index 2fb3a364c390..dd996103d495 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.c
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
> @@ -443,6 +443,34 @@ static void gen6_xcs_emit_breadcrumb(struct i915_request *rq, u32 *cs)
> }
> static const int gen6_xcs_emit_breadcrumb_sz = 4;
>
> +#define GEN7_XCS_WA 32
> +static void gen7_xcs_emit_breadcrumb(struct i915_request *rq, u32 *cs)
> +{
> + int i;
> +
> + *cs++ = MI_FLUSH_DW | MI_FLUSH_DW_OP_STOREDW;
> + *cs++ = intel_hws_seqno_address(rq->engine) | MI_FLUSH_DW_USE_GTT;
> + *cs++ = rq->global_seqno;
> +
> + for (i = 0; i < GEN7_XCS_WA; i++) {
> + *cs++ = MI_STORE_DWORD_INDEX;
> + *cs++ = I915_GEM_HWS_INDEX_ADDR;
> + *cs++ = rq->global_seqno;
> + }
> +
> + *cs++ = MI_FLUSH_DW;
> + *cs++ = 0;
> + *cs++ = 0;
> +
> + *cs++ = MI_USER_INTERRUPT;
> + *cs++ = MI_NOOP;
> +
> + rq->tail = intel_ring_offset(rq, cs);
> + assert_ring_tail_valid(rq->ring, rq->tail);
> +}
> +static const int gen7_xcs_emit_breadcrumb_sz = 8 + GEN7_XCS_WA * 3;
Is it worth moving this before the function, and then having a little
GEM_BUG_ON in the function, using cs pointer arithmetic, to check that the
two sizes match? Or, even simpler, the function could just check against
engine->emit_breadcrumb_sz.
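Something like this untested sketch is what I had in mind for the second
option (all names taken from this patch; GEM_BUG_ON is compiled out unless
CONFIG_DRM_I915_DEBUG_GEM, so it costs nothing in production):

    static void gen7_xcs_emit_breadcrumb(struct i915_request *rq, u32 *cs)
    {
            u32 * const start = cs; /* for the size check below */
            int i;

            *cs++ = MI_FLUSH_DW | MI_FLUSH_DW_OP_STOREDW;
            *cs++ = intel_hws_seqno_address(rq->engine) | MI_FLUSH_DW_USE_GTT;
            *cs++ = rq->global_seqno;

            /* Delay the MI_USER_INTERRUPT to give the seqno write
             * time to land before the CPU is woken. */
            for (i = 0; i < GEN7_XCS_WA; i++) {
                    *cs++ = MI_STORE_DWORD_INDEX;
                    *cs++ = I915_GEM_HWS_INDEX_ADDR;
                    *cs++ = rq->global_seqno;
            }

            *cs++ = MI_FLUSH_DW;
            *cs++ = 0;
            *cs++ = 0;

            *cs++ = MI_USER_INTERRUPT;
            *cs++ = MI_NOOP;

            rq->tail = intel_ring_offset(rq, cs);
            assert_ring_tail_valid(rq->ring, rq->tail);

            /* Catch any drift between the dwords emitted above and
             * the size we declared to the ring. */
            GEM_BUG_ON(cs - start != rq->engine->emit_breadcrumb_sz);
    }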
> +#undef GEN7_XCS_WA
> +
> static void set_hwstam(struct intel_engine_cs *engine, u32 mask)
> {
> /*
> @@ -874,31 +902,6 @@ gen5_seqno_barrier(struct intel_engine_cs *engine)
> usleep_range(125, 250);
> }
>
> -static void
> -gen6_seqno_barrier(struct intel_engine_cs *engine)
> -{
> - struct drm_i915_private *dev_priv = engine->i915;
> -
> - /* Workaround to force correct ordering between irq and seqno writes on
> - * ivb (and maybe also on snb) by reading from a CS register (like
> - * ACTHD) before reading the status page.
> - *
> - * Note that this effectively stalls the read by the time it takes to
> - * do a memory transaction, which more or less ensures that the write
> - * from the GPU has sufficient time to invalidate the CPU cacheline.
> - * Alternatively we could delay the interrupt from the CS ring to give
> - * the write time to land, but that would incur a delay after every
> - * batch i.e. much more frequent than a delay when waiting for the
> - * interrupt (with the same net latency).
> - *
> - * Also note that to prevent whole machine hangs on gen7, we have to
> - * take the spinlock to guard against concurrent cacheline access.
> - */
> - spin_lock_irq(&dev_priv->uncore.lock);
> - POSTING_READ_FW(RING_ACTHD(engine->mmio_base));
> - spin_unlock_irq(&dev_priv->uncore.lock);
> -}
> -
> static void
> gen5_irq_enable(struct intel_engine_cs *engine)
> {
> @@ -2258,10 +2261,13 @@ int intel_init_bsd_ring_buffer(struct intel_engine_cs *engine)
> engine->emit_flush = gen6_bsd_ring_flush;
> engine->irq_enable_mask = GT_BSD_USER_INTERRUPT;
>
> - engine->emit_breadcrumb = gen6_xcs_emit_breadcrumb;
> - engine->emit_breadcrumb_sz = gen6_xcs_emit_breadcrumb_sz;
> - if (!IS_GEN(dev_priv, 6))
> - engine->irq_seqno_barrier = gen6_seqno_barrier;
> + if (IS_GEN(dev_priv, 6)) {
> + engine->emit_breadcrumb = gen6_xcs_emit_breadcrumb;
> + engine->emit_breadcrumb_sz = gen6_xcs_emit_breadcrumb_sz;
> + } else {
> + engine->emit_breadcrumb = gen7_xcs_emit_breadcrumb;
> + engine->emit_breadcrumb_sz = gen7_xcs_emit_breadcrumb_sz;
> + }
> } else {
> engine->emit_flush = bsd_ring_flush;
> if (IS_GEN(dev_priv, 5))
> @@ -2284,10 +2290,13 @@ int intel_init_blt_ring_buffer(struct intel_engine_cs *engine)
> engine->emit_flush = gen6_ring_flush;
> engine->irq_enable_mask = GT_BLT_USER_INTERRUPT;
>
> - engine->emit_breadcrumb = gen6_xcs_emit_breadcrumb;
> - engine->emit_breadcrumb_sz = gen6_xcs_emit_breadcrumb_sz;
> - if (!IS_GEN(dev_priv, 6))
> - engine->irq_seqno_barrier = gen6_seqno_barrier;
> + if (IS_GEN(dev_priv, 6)) {
> + engine->emit_breadcrumb = gen6_xcs_emit_breadcrumb;
> + engine->emit_breadcrumb_sz = gen6_xcs_emit_breadcrumb_sz;
> + } else {
> + engine->emit_breadcrumb = gen7_xcs_emit_breadcrumb;
> + engine->emit_breadcrumb_sz = gen7_xcs_emit_breadcrumb_sz;
> + }
>
> return intel_init_ring_buffer(engine);
> }
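Also, not a blocker, but the gen6-vs-gen7 selection is now copy-pasted
between the bsd and blt init; a trivial helper could keep the two callers
in sync, e.g. (untested, name invented):

    static void xcs_setup_breadcrumb(struct intel_engine_cs *engine)
    {
            if (IS_GEN(engine->i915, 6)) {
                    engine->emit_breadcrumb = gen6_xcs_emit_breadcrumb;
                    engine->emit_breadcrumb_sz = gen6_xcs_emit_breadcrumb_sz;
            } else {
                    engine->emit_breadcrumb = gen7_xcs_emit_breadcrumb;
                    engine->emit_breadcrumb_sz = gen7_xcs_emit_breadcrumb_sz;
            }
    }

The vebox init below can keep its unconditional gen7 assignment either way,
since vebox never exists on gen6.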
> @@ -2305,9 +2314,8 @@ int intel_init_vebox_ring_buffer(struct intel_engine_cs *engine)
> engine->irq_enable = hsw_vebox_irq_enable;
> engine->irq_disable = hsw_vebox_irq_disable;
>
> - engine->emit_breadcrumb = gen6_xcs_emit_breadcrumb;
> - engine->emit_breadcrumb_sz = gen6_xcs_emit_breadcrumb_sz;
> - engine->irq_seqno_barrier = gen6_seqno_barrier;
> + engine->emit_breadcrumb = gen7_xcs_emit_breadcrumb;
> + engine->emit_breadcrumb_sz = gen7_xcs_emit_breadcrumb_sz;
>
> return intel_init_ring_buffer(engine);
> }
>
Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
Regards,
Tvrtko