[Intel-gfx] [PATCH 12/20] drm/i915: Add a delay between interrupt and inspecting the final seqno (ilk)
Tvrtko Ursulin
tvrtko.ursulin at linux.intel.com
Fri Jul 1 14:27:40 UTC 2016
On 01/07/16 12:22, Chris Wilson wrote:
> On Ironlake, there is no command nor register to ensure that the write
> from a MI_STORE command is completed (and coherent on the CPU) before the
> command parser continues. This means that the ordering between the seqno
Command *streamer* I think. (More instances below.)
> write and the subsequent user interrupt is undefined (like gen6+). So to
> ensure that the seqno write is completed after the final user interrupt
> we need to delay the read sufficiently to allow the write to complete.
> This delay is undefined by the bspec, and empirically requires 75us even
> though a register read combined with a clflush is less than 500ns. Hence,
> the delay is due to an on-chip buffer rather than the latency of the write
> to memory.
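To illustrate the race for anyone following along: a minimal sketch of
what a naive CPU-side completion check looks like here. The request
fields match the i915 of this period, but naive_request_complete()
itself is illustrative, not an actual i915 entry point:

    /* Sketch: on ilk the user interrupt can fire before the MI_STORE
     * of the seqno is coherent on the CPU, so a naive completion
     * check may read a stale value from the hardware status page.
     */
    static bool naive_request_complete(struct drm_i915_gem_request *req)
    {
            u32 seqno = intel_read_status_page(req->engine,
                                               I915_GEM_HWS_INDEX);

            /* racy: the write may still sit in an on-chip buffer */
            return i915_seqno_passed(seqno, req->seqno);
    }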
>
> Note that the render ring controls this by filling the PIPE_CONTROL fifo
> with stalling commands that force the earliest pipe-control with the
> seqno to be completed before the command parser continues. Given that we
> need a barrier operation for BSD, we may as well forgo the extra
> per-batch latency by using a common per-interrupt barrier.
>
> Studying the impact of adding the usleep shows that its cost on both
> sequences of, and individual, synchronous no-op batches is negligible
> for the media engine (where the write is now unordered with the
> interrupt). Converting the render engine over from the current glutton
> of pipe-controls to the per-interrupt delay speeds up both the
> sequential and individual synchronous no-ops by 20% and 60%,
> respectively. This speed up holds even when looking at the throughput
> of small copies (4KiB->4MiB), both serial and synchronous, by about
> 20%. This is because despite adding a significant delay to the
> interrupt, in all likelihood we will see the seqno write without
> having to apply the barrier (only in the rare corner cases where the
> write of the last required seqno is delayed is the extra delay
> necessary).
>
> Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=94307
> Testcase: igt/gem_sync #ilk
> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
> ---
> drivers/gpu/drm/i915/i915_irq.c | 10 ++--
> drivers/gpu/drm/i915/intel_ringbuffer.c | 86 ++++++++-------------------------
> 2 files changed, 23 insertions(+), 73 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
> index 7c379afcff2f..be7f0b9b27e0 100644
> --- a/drivers/gpu/drm/i915/i915_irq.c
> +++ b/drivers/gpu/drm/i915/i915_irq.c
> @@ -1264,8 +1264,7 @@ static void ivybridge_parity_error_irq_handler(struct drm_i915_private *dev_priv
> static void ilk_gt_irq_handler(struct drm_i915_private *dev_priv,
> u32 gt_iir)
> {
> - if (gt_iir &
> - (GT_RENDER_USER_INTERRUPT | GT_RENDER_PIPECTL_NOTIFY_INTERRUPT))
> + if (gt_iir & GT_RENDER_USER_INTERRUPT)
> notify_ring(&dev_priv->engine[RCS]);
> if (gt_iir & ILK_BSD_USER_INTERRUPT)
> notify_ring(&dev_priv->engine[VCS]);
> @@ -1274,9 +1273,7 @@ static void ilk_gt_irq_handler(struct drm_i915_private *dev_priv,
> static void snb_gt_irq_handler(struct drm_i915_private *dev_priv,
> u32 gt_iir)
> {
> -
> - if (gt_iir &
> - (GT_RENDER_USER_INTERRUPT | GT_RENDER_PIPECTL_NOTIFY_INTERRUPT))
> + if (gt_iir & GT_RENDER_USER_INTERRUPT)
> notify_ring(&dev_priv->engine[RCS]);
> if (gt_iir & GT_BSD_USER_INTERRUPT)
> notify_ring(&dev_priv->engine[VCS]);
> @@ -3601,8 +3598,7 @@ static void gen5_gt_irq_postinstall(struct drm_device *dev)
>
> gt_irqs |= GT_RENDER_USER_INTERRUPT;
> if (IS_GEN5(dev)) {
> - gt_irqs |= GT_RENDER_PIPECTL_NOTIFY_INTERRUPT |
> - ILK_BSD_USER_INTERRUPT;
> + gt_irqs |= ILK_BSD_USER_INTERRUPT;
> } else {
> gt_irqs |= GT_BLT_USER_INTERRUPT | GT_BSD_USER_INTERRUPT;
> }
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
> index f89b1797b465..d919e72f1328 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.c
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
> @@ -1593,67 +1593,22 @@ gen6_ring_sync(struct drm_i915_gem_request *waiter_req,
> return 0;
> }
>
> -#define PIPE_CONTROL_FLUSH(ring__, addr__) \
> -do { \
> - intel_ring_emit(ring__, GFX_OP_PIPE_CONTROL(4) | PIPE_CONTROL_QW_WRITE | \
> - PIPE_CONTROL_DEPTH_STALL); \
> - intel_ring_emit(ring__, (addr__) | PIPE_CONTROL_GLOBAL_GTT); \
> - intel_ring_emit(ring__, 0); \
> - intel_ring_emit(ring__, 0); \
> -} while (0)
> -
> -static int
> -pc_render_add_request(struct drm_i915_gem_request *req)
> +static void
> +gen5_seqno_barrier(struct intel_engine_cs *ring)
> {
> - struct intel_engine_cs *engine = req->engine;
> - u32 addr = engine->status_page.gfx_addr +
> - (I915_GEM_HWS_INDEX << MI_STORE_DWORD_INDEX_SHIFT);
> - u32 scratch_addr = addr;
> - int ret;
> -
> - /* For Ironlake, MI_USER_INTERRUPT was deprecated and apparently
> - * incoherent with writes to memory, i.e. completely fubar,
> - * so we need to use PIPE_NOTIFY instead.
> + /* MI_STORE are internally buffered by the GPU and not flushed
> + * either by MI_FLUSH or SyncFlush or any other combination of
> + * MI commands.
> *
> - * However, we also need to workaround the qword write
> - * incoherence by flushing the 6 PIPE_NOTIFY buffers out to
> - * memory before requesting an interrupt.
> + * "Only the submission of the store operation is guaranteed.
> + * The write result will be complete (coherent) some time later
> + * (this is practically a finite period but there is no guaranteed
> + * latency)."
> + *
> + * Empirically, we observe that we need a delay of at least 75us to
> + * be sure that the seqno write is visible by the CPU.
> */
> - ret = intel_ring_begin(req, 32);
> - if (ret)
> - return ret;
> -
> - intel_ring_emit(engine,
> - GFX_OP_PIPE_CONTROL(4) |
> - PIPE_CONTROL_QW_WRITE |
> - PIPE_CONTROL_WRITE_FLUSH |
> - PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE);
> - intel_ring_emit(engine, addr | PIPE_CONTROL_GLOBAL_GTT);
> - intel_ring_emit(engine, req->seqno);
> - intel_ring_emit(engine, 0);
> - PIPE_CONTROL_FLUSH(engine, scratch_addr);
> - scratch_addr += 2 * CACHELINE_BYTES; /* write to separate cachelines */
> - PIPE_CONTROL_FLUSH(engine, scratch_addr);
> - scratch_addr += 2 * CACHELINE_BYTES;
> - PIPE_CONTROL_FLUSH(engine, scratch_addr);
> - scratch_addr += 2 * CACHELINE_BYTES;
> - PIPE_CONTROL_FLUSH(engine, scratch_addr);
> - scratch_addr += 2 * CACHELINE_BYTES;
> - PIPE_CONTROL_FLUSH(engine, scratch_addr);
> - scratch_addr += 2 * CACHELINE_BYTES;
> - PIPE_CONTROL_FLUSH(engine, scratch_addr);
> -
> - intel_ring_emit(engine,
> - GFX_OP_PIPE_CONTROL(4) | PIPE_CONTROL_QW_WRITE |
> - PIPE_CONTROL_WRITE_FLUSH |
> - PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
> - PIPE_CONTROL_NOTIFY);
> - intel_ring_emit(engine, addr | PIPE_CONTROL_GLOBAL_GTT);
> - intel_ring_emit(engine, req->seqno);
> - intel_ring_emit(engine, 0);
> - __intel_ring_advance(engine);
> -
> - return 0;
> + usleep_range(125, 250);
> }
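For reference, the wait side consumes this hook roughly as below; this
is only a sketch of the ->irq_seqno_barrier contract, the actual i915
call site differs:

    /* Sketch: flush any buffered MI_STORE before trusting the value
     * in the hardware status page; on gen5 the barrier is the
     * usleep_range() added above.
     */
    static u32 sample_seqno(struct intel_engine_cs *engine)
    {
            if (engine->irq_seqno_barrier)
                    engine->irq_seqno_barrier(engine);

            return intel_read_status_page(engine, I915_GEM_HWS_INDEX);
    }

Note the 125-250us range deliberately overshoots the empirically
measured 75us minimum, presumably to leave some headroom.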
>
> static void
> @@ -2964,6 +2919,7 @@ static void intel_ring_init_irq(struct drm_i915_private *dev_priv,
> } else if (INTEL_GEN(dev_priv) >= 5) {
> engine->irq_get = gen5_ring_get_irq;
> engine->irq_put = gen5_ring_put_irq;
> + engine->irq_seqno_barrier = gen5_seqno_barrier;
> } else if (INTEL_GEN(dev_priv) >= 3) {
> engine->irq_get = i9xx_ring_get_irq;
> engine->irq_put = i9xx_ring_put_irq;
> @@ -3012,11 +2968,12 @@ int intel_init_render_ring_buffer(struct drm_device *dev)
>
> intel_ring_default_vfuncs(dev_priv, engine);
>
> + engine->irq_enable_mask = GT_RENDER_USER_INTERRUPT;
> +
> if (INTEL_GEN(dev_priv) >= 8) {
> engine->init_context = intel_rcs_ctx_init;
> engine->add_request = gen8_render_add_request;
> engine->flush = gen8_render_ring_flush;
> - engine->irq_enable_mask = GT_RENDER_USER_INTERRUPT;
> if (i915_semaphore_is_enabled(dev_priv))
> engine->semaphore.signal = gen8_rcs_signal;
> } else if (INTEL_GEN(dev_priv) >= 6) {
> @@ -3024,12 +2981,8 @@ int intel_init_render_ring_buffer(struct drm_device *dev)
> engine->flush = gen7_render_ring_flush;
> if (IS_GEN6(dev_priv))
> engine->flush = gen6_render_ring_flush;
> - engine->irq_enable_mask = GT_RENDER_USER_INTERRUPT;
> } else if (IS_GEN5(dev_priv)) {
> - engine->add_request = pc_render_add_request;
> engine->flush = gen4_render_ring_flush;
> - engine->irq_enable_mask = GT_RENDER_USER_INTERRUPT |
> - GT_RENDER_PIPECTL_NOTIFY_INTERRUPT;
> } else {
> if (INTEL_GEN(dev_priv) < 4)
> engine->flush = gen2_render_ring_flush;
> @@ -3048,7 +3001,7 @@ int intel_init_render_ring_buffer(struct drm_device *dev)
> if (ret)
> return ret;
>
> - if (INTEL_GEN(dev_priv) >= 5) {
> + if (INTEL_GEN(dev_priv) >= 6) {
> ret = intel_init_pipe_control(engine, 4096);
> if (ret)
> return ret;
> @@ -3087,9 +3040,10 @@ int intel_init_bsd_ring_buffer(struct drm_device *dev)
> } else {
> engine->mmio_base = BSD_RING_BASE;
> engine->flush = bsd_ring_flush;
> - if (IS_GEN5(dev_priv))
> + if (IS_GEN5(dev_priv)) {
> engine->irq_enable_mask = ILK_BSD_USER_INTERRUPT;
> - else
> + engine->irq_seqno_barrier = gen5_seqno_barrier;
This is already set in common setup AFAICS.
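That is, intel_ring_init_irq() in the hunk above already does:

    } else if (INTEL_GEN(dev_priv) >= 5) {
            ...
            engine->irq_seqno_barrier = gen5_seqno_barrier;

so the extra assignment here looks redundant.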
> + } else
> engine->irq_enable_mask = I915_BSD_USER_INTERRUPT;
> }
>
>
Otherwise looks good to me.
Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
Regards,
Tvrtko