[Intel-gfx] [PATCH 19/23] drm/i915: Track the context's seqno in its own timeline HWSP
Tvrtko Ursulin
tvrtko.ursulin at linux.intel.com
Fri Jan 18 14:10:35 UTC 2019
On 17/01/2019 14:35, Chris Wilson wrote:
> Now that we have allocated ourselves a cacheline to store a breadcrumb,
> we can emit a write from the GPU into the timeline's HWSP of the
> per-context seqno as we complete each request. This drops the mirroring
> of the per-engine HWSP and allows each context to operate independently.
> We do not need to unwind the per-context timeline, and so requests are
> always consistent with the timeline breadcrumb, greatly simplifying the
> completion checks as we no longer need to be concerned about the
> global_seqno changing mid check.
>
> At this point, we are emitting both per-context and global seqno and
> still using the single per-engine execution timeline for resolving
> interrupts.
>
> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
> ---
> drivers/gpu/drm/i915/i915_gem.c | 2 +-
> drivers/gpu/drm/i915/i915_request.c | 2 +-
> drivers/gpu/drm/i915/i915_request.h | 27 ++----
> drivers/gpu/drm/i915/i915_reset.c | 1 +
> drivers/gpu/drm/i915/i915_vma.h | 7 ++
> drivers/gpu/drm/i915/intel_lrc.c | 32 ++++---
> drivers/gpu/drm/i915/intel_ringbuffer.c | 91 ++++++++++++++------
> drivers/gpu/drm/i915/selftests/mock_engine.c | 8 +-
> 8 files changed, 109 insertions(+), 61 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index 3c6091021290..a5bd51987c0d 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -2890,7 +2890,7 @@ i915_gem_find_active_request(struct intel_engine_cs *engine)
> */
> spin_lock_irqsave(&engine->timeline.lock, flags);
> list_for_each_entry(request, &engine->timeline.requests, link) {
> - if (__i915_request_completed(request, request->global_seqno))
> + if (i915_request_completed(request))
> continue;
>
> active = request;
> diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c
> index 7d068c406a49..0d7b71aff28f 100644
> --- a/drivers/gpu/drm/i915/i915_request.c
> +++ b/drivers/gpu/drm/i915/i915_request.c
> @@ -614,7 +614,7 @@ i915_request_alloc(struct intel_engine_cs *engine, struct i915_gem_context *ctx)
> rq->ring = ce->ring;
> rq->timeline = ce->ring->timeline;
> GEM_BUG_ON(rq->timeline == &engine->timeline);
> - rq->hwsp_seqno = &engine->status_page.addr[I915_GEM_HWS_INDEX];
> + rq->hwsp_seqno = rq->timeline->hwsp_seqno;
>
> spin_lock_init(&rq->lock);
> dma_fence_init(&rq->fence,
> diff --git a/drivers/gpu/drm/i915/i915_request.h b/drivers/gpu/drm/i915/i915_request.h
> index 4dd22dadf5ce..a16a3b7f7d92 100644
> --- a/drivers/gpu/drm/i915/i915_request.h
> +++ b/drivers/gpu/drm/i915/i915_request.h
> @@ -324,32 +324,21 @@ static inline u32 hwsp_seqno(const struct i915_request *rq)
> */
> static inline bool i915_request_started(const struct i915_request *rq)
> {
> - u32 seqno;
> -
> - seqno = i915_request_global_seqno(rq);
> - if (!seqno) /* not yet submitted to HW */
> - return false;
> -
> - return i915_seqno_passed(hwsp_seqno(rq), seqno - 1);
> + return i915_seqno_passed(hwsp_seqno(rq), rq->fence.seqno - 1);
> }
>
> -static inline bool
> -__i915_request_completed(const struct i915_request *rq, u32 seqno)
> +static inline bool i915_request_completed(const struct i915_request *rq)
> {
> - GEM_BUG_ON(!seqno);
> - return i915_seqno_passed(hwsp_seqno(rq), seqno) &&
> - seqno == i915_request_global_seqno(rq);
> + return i915_seqno_passed(hwsp_seqno(rq), rq->fence.seqno);
> }
>
> -static inline bool i915_request_completed(const struct i915_request *rq)
> +static inline void i915_request_fake_complete(const struct i915_request *rq)
I don't like this name. force_complete? force_hwsp_complete? Or turn it
around, i915_hwsp_write(rq->hwsp_seqno, rq->fence.seqno)? Now it is
beginning to remind of intel_write_status_page. :)
> {
> - u32 seqno;
> -
> - seqno = i915_request_global_seqno(rq);
> - if (!seqno)
> - return false;
> + /* Don't allow ourselves to accidentally go backwards. */
> + if (i915_request_completed(rq))
> + return;
>
> - return __i915_request_completed(rq, seqno);
> + WRITE_ONCE(*(u32 *)rq->hwsp_seqno, rq->fence.seqno);
> }
>
> void i915_retire_requests(struct drm_i915_private *i915);
> diff --git a/drivers/gpu/drm/i915/i915_reset.c b/drivers/gpu/drm/i915/i915_reset.c
> index 12e5a2bc825c..eff76558b958 100644
> --- a/drivers/gpu/drm/i915/i915_reset.c
> +++ b/drivers/gpu/drm/i915/i915_reset.c
> @@ -756,6 +756,7 @@ static void nop_submit_request(struct i915_request *request)
>
> spin_lock_irqsave(&request->engine->timeline.lock, flags);
> __i915_request_submit(request);
> + i915_request_fake_complete(request);
> intel_engine_write_global_seqno(request->engine, request->global_seqno);
> spin_unlock_irqrestore(&request->engine->timeline.lock, flags);
> }
> diff --git a/drivers/gpu/drm/i915/i915_vma.h b/drivers/gpu/drm/i915/i915_vma.h
> index 5793abe509a2..18be786a970d 100644
> --- a/drivers/gpu/drm/i915/i915_vma.h
> +++ b/drivers/gpu/drm/i915/i915_vma.h
> @@ -221,6 +221,13 @@ static inline u32 i915_ggtt_offset(const struct i915_vma *vma)
> return lower_32_bits(vma->node.start);
> }
>
> +/* XXX inline spaghetti */
> +static inline u32 i915_timeline_seqno_address(const struct i915_timeline *tl)
> +{
> + GEM_BUG_ON(!tl->pin_count);
> + return i915_ggtt_offset(tl->hwsp_ggtt) + tl->hwsp_offset;
> +}
> +
> static inline u32 i915_ggtt_pin_bias(struct i915_vma *vma)
> {
> return i915_vm_to_ggtt(vma->vm)->pin_bias;
> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> index a624e644fbd7..593928dd6bbe 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/intel_lrc.c
> @@ -827,10 +827,10 @@ static void execlists_cancel_requests(struct intel_engine_cs *engine)
> list_for_each_entry(rq, &engine->timeline.requests, link) {
> GEM_BUG_ON(!rq->global_seqno);
>
> - if (test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &rq->fence.flags))
> - continue;
> + if (!test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &rq->fence.flags))
> + dma_fence_set_error(&rq->fence, -EIO);
>
> - dma_fence_set_error(&rq->fence, -EIO);
> + i915_request_fake_complete(rq);
> }
>
> /* Flush the queued requests to the timeline list (for retiring). */
> @@ -843,6 +843,7 @@ static void execlists_cancel_requests(struct intel_engine_cs *engine)
>
> dma_fence_set_error(&rq->fence, -EIO);
> __i915_request_submit(rq);
> + i915_request_fake_complete(rq);
> }
>
> rb_erase_cached(&p->node, &execlists->queue);
> @@ -2022,31 +2023,40 @@ static void gen8_emit_breadcrumb(struct i915_request *request, u32 *cs)
> /* w/a: bit 5 needs to be zero for MI_FLUSH_DW address. */
> BUILD_BUG_ON(I915_GEM_HWS_INDEX_ADDR & (1 << 5));
>
> - cs = gen8_emit_ggtt_write(cs, request->global_seqno,
> + cs = gen8_emit_ggtt_write(cs,
> + request->fence.seqno,
> + i915_timeline_seqno_address(request->timeline));
> +
> + cs = gen8_emit_ggtt_write(cs,
> + request->global_seqno,
> intel_hws_seqno_address(request->engine));
> +
> *cs++ = MI_USER_INTERRUPT;
> *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
> +
> request->tail = intel_ring_offset(request, cs);
> assert_ring_tail_valid(request->ring, request->tail);
>
> gen8_emit_wa_tail(request, cs);
> }
> -static const int gen8_emit_breadcrumb_sz = 6 + WA_TAIL_DWORDS;
> +static const int gen8_emit_breadcrumb_sz = 10 + WA_TAIL_DWORDS;
>
> static void gen8_emit_breadcrumb_rcs(struct i915_request *request, u32 *cs)
> {
> - /* We're using qword write, seqno should be aligned to 8 bytes. */
> - BUILD_BUG_ON(I915_GEM_HWS_INDEX & 1);
> -
> cs = gen8_emit_ggtt_write_rcs(cs,
> - request->global_seqno,
> - intel_hws_seqno_address(request->engine),
> + request->fence.seqno,
> + i915_timeline_seqno_address(request->timeline),
> PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
> PIPE_CONTROL_DEPTH_CACHE_FLUSH |
> PIPE_CONTROL_DC_FLUSH_ENABLE |
> PIPE_CONTROL_FLUSH_ENABLE |
> PIPE_CONTROL_CS_STALL);
>
> + cs = gen8_emit_ggtt_write_rcs(cs,
> + request->global_seqno,
> + intel_hws_seqno_address(request->engine),
> + PIPE_CONTROL_CS_STALL);
> +
> *cs++ = MI_USER_INTERRUPT;
> *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
>
> @@ -2055,7 +2065,7 @@ static void gen8_emit_breadcrumb_rcs(struct i915_request *request, u32 *cs)
>
> gen8_emit_wa_tail(request, cs);
> }
> -static const int gen8_emit_breadcrumb_rcs_sz = 8 + WA_TAIL_DWORDS;
> +static const int gen8_emit_breadcrumb_rcs_sz = 14 + WA_TAIL_DWORDS;
>
> static int gen8_init_rcs_context(struct i915_request *rq)
> {
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
> index 5887304bc3ae..bcc700e7037b 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.c
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
> @@ -325,6 +325,12 @@ static void gen6_rcs_emit_breadcrumb(struct i915_request *rq, u32 *cs)
> PIPE_CONTROL_DC_FLUSH_ENABLE |
> PIPE_CONTROL_QW_WRITE |
> PIPE_CONTROL_CS_STALL);
> + *cs++ = i915_timeline_seqno_address(rq->timeline) |
> + PIPE_CONTROL_GLOBAL_GTT;
> + *cs++ = rq->fence.seqno;
> +
> + *cs++ = GFX_OP_PIPE_CONTROL(4);
> + *cs++ = PIPE_CONTROL_QW_WRITE | PIPE_CONTROL_CS_STALL;
> *cs++ = intel_hws_seqno_address(rq->engine) | PIPE_CONTROL_GLOBAL_GTT;
> *cs++ = rq->global_seqno;
>
> @@ -334,7 +340,7 @@ static void gen6_rcs_emit_breadcrumb(struct i915_request *rq, u32 *cs)
> rq->tail = intel_ring_offset(rq, cs);
> assert_ring_tail_valid(rq->ring, rq->tail);
> }
> -static const int gen6_rcs_emit_breadcrumb_sz = 14;
> +static const int gen6_rcs_emit_breadcrumb_sz = 18;
>
> static int
> gen7_render_ring_cs_stall_wa(struct i915_request *rq)
> @@ -425,6 +431,13 @@ static void gen7_rcs_emit_breadcrumb(struct i915_request *rq, u32 *cs)
> PIPE_CONTROL_QW_WRITE |
> PIPE_CONTROL_GLOBAL_GTT_IVB |
> PIPE_CONTROL_CS_STALL);
> + *cs++ = i915_timeline_seqno_address(rq->timeline);
> + *cs++ = rq->fence.seqno;
> +
> + *cs++ = GFX_OP_PIPE_CONTROL(4);
> + *cs++ = (PIPE_CONTROL_QW_WRITE |
> + PIPE_CONTROL_GLOBAL_GTT_IVB |
> + PIPE_CONTROL_CS_STALL);
> *cs++ = intel_hws_seqno_address(rq->engine);
> *cs++ = rq->global_seqno;
>
> @@ -434,27 +447,37 @@ static void gen7_rcs_emit_breadcrumb(struct i915_request *rq, u32 *cs)
> rq->tail = intel_ring_offset(rq, cs);
> assert_ring_tail_valid(rq->ring, rq->tail);
> }
> -static const int gen7_rcs_emit_breadcrumb_sz = 6;
> +static const int gen7_rcs_emit_breadcrumb_sz = 10;
>
> static void gen6_xcs_emit_breadcrumb(struct i915_request *rq, u32 *cs)
> {
> - *cs++ = MI_FLUSH_DW | MI_FLUSH_DW_OP_STOREDW;
> - *cs++ = intel_hws_seqno_address(rq->engine) | MI_FLUSH_DW_USE_GTT;
> + *cs++ = MI_FLUSH_DW | MI_FLUSH_DW_OP_STOREDW | MI_FLUSH_DW_STORE_INDEX;
> + *cs++ = I915_GEM_HWS_SEQNO_ADDR | MI_FLUSH_DW_USE_GTT;
> + *cs++ = rq->fence.seqno;
> +
> + *cs++ = MI_FLUSH_DW | MI_FLUSH_DW_OP_STOREDW | MI_FLUSH_DW_STORE_INDEX;
> + *cs++ = I915_GEM_HWS_INDEX_ADDR | MI_FLUSH_DW_USE_GTT;
> *cs++ = rq->global_seqno;
> +
> *cs++ = MI_USER_INTERRUPT;
> + *cs++ = MI_NOOP;
>
> rq->tail = intel_ring_offset(rq, cs);
> assert_ring_tail_valid(rq->ring, rq->tail);
> }
> -static const int gen6_xcs_emit_breadcrumb_sz = 4;
> +static const int gen6_xcs_emit_breadcrumb_sz = 8;
>
> #define GEN7_XCS_WA 32
> static void gen7_xcs_emit_breadcrumb(struct i915_request *rq, u32 *cs)
> {
> int i;
>
> - *cs++ = MI_FLUSH_DW | MI_FLUSH_DW_OP_STOREDW;
> - *cs++ = intel_hws_seqno_address(rq->engine) | MI_FLUSH_DW_USE_GTT;
> + *cs++ = MI_FLUSH_DW | MI_FLUSH_DW_OP_STOREDW | MI_FLUSH_DW_STORE_INDEX;
> + *cs++ = I915_GEM_HWS_SEQNO_ADDR | MI_FLUSH_DW_USE_GTT;
> + *cs++ = rq->fence.seqno;
> +
> + *cs++ = MI_FLUSH_DW | MI_FLUSH_DW_OP_STOREDW | MI_FLUSH_DW_STORE_INDEX;
> + *cs++ = I915_GEM_HWS_INDEX_ADDR | MI_FLUSH_DW_USE_GTT;
> *cs++ = rq->global_seqno;
>
> for (i = 0; i < GEN7_XCS_WA; i++) {
> @@ -468,12 +491,11 @@ static void gen7_xcs_emit_breadcrumb(struct i915_request *rq, u32 *cs)
> *cs++ = 0;
>
> *cs++ = MI_USER_INTERRUPT;
> - *cs++ = MI_NOOP;
>
> rq->tail = intel_ring_offset(rq, cs);
> assert_ring_tail_valid(rq->ring, rq->tail);
> }
> -static const int gen7_xcs_emit_breadcrumb_sz = 8 + GEN7_XCS_WA * 3;
> +static const int gen7_xcs_emit_breadcrumb_sz = 10 + GEN7_XCS_WA * 3;
> #undef GEN7_XCS_WA
>
> static void set_hwstam(struct intel_engine_cs *engine, u32 mask)
> @@ -733,7 +755,7 @@ static void reset_ring(struct intel_engine_cs *engine, bool stalled)
> rq = NULL;
> spin_lock_irqsave(&tl->lock, flags);
> list_for_each_entry(pos, &tl->requests, link) {
> - if (!__i915_request_completed(pos, pos->global_seqno)) {
> + if (!i915_request_completed(pos)) {
> rq = pos;
> break;
> }
> @@ -875,11 +897,10 @@ static void cancel_requests(struct intel_engine_cs *engine)
> list_for_each_entry(request, &engine->timeline.requests, link) {
> GEM_BUG_ON(!request->global_seqno);
>
> - if (test_bit(DMA_FENCE_FLAG_SIGNALED_BIT,
> - &request->fence.flags))
> - continue;
> -
> - dma_fence_set_error(&request->fence, -EIO);
> + if (!test_bit(DMA_FENCE_FLAG_SIGNALED_BIT,
> + &request->fence.flags))
> + dma_fence_set_error(&request->fence, -EIO);
> + i915_request_fake_complete(request);
> }
>
> intel_write_status_page(engine,
> @@ -903,27 +924,38 @@ static void i9xx_submit_request(struct i915_request *request)
>
> static void i9xx_emit_breadcrumb(struct i915_request *rq, u32 *cs)
> {
> + GEM_BUG_ON(rq->timeline->hwsp_ggtt != rq->engine->status_page.vma);
> +
> *cs++ = MI_FLUSH;
>
> + *cs++ = MI_STORE_DWORD_INDEX;
> + *cs++ = I915_GEM_HWS_SEQNO_ADDR;
> + *cs++ = rq->fence.seqno;
> +
> *cs++ = MI_STORE_DWORD_INDEX;
> *cs++ = I915_GEM_HWS_INDEX_ADDR;
> *cs++ = rq->global_seqno;
>
> *cs++ = MI_USER_INTERRUPT;
> - *cs++ = MI_NOOP;
>
> rq->tail = intel_ring_offset(rq, cs);
> assert_ring_tail_valid(rq->ring, rq->tail);
> }
> -static const int i9xx_emit_breadcrumb_sz = 6;
> +static const int i9xx_emit_breadcrumb_sz = 8;
>
> #define GEN5_WA_STORES 8 /* must be at least 1! */
> static void gen5_emit_breadcrumb(struct i915_request *rq, u32 *cs)
> {
> int i;
>
> + GEM_BUG_ON(rq->timeline->hwsp_ggtt != rq->engine->status_page.vma);
> +
> *cs++ = MI_FLUSH;
>
> + *cs++ = MI_STORE_DWORD_INDEX;
> + *cs++ = I915_GEM_HWS_SEQNO_ADDR;
> + *cs++ = rq->fence.seqno;
> +
> BUILD_BUG_ON(GEN5_WA_STORES < 1);
> for (i = 0; i < GEN5_WA_STORES; i++) {
> *cs++ = MI_STORE_DWORD_INDEX;
> @@ -932,11 +964,12 @@ static void gen5_emit_breadcrumb(struct i915_request *rq, u32 *cs)
> }
>
> *cs++ = MI_USER_INTERRUPT;
> + *cs++ = MI_NOOP;
>
> rq->tail = intel_ring_offset(rq, cs);
> assert_ring_tail_valid(rq->ring, rq->tail);
> }
> -static const int gen5_emit_breadcrumb_sz = GEN5_WA_STORES * 3 + 2;
> +static const int gen5_emit_breadcrumb_sz = GEN5_WA_STORES * 3 + 6;
> #undef GEN5_WA_STORES
>
> static void
> @@ -1163,6 +1196,10 @@ int intel_ring_pin(struct intel_ring *ring)
>
> GEM_BUG_ON(ring->vaddr);
>
> + ret = i915_timeline_pin(ring->timeline);
> + if (ret)
> + return ret;
> +
> flags = PIN_GLOBAL;
>
> /* Ring wraparound at offset 0 sometimes hangs. No idea why. */
> @@ -1179,28 +1216,32 @@ int intel_ring_pin(struct intel_ring *ring)
> else
> ret = i915_gem_object_set_to_cpu_domain(vma->obj, true);
> if (unlikely(ret))
> - return ret;
> + goto unpin_timeline;
> }
>
> ret = i915_vma_pin(vma, 0, 0, flags);
> if (unlikely(ret))
> - return ret;
> + goto unpin_timeline;
>
> if (i915_vma_is_map_and_fenceable(vma))
> addr = (void __force *)i915_vma_pin_iomap(vma);
> else
> addr = i915_gem_object_pin_map(vma->obj, map);
> - if (IS_ERR(addr))
> - goto err;
> + if (IS_ERR(addr)) {
> + ret = PTR_ERR(addr);
> + goto unpin_ring;
> + }
>
> vma->obj->pin_global++;
>
> ring->vaddr = addr;
> return 0;
>
> -err:
> +unpin_ring:
> i915_vma_unpin(vma);
> - return PTR_ERR(addr);
> +unpin_timeline:
> + i915_timeline_unpin(ring->timeline);
> + return ret;
> }
>
> void intel_ring_reset(struct intel_ring *ring, u32 tail)
> @@ -1229,6 +1270,8 @@ void intel_ring_unpin(struct intel_ring *ring)
>
> ring->vma->obj->pin_global--;
> i915_vma_unpin(ring->vma);
> +
> + i915_timeline_unpin(ring->timeline);
> }
>
> static struct i915_vma *
> diff --git a/drivers/gpu/drm/i915/selftests/mock_engine.c b/drivers/gpu/drm/i915/selftests/mock_engine.c
> index acd27c7e807b..b4b61056b227 100644
> --- a/drivers/gpu/drm/i915/selftests/mock_engine.c
> +++ b/drivers/gpu/drm/i915/selftests/mock_engine.c
> @@ -79,6 +79,7 @@ static void advance(struct mock_engine *engine,
> struct mock_request *request)
> {
> list_del_init(&request->link);
> + i915_request_fake_complete(&request->base);
> mock_seqno_advance(&engine->base, request->base.global_seqno);
> }
>
> @@ -253,16 +254,13 @@ void mock_engine_flush(struct intel_engine_cs *engine)
> del_timer_sync(&mock->hw_delay);
>
> spin_lock_irq(&mock->hw_lock);
> - list_for_each_entry_safe(request, rn, &mock->hw_queue, link) {
> - list_del_init(&request->link);
> - mock_seqno_advance(&mock->base, request->base.global_seqno);
> - }
> + list_for_each_entry_safe(request, rn, &mock->hw_queue, link)
> + advance(mock, request);
> spin_unlock_irq(&mock->hw_lock);
> }
>
> void mock_engine_reset(struct intel_engine_cs *engine)
> {
> - intel_write_status_page(engine, I915_GEM_HWS_INDEX, 0);
> }
>
> void mock_engine_free(struct intel_engine_cs *engine)
>
Looks good.
Regards,
Tvrtko
More information about the Intel-gfx
mailing list