[Intel-gfx] [PATCH 23/33] drm/i915: Track the context's seqno in its own timeline HWSP
Chris Wilson
chris at chris-wilson.co.uk
Fri Jan 25 02:29:55 UTC 2019
Now that we have allocated ourselves a cacheline to store a breadcrumb,
we can emit a write from the GPU into the timeline's HWSP of the
per-context seqno as we complete each request. This drops the mirroring
of the per-engine HWSP and allows each context to operate independently.
We do not need to unwind the per-context timeline, and so requests are
always consistent with the timeline breadcrumb, greatly simplifying the
completion checks as we no longer need to be concerned about the
global_seqno changing mid check.
One complication though is that we have to be wary that the request may
outlive the HWSP and so avoid touching the potentially danging pointer
after we have retired the fence. We also have to guard our access of the
HWSP with RCU, the release of the obj->mm.pages should already be RCU-safe.
At this point, we are emitting both per-context and global seqno and
still using the single per-engine execution timeline for resolving
interrupts.
v2: s/fake_complete/mark_complete/
Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
---
drivers/gpu/drm/i915/i915_gem.c | 2 +-
drivers/gpu/drm/i915/i915_request.c | 3 +-
drivers/gpu/drm/i915/i915_request.h | 30 +++-----
drivers/gpu/drm/i915/i915_reset.c | 1 +
drivers/gpu/drm/i915/i915_vma.h | 6 ++
drivers/gpu/drm/i915/intel_engine_cs.c | 15 +++-
drivers/gpu/drm/i915/intel_lrc.c | 31 +++++---
drivers/gpu/drm/i915/intel_ringbuffer.c | 76 +++++++++++++++-----
drivers/gpu/drm/i915/selftests/mock_engine.c | 19 ++++-
9 files changed, 131 insertions(+), 52 deletions(-)
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index d68f3fdd8a8e..1bd724d663d9 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -2892,7 +2892,7 @@ i915_gem_find_active_request(struct intel_engine_cs *engine)
*/
spin_lock_irqsave(&engine->timeline.lock, flags);
list_for_each_entry(request, &engine->timeline.requests, link) {
- if (__i915_request_completed(request, request->global_seqno))
+ if (i915_request_completed(request))
continue;
active = request;
diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c
index 79eb1957cf99..cdbdbcff28ec 100644
--- a/drivers/gpu/drm/i915/i915_request.c
+++ b/drivers/gpu/drm/i915/i915_request.c
@@ -199,6 +199,7 @@ static void __retire_engine_request(struct intel_engine_cs *engine,
spin_unlock(&engine->timeline.lock);
spin_lock(&rq->lock);
+ i915_request_mark_complete(rq);
if (!i915_request_signaled(rq))
dma_fence_signal_locked(&rq->fence);
if (test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT, &rq->fence.flags))
@@ -634,7 +635,7 @@ i915_request_alloc(struct intel_engine_cs *engine, struct i915_gem_context *ctx)
rq->ring = ce->ring;
rq->timeline = ce->ring->timeline;
GEM_BUG_ON(rq->timeline == &engine->timeline);
- rq->hwsp_seqno = &engine->status_page.addr[I915_GEM_HWS_INDEX];
+ rq->hwsp_seqno = rq->timeline->hwsp_seqno;
spin_lock_init(&rq->lock);
dma_fence_init(&rq->fence,
diff --git a/drivers/gpu/drm/i915/i915_request.h b/drivers/gpu/drm/i915/i915_request.h
index ade010fe6e26..96c586d6ff4d 100644
--- a/drivers/gpu/drm/i915/i915_request.h
+++ b/drivers/gpu/drm/i915/i915_request.h
@@ -289,6 +289,7 @@ long i915_request_wait(struct i915_request *rq,
static inline bool i915_request_signaled(const struct i915_request *rq)
{
+ /* The request may live longer than its HWSP, so check flags first! */
return test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &rq->fence.flags);
}
@@ -340,32 +341,23 @@ static inline u32 hwsp_seqno(const struct i915_request *rq)
*/
static inline bool i915_request_started(const struct i915_request *rq)
{
- u32 seqno;
-
- seqno = i915_request_global_seqno(rq);
- if (!seqno) /* not yet submitted to HW */
- return false;
+ if (i915_request_signaled(rq))
+ return true;
- return i915_seqno_passed(hwsp_seqno(rq), seqno - 1);
-}
-
-static inline bool
-__i915_request_completed(const struct i915_request *rq, u32 seqno)
-{
- GEM_BUG_ON(!seqno);
- return i915_seqno_passed(hwsp_seqno(rq), seqno) &&
- seqno == i915_request_global_seqno(rq);
+ return i915_seqno_passed(hwsp_seqno(rq), rq->fence.seqno - 1);
}
static inline bool i915_request_completed(const struct i915_request *rq)
{
- u32 seqno;
+ if (i915_request_signaled(rq))
+ return true;
- seqno = i915_request_global_seqno(rq);
- if (!seqno)
- return false;
+ return i915_seqno_passed(hwsp_seqno(rq), rq->fence.seqno);
+}
- return __i915_request_completed(rq, seqno);
+static inline void i915_request_mark_complete(struct i915_request *rq)
+{
+ rq->hwsp_seqno = (u32 *)&rq->fence.seqno; /* decouple from HWSP */
}
void i915_retire_requests(struct drm_i915_private *i915);
diff --git a/drivers/gpu/drm/i915/i915_reset.c b/drivers/gpu/drm/i915/i915_reset.c
index c57b76aebd8f..e88691da60c0 100644
--- a/drivers/gpu/drm/i915/i915_reset.c
+++ b/drivers/gpu/drm/i915/i915_reset.c
@@ -760,6 +760,7 @@ static void nop_submit_request(struct i915_request *request)
spin_lock_irqsave(&request->engine->timeline.lock, flags);
__i915_request_submit(request);
+ i915_request_mark_complete(request);
intel_engine_write_global_seqno(request->engine, request->global_seqno);
spin_unlock_irqrestore(&request->engine->timeline.lock, flags);
}
diff --git a/drivers/gpu/drm/i915/i915_vma.h b/drivers/gpu/drm/i915/i915_vma.h
index 46eb818ed309..b0f6b1d904a5 100644
--- a/drivers/gpu/drm/i915/i915_vma.h
+++ b/drivers/gpu/drm/i915/i915_vma.h
@@ -227,6 +227,12 @@ static inline u32 i915_ggtt_pin_bias(struct i915_vma *vma)
}
/* XXX inline spaghetti */
+static inline u32 i915_timeline_seqno_address(const struct i915_timeline *tl)
+{
+ GEM_BUG_ON(!tl->pin_count);
+ return i915_ggtt_offset(tl->hwsp_ggtt) + tl->hwsp_offset;
+}
+
static inline struct i915_timeline_hwsp *
i915_timeline_hwsp(const struct i915_timeline *tl)
{
diff --git a/drivers/gpu/drm/i915/intel_engine_cs.c b/drivers/gpu/drm/i915/intel_engine_cs.c
index 28a66242d698..8ca062815ff9 100644
--- a/drivers/gpu/drm/i915/intel_engine_cs.c
+++ b/drivers/gpu/drm/i915/intel_engine_cs.c
@@ -660,10 +660,16 @@ static int measure_breadcrumb_dw(struct intel_engine_cs *engine)
frame->rq.ring = &frame->ring;
frame->rq.timeline = &frame->timeline;
+ dw = i915_timeline_pin(&frame->timeline);
+ if (dw < 0)
+ goto out_timeline;
+
dw = engine->emit_breadcrumb(&frame->rq, frame->cs) - frame->cs;
- i915_timeline_fini(&frame->timeline);
+ i915_timeline_unpin(&frame->timeline);
+out_timeline:
+ i915_timeline_fini(&frame->timeline);
out_frame:
kfree(frame);
return dw;
@@ -1426,9 +1432,10 @@ static void intel_engine_print_registers(const struct intel_engine_cs *engine,
char hdr[80];
snprintf(hdr, sizeof(hdr),
- "\t\tELSP[%d] count=%d, ring->start=%08x, rq: ",
+ "\t\tELSP[%d] count=%d, ring: {start:%08x, hwsp:%08x}, rq: ",
idx, count,
- i915_ggtt_offset(rq->ring->vma));
+ i915_ggtt_offset(rq->ring->vma),
+ i915_timeline_seqno_address(rq->timeline));
print_request(m, rq, hdr);
} else {
drm_printf(m, "\t\tELSP[%d] idle\n", idx);
@@ -1538,6 +1545,8 @@ void intel_engine_dump(struct intel_engine_cs *engine,
rq->ring->emit);
drm_printf(m, "\t\tring->space: 0x%08x\n",
rq->ring->space);
+ drm_printf(m, "\t\tring->hwsp: 0x%08x\n",
+ i915_timeline_seqno_address(rq->timeline));
print_request_ring(m, rq);
}
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index f4c52b9bdfca..a75918d50788 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -938,10 +938,10 @@ static void execlists_cancel_requests(struct intel_engine_cs *engine)
list_for_each_entry(rq, &engine->timeline.requests, link) {
GEM_BUG_ON(!rq->global_seqno);
- if (i915_request_signaled(rq))
- continue;
+ if (!i915_request_signaled(rq))
+ dma_fence_set_error(&rq->fence, -EIO);
- dma_fence_set_error(&rq->fence, -EIO);
+ i915_request_mark_complete(rq);
}
/* Flush the queued requests to the timeline list (for retiring). */
@@ -951,9 +951,9 @@ static void execlists_cancel_requests(struct intel_engine_cs *engine)
priolist_for_each_request_consume(rq, rn, p, i) {
list_del_init(&rq->sched.link);
-
- dma_fence_set_error(&rq->fence, -EIO);
__i915_request_submit(rq);
+ dma_fence_set_error(&rq->fence, -EIO);
+ i915_request_mark_complete(rq);
}
rb_erase_cached(&p->node, &execlists->queue);
@@ -2152,10 +2152,17 @@ static u32 *gen8_emit_breadcrumb(struct i915_request *request, u32 *cs)
/* w/a: bit 5 needs to be zero for MI_FLUSH_DW address. */
BUILD_BUG_ON(I915_GEM_HWS_INDEX_ADDR & (1 << 5));
- cs = gen8_emit_ggtt_write(cs, request->global_seqno,
+ cs = gen8_emit_ggtt_write(cs,
+ request->fence.seqno,
+ i915_timeline_seqno_address(request->timeline));
+
+ cs = gen8_emit_ggtt_write(cs,
+ request->global_seqno,
intel_hws_seqno_address(request->engine));
+
*cs++ = MI_USER_INTERRUPT;
*cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
+
request->tail = intel_ring_offset(request, cs);
assert_ring_tail_valid(request->ring, request->tail);
@@ -2164,18 +2171,20 @@ static u32 *gen8_emit_breadcrumb(struct i915_request *request, u32 *cs)
static u32 *gen8_emit_breadcrumb_rcs(struct i915_request *request, u32 *cs)
{
- /* We're using qword write, seqno should be aligned to 8 bytes. */
- BUILD_BUG_ON(I915_GEM_HWS_INDEX & 1);
-
cs = gen8_emit_ggtt_write_rcs(cs,
- request->global_seqno,
- intel_hws_seqno_address(request->engine),
+ request->fence.seqno,
+ i915_timeline_seqno_address(request->timeline),
PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
PIPE_CONTROL_DEPTH_CACHE_FLUSH |
PIPE_CONTROL_DC_FLUSH_ENABLE |
PIPE_CONTROL_FLUSH_ENABLE |
PIPE_CONTROL_CS_STALL);
+ cs = gen8_emit_ggtt_write_rcs(cs,
+ request->global_seqno,
+ intel_hws_seqno_address(request->engine),
+ PIPE_CONTROL_CS_STALL);
+
*cs++ = MI_USER_INTERRUPT;
*cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
index 174795622eb1..71a51a129c98 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.c
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
@@ -326,6 +326,12 @@ static u32 *gen6_rcs_emit_breadcrumb(struct i915_request *rq, u32 *cs)
PIPE_CONTROL_DC_FLUSH_ENABLE |
PIPE_CONTROL_QW_WRITE |
PIPE_CONTROL_CS_STALL);
+ *cs++ = i915_timeline_seqno_address(rq->timeline) |
+ PIPE_CONTROL_GLOBAL_GTT;
+ *cs++ = rq->fence.seqno;
+
+ *cs++ = GFX_OP_PIPE_CONTROL(4);
+ *cs++ = PIPE_CONTROL_QW_WRITE | PIPE_CONTROL_CS_STALL;
*cs++ = intel_hws_seqno_address(rq->engine) | PIPE_CONTROL_GLOBAL_GTT;
*cs++ = rq->global_seqno;
@@ -427,6 +433,13 @@ static u32 *gen7_rcs_emit_breadcrumb(struct i915_request *rq, u32 *cs)
PIPE_CONTROL_QW_WRITE |
PIPE_CONTROL_GLOBAL_GTT_IVB |
PIPE_CONTROL_CS_STALL);
+ *cs++ = i915_timeline_seqno_address(rq->timeline);
+ *cs++ = rq->fence.seqno;
+
+ *cs++ = GFX_OP_PIPE_CONTROL(4);
+ *cs++ = (PIPE_CONTROL_QW_WRITE |
+ PIPE_CONTROL_GLOBAL_GTT_IVB |
+ PIPE_CONTROL_CS_STALL);
*cs++ = intel_hws_seqno_address(rq->engine);
*cs++ = rq->global_seqno;
@@ -441,10 +454,16 @@ static u32 *gen7_rcs_emit_breadcrumb(struct i915_request *rq, u32 *cs)
static u32 *gen6_xcs_emit_breadcrumb(struct i915_request *rq, u32 *cs)
{
- *cs++ = MI_FLUSH_DW | MI_FLUSH_DW_OP_STOREDW;
- *cs++ = intel_hws_seqno_address(rq->engine) | MI_FLUSH_DW_USE_GTT;
+ *cs++ = MI_FLUSH_DW | MI_FLUSH_DW_OP_STOREDW | MI_FLUSH_DW_STORE_INDEX;
+ *cs++ = I915_GEM_HWS_SEQNO_ADDR | MI_FLUSH_DW_USE_GTT;
+ *cs++ = rq->fence.seqno;
+
+ *cs++ = MI_FLUSH_DW | MI_FLUSH_DW_OP_STOREDW | MI_FLUSH_DW_STORE_INDEX;
+ *cs++ = I915_GEM_HWS_INDEX_ADDR | MI_FLUSH_DW_USE_GTT;
*cs++ = rq->global_seqno;
+
*cs++ = MI_USER_INTERRUPT;
+ *cs++ = MI_NOOP;
rq->tail = intel_ring_offset(rq, cs);
assert_ring_tail_valid(rq->ring, rq->tail);
@@ -457,8 +476,12 @@ static u32 *gen7_xcs_emit_breadcrumb(struct i915_request *rq, u32 *cs)
{
int i;
- *cs++ = MI_FLUSH_DW | MI_FLUSH_DW_OP_STOREDW;
- *cs++ = intel_hws_seqno_address(rq->engine) | MI_FLUSH_DW_USE_GTT;
+ *cs++ = MI_FLUSH_DW | MI_FLUSH_DW_OP_STOREDW | MI_FLUSH_DW_STORE_INDEX;
+ *cs++ = I915_GEM_HWS_SEQNO_ADDR | MI_FLUSH_DW_USE_GTT;
+ *cs++ = rq->fence.seqno;
+
+ *cs++ = MI_FLUSH_DW | MI_FLUSH_DW_OP_STOREDW | MI_FLUSH_DW_STORE_INDEX;
+ *cs++ = I915_GEM_HWS_INDEX_ADDR | MI_FLUSH_DW_USE_GTT;
*cs++ = rq->global_seqno;
for (i = 0; i < GEN7_XCS_WA; i++) {
@@ -472,7 +495,6 @@ static u32 *gen7_xcs_emit_breadcrumb(struct i915_request *rq, u32 *cs)
*cs++ = 0;
*cs++ = MI_USER_INTERRUPT;
- *cs++ = MI_NOOP;
rq->tail = intel_ring_offset(rq, cs);
assert_ring_tail_valid(rq->ring, rq->tail);
@@ -738,7 +760,7 @@ static void reset_ring(struct intel_engine_cs *engine, bool stalled)
rq = NULL;
spin_lock_irqsave(&tl->lock, flags);
list_for_each_entry(pos, &tl->requests, link) {
- if (!__i915_request_completed(pos, pos->global_seqno)) {
+ if (!i915_request_completed(pos)) {
rq = pos;
break;
}
@@ -880,10 +902,10 @@ static void cancel_requests(struct intel_engine_cs *engine)
list_for_each_entry(request, &engine->timeline.requests, link) {
GEM_BUG_ON(!request->global_seqno);
- if (i915_request_signaled(request))
- continue;
+ if (!i915_request_signaled(request))
+ dma_fence_set_error(&request->fence, -EIO);
- dma_fence_set_error(&request->fence, -EIO);
+ i915_request_mark_complete(request);
}
intel_write_status_page(engine,
@@ -907,14 +929,19 @@ static void i9xx_submit_request(struct i915_request *request)
static u32 *i9xx_emit_breadcrumb(struct i915_request *rq, u32 *cs)
{
+ GEM_BUG_ON(rq->timeline->hwsp_ggtt != rq->engine->status_page.vma);
+
*cs++ = MI_FLUSH;
+ *cs++ = MI_STORE_DWORD_INDEX;
+ *cs++ = I915_GEM_HWS_SEQNO_ADDR;
+ *cs++ = rq->fence.seqno;
+
*cs++ = MI_STORE_DWORD_INDEX;
*cs++ = I915_GEM_HWS_INDEX_ADDR;
*cs++ = rq->global_seqno;
*cs++ = MI_USER_INTERRUPT;
- *cs++ = MI_NOOP;
rq->tail = intel_ring_offset(rq, cs);
assert_ring_tail_valid(rq->ring, rq->tail);
@@ -927,8 +954,14 @@ static u32 *gen5_emit_breadcrumb(struct i915_request *rq, u32 *cs)
{
int i;
+ GEM_BUG_ON(rq->timeline->hwsp_ggtt != rq->engine->status_page.vma);
+
*cs++ = MI_FLUSH;
+ *cs++ = MI_STORE_DWORD_INDEX;
+ *cs++ = I915_GEM_HWS_SEQNO_ADDR;
+ *cs++ = rq->fence.seqno;
+
BUILD_BUG_ON(GEN5_WA_STORES < 1);
for (i = 0; i < GEN5_WA_STORES; i++) {
*cs++ = MI_STORE_DWORD_INDEX;
@@ -937,6 +970,7 @@ static u32 *gen5_emit_breadcrumb(struct i915_request *rq, u32 *cs)
}
*cs++ = MI_USER_INTERRUPT;
+ *cs++ = MI_NOOP;
rq->tail = intel_ring_offset(rq, cs);
assert_ring_tail_valid(rq->ring, rq->tail);
@@ -1169,6 +1203,10 @@ int intel_ring_pin(struct intel_ring *ring)
GEM_BUG_ON(ring->vaddr);
+ ret = i915_timeline_pin(ring->timeline);
+ if (ret)
+ return ret;
+
flags = PIN_GLOBAL;
/* Ring wraparound at offset 0 sometimes hangs. No idea why. */
@@ -1185,28 +1223,32 @@ int intel_ring_pin(struct intel_ring *ring)
else
ret = i915_gem_object_set_to_cpu_domain(vma->obj, true);
if (unlikely(ret))
- return ret;
+ goto unpin_timeline;
}
ret = i915_vma_pin(vma, 0, 0, flags);
if (unlikely(ret))
- return ret;
+ goto unpin_timeline;
if (i915_vma_is_map_and_fenceable(vma))
addr = (void __force *)i915_vma_pin_iomap(vma);
else
addr = i915_gem_object_pin_map(vma->obj, map);
- if (IS_ERR(addr))
- goto err;
+ if (IS_ERR(addr)) {
+ ret = PTR_ERR(addr);
+ goto unpin_ring;
+ }
vma->obj->pin_global++;
ring->vaddr = addr;
return 0;
-err:
+unpin_ring:
i915_vma_unpin(vma);
- return PTR_ERR(addr);
+unpin_timeline:
+ i915_timeline_unpin(ring->timeline);
+ return ret;
}
void intel_ring_reset(struct intel_ring *ring, u32 tail)
@@ -1235,6 +1277,8 @@ void intel_ring_unpin(struct intel_ring *ring)
ring->vma->obj->pin_global--;
i915_vma_unpin(ring->vma);
+
+ i915_timeline_unpin(ring->timeline);
}
static struct i915_vma *
diff --git a/drivers/gpu/drm/i915/selftests/mock_engine.c b/drivers/gpu/drm/i915/selftests/mock_engine.c
index 919c89fd6ee5..95e890d7f58b 100644
--- a/drivers/gpu/drm/i915/selftests/mock_engine.c
+++ b/drivers/gpu/drm/i915/selftests/mock_engine.c
@@ -30,6 +30,17 @@ struct mock_ring {
struct i915_timeline timeline;
};
+static void mock_timeline_pin(struct i915_timeline *tl)
+{
+ tl->pin_count++;
+}
+
+static void mock_timeline_unpin(struct i915_timeline *tl)
+{
+ GEM_BUG_ON(!tl->pin_count);
+ tl->pin_count--;
+}
+
static struct intel_ring *mock_ring(struct intel_engine_cs *engine)
{
const unsigned long sz = PAGE_SIZE / 2;
@@ -76,6 +87,8 @@ static void advance(struct mock_request *request)
{
list_del_init(&request->link);
mock_seqno_advance(request->base.engine, request->base.global_seqno);
+ i915_request_mark_complete(&request->base);
+ GEM_BUG_ON(!i915_request_completed(&request->base));
}
static void hw_delay_complete(struct timer_list *t)
@@ -108,6 +121,7 @@ static void hw_delay_complete(struct timer_list *t)
static void mock_context_unpin(struct intel_context *ce)
{
+ mock_timeline_unpin(ce->ring->timeline);
i915_gem_context_put(ce->gem_context);
}
@@ -129,6 +143,7 @@ mock_context_pin(struct intel_engine_cs *engine,
struct i915_gem_context *ctx)
{
struct intel_context *ce = to_intel_context(ctx, engine);
+ int err = -ENOMEM;
if (ce->pin_count++)
return ce;
@@ -139,13 +154,15 @@ mock_context_pin(struct intel_engine_cs *engine,
goto err;
}
+ mock_timeline_pin(ce->ring->timeline);
+
ce->ops = &mock_context_ops;
i915_gem_context_get(ctx);
return ce;
err:
ce->pin_count = 0;
- return ERR_PTR(-ENOMEM);
+ return ERR_PTR(err);
}
static int mock_request_alloc(struct i915_request *request)
--
2.20.1
More information about the Intel-gfx
mailing list