[Intel-gfx] [PATCH 14/18] drm/i915/guc: Prepare for nonblocking execbuf submission

Dave Gordon david.s.gordon at intel.com
Fri Sep 2 18:20:52 UTC 2016


On 30/08/16 09:18, Chris Wilson wrote:
> Currently the presumption is that the request construction and its
> submission to the GuC are all under the same holding of struct_mutex. We
> wish to relax this to separate the request construction and the later
> submission to the GuC. This requires us to reserve some space in the
> GuC command queue for the future submission. For flexibility to handle
> out-of-order request submission we do not preallocate the next slot in
> the GuC command queue during request construction, just ensuring that
> there is enough space later.
>
> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
> ---
>  drivers/gpu/drm/i915/i915_guc_submission.c | 55 ++++++++++++++----------------
>  drivers/gpu/drm/i915/intel_guc.h           |  3 ++
>  2 files changed, 29 insertions(+), 29 deletions(-)

Hmm ... the functional changes look mostly OK, apart from some locking 
issues (see below), but there seems to be a great deal of unnecessary 
churn, such as combining statements that had been kept separate for 
clarity :(

> diff --git a/drivers/gpu/drm/i915/i915_guc_submission.c b/drivers/gpu/drm/i915/i915_guc_submission.c
> index 2332f9c98bdd..a047e61adc2a 100644
> --- a/drivers/gpu/drm/i915/i915_guc_submission.c
> +++ b/drivers/gpu/drm/i915/i915_guc_submission.c
> @@ -434,20 +434,23 @@ int i915_guc_wq_check_space(struct drm_i915_gem_request *request)
>  {
>  	const size_t wqi_size = sizeof(struct guc_wq_item);
>  	struct i915_guc_client *gc = request->i915->guc.execbuf_client;
> -	struct guc_process_desc *desc;
> +	struct guc_process_desc *desc = gc->client_base + gc->proc_desc_offset;
>  	u32 freespace;
> +	int ret;
>
> -	GEM_BUG_ON(gc == NULL);
> -
> -	desc = gc->client_base + gc->proc_desc_offset;
> -
> +	spin_lock(&gc->lock);
>  	freespace = CIRC_SPACE(gc->wq_tail, desc->head, gc->wq_size);
> -	if (likely(freespace >= wqi_size))
> -		return 0;
> -
> -	gc->no_wq_space += 1;
> +	freespace -= gc->wq_rsvd;
> +	if (likely(freespace >= wqi_size)) {
> +		gc->wq_rsvd += wqi_size;
> +		ret = 0;
> +	} else {
> +		gc->no_wq_space++;
> +		ret = -EAGAIN;
> +	}
> +	spin_unlock(&gc->lock);
>
> -	return -EAGAIN;
> +	return ret;
>  }
>
>  static void guc_add_workqueue_item(struct i915_guc_client *gc,
> @@ -457,22 +460,9 @@ static void guc_add_workqueue_item(struct i915_guc_client *gc,
>  	const size_t wqi_size = sizeof(struct guc_wq_item);
>  	const u32 wqi_len = wqi_size/sizeof(u32) - 1;
>  	struct intel_engine_cs *engine = rq->engine;
> -	struct guc_process_desc *desc;
>  	struct guc_wq_item *wqi;
>  	void *base;
> -	u32 freespace, tail, wq_off, wq_page;
> -
> -	desc = gc->client_base + gc->proc_desc_offset;
> -
> -	/* Free space is guaranteed, see i915_guc_wq_check_space() above */

This comment seems to have been lost. It still applies (mutatis 
mutandis), so it should be relocated to some part of the new version ...

> -	freespace = CIRC_SPACE(gc->wq_tail, desc->head, gc->wq_size);
> -	GEM_BUG_ON(freespace < wqi_size);
> -
> -	/* The GuC firmware wants the tail index in QWords, not bytes */
> -	tail = rq->tail;
> -	GEM_BUG_ON(tail & 7);
> -	tail >>= 3;
> -	GEM_BUG_ON(tail > WQ_RING_TAIL_MAX);

This *commented* sequence of statements seems clearer than the 
replacement below ...

> +	u32 wq_off, wq_page;
>
>  	/* For now workqueue item is 4 DWs; workqueue buffer is 2 pages. So we
>  	 * should not have the case where structure wqi is across page, neither
> @@ -482,18 +472,19 @@ static void guc_add_workqueue_item(struct i915_guc_client *gc,
>  	 * workqueue buffer dw by dw.
>  	 */
>  	BUILD_BUG_ON(wqi_size != 16);

This would be a good place to note that:

/* Reserved space is guaranteed, see i915_guc_wq_check_space() above */

> +	GEM_BUG_ON(gc->wq_rsvd < wqi_size);
>
>  	/* postincrement WQ tail for next time */
>  	wq_off = gc->wq_tail;
> +	GEM_BUG_ON(wq_off & (wqi_size - 1));
>  	gc->wq_tail += wqi_size;
>  	gc->wq_tail &= gc->wq_size - 1;
> -	GEM_BUG_ON(wq_off & (wqi_size - 1));
> +	gc->wq_rsvd -= wqi_size;
>
>  	/* WQ starts from the page after doorbell / process_desc */
>  	wq_page = (wq_off + GUC_DB_SIZE) >> PAGE_SHIFT;
> -	wq_off &= PAGE_SIZE - 1;
>  	base = kmap_atomic(i915_gem_object_get_page(gc->vma->obj, wq_page));
> -	wqi = (struct guc_wq_item *)((char *)base + wq_off);
> +	wqi = (struct guc_wq_item *)((char *)base + offset_in_page(wq_off));
>
>  	/* Now fill in the 4-word work queue item */
>  	wqi->header = WQ_TYPE_INORDER |
> @@ -504,7 +495,7 @@ static void guc_add_workqueue_item(struct i915_guc_client *gc,
>  	/* The GuC wants only the low-order word of the context descriptor */
>  	wqi->context_desc = (u32)intel_lr_context_descriptor(rq->ctx, engine);
>
> -	wqi->ring_tail = tail << WQ_RING_TAIL_SHIFT;
> +	wqi->ring_tail = rq->tail >> 3 << WQ_RING_TAIL_SHIFT;

This line is particularly ugly; I think it's the >> chevron << effect. 
Parenthesisation would help, but it would be nicer as separate lines. 
Also, there's no explanation of the "3" (the bytes-to-QWords shift) 
here, unlike the commented original version above.
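
E.g. something along these lines (just a sketch, untested, reinstating 
the local and the comment from the original):

	/* The GuC firmware wants the tail index in QWords, not bytes */
	u32 tail = rq->tail >> 3;
	GEM_BUG_ON(rq->tail & 7);
	GEM_BUG_ON(tail > WQ_RING_TAIL_MAX);

	wqi->ring_tail = tail << WQ_RING_TAIL_SHIFT;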

>  	wqi->fence_id = rq->fence.seqno;
>
>  	kunmap_atomic(base);
> @@ -591,8 +582,10 @@ static void i915_guc_submit(struct drm_i915_gem_request *rq)
>  	struct i915_guc_client *client = guc->execbuf_client;
>  	int b_ret;
>
> +	spin_lock(&client->lock);
>  	guc_add_workqueue_item(client, rq);
>  	b_ret = guc_ring_doorbell(client);
> +	spin_unlock(&client->lock);
>
>  	client->submissions[engine_id] += 1;

Outside the spinlock? Do we still hold the BKL (struct_mutex) during 
submit(), just not during i915_guc_wq_check_space()? If so, then 
guc_ring_doorbell() doesn't strictly need to be inside the spinlock (or 
the lock could be taken inside guc_add_workqueue_item()); but if not, 
then the update of submissions[] should be inside as well.
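
In the latter case, the obvious (if heavy-handed) fix would be to 
extend the critical section, something like this (sketch only, 
untested):

	spin_lock(&client->lock);
	guc_add_workqueue_item(client, rq);
	b_ret = guc_ring_doorbell(client);
	client->submissions[engine_id] += 1;	/* now under the lock */
	spin_unlock(&client->lock);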

>  	client->retcode = b_ret;
> @@ -770,6 +763,8 @@ guc_client_alloc(struct drm_i915_private *dev_priv,
>  	if (!client)
>  		return NULL;
>
> +	spin_lock_init(&client->lock);
> +
>  	client->owner = ctx;
>  	client->guc = guc;
>  	client->engines = engines;
> @@ -1019,9 +1014,11 @@ int i915_guc_submission_enable(struct drm_i915_private *dev_priv)
>  		engine->submit_request = i915_guc_submit;
>
>  		/* Replay the current set of previously submitted requests */
> -		list_for_each_entry(request, &engine->request_list, link)
> +		list_for_each_entry(request, &engine->request_list, link) {
> +			client->wq_rsvd += sizeof(struct guc_wq_item);

Presumably this is being called in some context that ensures we don't 
need to hold the spinlock while updating wq_rsvd? If so, maybe that 
should be mentioned in a comment ...
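
Otherwise, the safe option would just be to take the lock here as well, 
e.g. (untested):

	spin_lock(&client->lock);
	client->wq_rsvd += sizeof(struct guc_wq_item);
	spin_unlock(&client->lock);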

>  			if (i915_sw_fence_done(&request->submit))
>  				i915_guc_submit(request);
> +		}
>  	}
>
>  	return 0;
> diff --git a/drivers/gpu/drm/i915/intel_guc.h b/drivers/gpu/drm/i915/intel_guc.h
> index c97326269588..27a622824b54 100644
> --- a/drivers/gpu/drm/i915/intel_guc.h
> +++ b/drivers/gpu/drm/i915/intel_guc.h
> @@ -68,6 +68,8 @@ struct i915_guc_client {
>  	struct i915_gem_context *owner;
>  	struct intel_guc *guc;
>
> +	spinlock_t lock;

It would be helpful if this lock were annotated with a list of things it 
protects. AFAICT it might be:
	wq_tail
	wq_rsvd
	no_wq_space
	cookie

b_fail and the submissions[] statistics seem to have been left 
unprotected :( though of course they're not really critical
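
For instance, something like this (just a suggestion) would document 
the coverage at the point of declaration:

	/* protects wq_tail, wq_rsvd, no_wq_space and the doorbell cookie */
	spinlock_t lock;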

.Dave.

> +
>  	uint32_t engines;		/* bitmap of (host) engine ids	*/
>  	uint32_t priority;
>  	uint32_t ctx_index;
> @@ -81,6 +83,7 @@ struct i915_guc_client {
>  	uint32_t wq_offset;
>  	uint32_t wq_size;
>  	uint32_t wq_tail;
> +	uint32_t wq_rsvd;
>  	uint32_t no_wq_space;
>  	uint32_t b_fail;
>  	int retcode;


