[Intel-gfx] [CI 7/9] drm/i915/guc: Submit GuC workitems containing coalesced requests
Kamble, Sagar A
sagar.a.kamble at intel.com
Mon Sep 18 15:05:27 UTC 2017
Minor change needed: with the removal of i915_guc_wq_reserve() and
i915_guc_wq_unreserve(), the forward declaration of
"struct drm_i915_gem_request" can also be removed from intel_uc.h.
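i.e. something like this on top (an untested sketch, assuming nothing
else in intel_uc.h still references the type):

--- a/drivers/gpu/drm/i915/intel_uc.h
+++ b/drivers/gpu/drm/i915/intel_uc.h
@@
-struct drm_i915_gem_request;
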
On 9/18/2017 3:34 PM, Chris Wilson wrote:
> From: Michał Winiarski <michal.winiarski at intel.com>
>
> To create an upper bound on the number of GuC workitems, we need to change
> the way that requests are submitted. Rather than submitting each request
> as an individual workitem, we can coalesce them in a similar way to how
> we're handling execlist submission ports. We also need to stop pretending
> that we're doing "lite-restore" in GuC submission (we would create a
> workitem each time we hit this condition). This allows us to completely
> remove the reservation, replacing it with a compile time check.
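
For anyone skimming, the shape of the new submission path is roughly
the following (my paraphrase of the i915_guc_submit() hunk further
down, not the literal code):

	for (n = 0; n < ARRAY_SIZE(engine->execlist_port); n++) {
		rq = port_unpack(&port[n], &count);
		if (rq && count == 0) {	/* not yet handed to the GuC */
			port_set(&port[n], port_pack(rq, ++count));
			guc_wq_item_append(client, rq);	/* one WQ item per port */
			WARN_ON(guc_ring_doorbell(client));
		}
	}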
>
> v2: Also coalesce when replaying on reset (Daniele)
> v3: Consistent wq_resv - per-request (Daniele)
> v4: Squash removing wq_resv
> v5: Reflect i915_guc_submit argument changes in doc
> v6: Rebase on top of execlists reset/restart fix (Chris)
>
> References: https://bugs.freedesktop.org/show_bug.cgi?id=101873
> Cc: Chris Wilson <chris at chris-wilson.co.uk>
> Cc: Daniele Ceraolo Spurio <daniele.ceraolospurio at intel.com>
> Cc: Jeff McGee <jeff.mcgee at intel.com>
> Cc: Michal Wajdeczko <michal.wajdeczko at intel.com>
> Cc: Oscar Mateo <oscar.mateo at intel.com>
> Signed-off-by: Michał Winiarski <michal.winiarski at intel.com>
> Reviewed-by: Chris Wilson <chris at chris-wilson.co.uk>
> Link: https://patchwork.freedesktop.org/patch/msgid/20170914083216.10192-2-michal.winiarski@intel.com
> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
> ---
> drivers/gpu/drm/i915/i915_debugfs.c | 2 -
> drivers/gpu/drm/i915/i915_guc_submission.c | 181 ++++++++++-------------------
> drivers/gpu/drm/i915/intel_lrc.c | 25 +---
> drivers/gpu/drm/i915/intel_uc.h | 11 --
> 4 files changed, 63 insertions(+), 156 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
> index 62133dd303ac..0364f0d2d76e 100644
> --- a/drivers/gpu/drm/i915/i915_debugfs.c
> +++ b/drivers/gpu/drm/i915/i915_debugfs.c
> @@ -2451,8 +2451,6 @@ static void i915_guc_client_info(struct seq_file *m,
> seq_printf(m, "\tWQ size %d, offset: 0x%x, tail %d\n",
> client->wq_size, client->wq_offset, client->wq_tail);
>
> - seq_printf(m, "\tWork queue full: %u\n", client->no_wq_space);
> -
> for_each_engine(engine, dev_priv, id) {
> u64 submissions = client->submissions[id];
> tot += submissions;
> diff --git a/drivers/gpu/drm/i915/i915_guc_submission.c b/drivers/gpu/drm/i915/i915_guc_submission.c
> index c180ff1423fd..16b31f70114e 100644
> --- a/drivers/gpu/drm/i915/i915_guc_submission.c
> +++ b/drivers/gpu/drm/i915/i915_guc_submission.c
> @@ -406,63 +406,6 @@ static void guc_stage_desc_fini(struct intel_guc *guc,
> memset(desc, 0, sizeof(*desc));
> }
>
> -/**
> - * i915_guc_wq_reserve() - reserve space in the GuC's workqueue
> - * @request: request associated with the commands
> - *
> - * Return: 0 if space is available
> - * -EAGAIN if space is not currently available
> - *
> - * This function must be called (and must return 0) before a request
> - * is submitted to the GuC via i915_guc_submit() below. Once a result
> - * of 0 has been returned, it must be balanced by a corresponding
> - * call to submit().
> - *
> - * Reservation allows the caller to determine in advance that space
> - * will be available for the next submission before committing resources
> - * to it, and helps avoid late failures with complicated recovery paths.
> - */
> -int i915_guc_wq_reserve(struct drm_i915_gem_request *request)
> -{
> - const size_t wqi_size = sizeof(struct guc_wq_item);
> - struct i915_guc_client *client = request->i915->guc.execbuf_client;
> - struct guc_process_desc *desc = __get_process_desc(client);
> - u32 freespace;
> - int ret;
> -
> - spin_lock_irq(&client->wq_lock);
> - freespace = CIRC_SPACE(client->wq_tail, desc->head, client->wq_size);
> - freespace -= client->wq_rsvd;
> - if (likely(freespace >= wqi_size)) {
> - client->wq_rsvd += wqi_size;
> - ret = 0;
> - } else {
> - client->no_wq_space++;
> - ret = -EAGAIN;
> - }
> - spin_unlock_irq(&client->wq_lock);
> -
> - return ret;
> -}
> -
> -static void guc_client_update_wq_rsvd(struct i915_guc_client *client, int size)
> -{
> - unsigned long flags;
> -
> - spin_lock_irqsave(&client->wq_lock, flags);
> - client->wq_rsvd += size;
> - spin_unlock_irqrestore(&client->wq_lock, flags);
> -}
> -
> -void i915_guc_wq_unreserve(struct drm_i915_gem_request *request)
> -{
> - const int wqi_size = sizeof(struct guc_wq_item);
> - struct i915_guc_client *client = request->i915->guc.execbuf_client;
> -
> - GEM_BUG_ON(READ_ONCE(client->wq_rsvd) < wqi_size);
> - guc_client_update_wq_rsvd(client, -wqi_size);
> -}
> -
> /* Construct a Work Item and append it to the GuC's Work Queue */
> static void guc_wq_item_append(struct i915_guc_client *client,
> struct drm_i915_gem_request *rq)
> @@ -476,7 +419,7 @@ static void guc_wq_item_append(struct i915_guc_client *client,
> struct guc_wq_item *wqi;
> u32 freespace, tail, wq_off;
>
> - /* Free space is guaranteed, see i915_guc_wq_reserve() above */
> + /* Free space is guaranteed */
> freespace = CIRC_SPACE(client->wq_tail, desc->head, client->wq_size);
> GEM_BUG_ON(freespace < wqi_size);
>
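
With the reservation gone, the "free space is guaranteed" claim now
rests on the compile-time bound added in i915_guc_submission_enable()
below: at most one work item can be outstanding per execlist port.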
> @@ -491,14 +434,12 @@ static void guc_wq_item_append(struct i915_guc_client *client,
> * workqueue buffer dw by dw.
> */
> BUILD_BUG_ON(wqi_size != 16);
> - GEM_BUG_ON(client->wq_rsvd < wqi_size);
>
> /* postincrement WQ tail for next time */
> wq_off = client->wq_tail;
> GEM_BUG_ON(wq_off & (wqi_size - 1));
> client->wq_tail += wqi_size;
> client->wq_tail &= client->wq_size - 1;
> - client->wq_rsvd -= wqi_size;
>
> /* WQ starts from the page after doorbell / process_desc */
> wqi = client->vaddr + wq_off + GUC_DB_SIZE;
> @@ -579,47 +520,43 @@ static int guc_ring_doorbell(struct i915_guc_client *client)
> }
>
> /**
> - * __i915_guc_submit() - Submit commands through GuC
> - * @rq: request associated with the commands
> - *
> - * The caller must have already called i915_guc_wq_reserve() above with
> - * a result of 0 (success), guaranteeing that there is space in the work
> - * queue for the new request, so enqueuing the item cannot fail.
> - *
> - * Bad Things Will Happen if the caller violates this protocol e.g. calls
> - * submit() when _reserve() says there's no space, or calls _submit()
> - * a different number of times from (successful) calls to _reserve().
> + * i915_guc_submit() - Submit commands through GuC
> + * @engine: engine associated with the commands
> *
> * The only error here arises if the doorbell hardware isn't functioning
> * as expected, which really shouldn't happen.
> */
> -static void __i915_guc_submit(struct drm_i915_gem_request *rq)
> +static void i915_guc_submit(struct intel_engine_cs *engine)
> {
> - struct drm_i915_private *dev_priv = rq->i915;
> - struct intel_engine_cs *engine = rq->engine;
> - unsigned int engine_id = engine->id;
> - struct intel_guc *guc = &rq->i915->guc;
> + struct drm_i915_private *dev_priv = engine->i915;
> + struct intel_guc *guc = &dev_priv->guc;
> struct i915_guc_client *client = guc->execbuf_client;
> + struct execlist_port *port = engine->execlist_port;
> + unsigned int engine_id = engine->id;
> + unsigned int n;
> unsigned long flags;
>
> - /* WA to flush out the pending GMADR writes to ring buffer. */
> - if (i915_vma_is_map_and_fenceable(rq->ring->vma))
> - POSTING_READ_FW(GUC_STATUS);
> + for (n = 0; n < ARRAY_SIZE(engine->execlist_port); n++) {
> + struct drm_i915_gem_request *rq;
> + unsigned int count;
>
> - spin_lock_irqsave(&client->wq_lock, flags);
> + rq = port_unpack(&port[n], &count);
> + if (rq && count == 0) {
> + port_set(&port[n], port_pack(rq, ++count));
>
> - guc_wq_item_append(client, rq);
> - WARN_ON(guc_ring_doorbell(client));
> + if (i915_vma_is_map_and_fenceable(rq->ring->vma))
> + POSTING_READ_FW(GUC_STATUS);
>
> - client->submissions[engine_id] += 1;
> + spin_lock_irqsave(&client->wq_lock, flags);
>
> - spin_unlock_irqrestore(&client->wq_lock, flags);
> -}
> + guc_wq_item_append(client, rq);
> + WARN_ON(guc_ring_doorbell(client));
>
> -static void i915_guc_submit(struct drm_i915_gem_request *rq)
> -{
> - __i915_gem_request_submit(rq);
> - __i915_guc_submit(rq);
> + client->submissions[engine_id] += 1;
> +
> + spin_unlock_irqrestore(&client->wq_lock, flags);
> + }
> + }
> }
>
> static void nested_enable_signaling(struct drm_i915_gem_request *rq)
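
Side note: port_pack()/port_unpack() stash a small submission count in
the (otherwise unused) low bits of the request pointer, which is how
"count == 0" above tells a freshly assigned port apart from one whose
work item is already in the GuC WQ. Schematically (not the literal
i915 helpers, and assuming two count bits for illustration):

	packed = (unsigned long)rq | count;	/* count in low ptr bits */
	count = packed & 0x3;			/* recovered on unpack */
	rq = (struct drm_i915_gem_request *)(packed & ~0x3ul);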
> @@ -653,16 +590,19 @@ static void port_assign(struct execlist_port *port,
> if (port_isset(port))
> i915_gem_request_put(port_request(port));
>
> - port_set(port, i915_gem_request_get(rq));
> + port_set(port, port_pack(i915_gem_request_get(rq), port_count(port)));
> nested_enable_signaling(rq);
> }
>
> -static bool i915_guc_dequeue(struct intel_engine_cs *engine)
> +static void i915_guc_dequeue(struct intel_engine_cs *engine)
> {
> struct execlist_port *port = engine->execlist_port;
> - struct drm_i915_gem_request *last = port_request(port);
> - struct rb_node *rb;
> + struct drm_i915_gem_request *last = NULL;
> bool submit = false;
> + struct rb_node *rb;
> +
> + if (port_isset(port))
> + port++;
>
> spin_lock_irq(&engine->timeline->lock);
> rb = engine->execlist_first;
> @@ -687,7 +627,7 @@ static bool i915_guc_dequeue(struct intel_engine_cs *engine)
> INIT_LIST_HEAD(&rq->priotree.link);
> rq->priotree.priority = INT_MAX;
>
> - i915_guc_submit(rq);
> + __i915_gem_request_submit(rq);
> trace_i915_gem_request_in(rq, port_index(port, engine));
> last = rq;
> submit = true;
> @@ -701,11 +641,11 @@ static bool i915_guc_dequeue(struct intel_engine_cs *engine)
> }
> done:
> engine->execlist_first = rb;
> - if (submit)
> + if (submit) {
> port_assign(port, last);
> + i915_guc_submit(engine);
> + }
> spin_unlock_irq(&engine->timeline->lock);
> -
> - return submit;
> }
>
> static void i915_guc_irq_handler(unsigned long data)
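
Note the "if (port_isset(port)) port++;" at the top of the dequeue
above: this is what retires the lite-restore pretence. If the first
port is still busy we only ever fill the second one, instead of
re-emitting a work item for a request that is already in flight.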
> @@ -713,24 +653,20 @@ static void i915_guc_irq_handler(unsigned long data)
> struct intel_engine_cs *engine = (struct intel_engine_cs *)data;
> struct execlist_port *port = engine->execlist_port;
> struct drm_i915_gem_request *rq;
> - bool submit;
>
> - do {
> - rq = port_request(&port[0]);
> - while (rq && i915_gem_request_completed(rq)) {
> - trace_i915_gem_request_out(rq);
> - i915_gem_request_put(rq);
> + rq = port_request(&port[0]);
> + while (rq && i915_gem_request_completed(rq)) {
> + trace_i915_gem_request_out(rq);
> + i915_gem_request_put(rq);
>
> - port[0] = port[1];
> - memset(&port[1], 0, sizeof(port[1]));
> + port[0] = port[1];
> + memset(&port[1], 0, sizeof(port[1]));
>
> - rq = port_request(&port[0]);
> - }
> + rq = port_request(&port[0]);
> + }
>
> - submit = false;
> - if (!port_count(&port[1]))
> - submit = i915_guc_dequeue(engine);
> - } while (submit);
> + if (!port_isset(&port[1]))
> + i915_guc_dequeue(engine);
> }
>
> /*
> @@ -1239,6 +1175,19 @@ int i915_guc_submission_enable(struct drm_i915_private *dev_priv)
> enum intel_engine_id id;
> int err;
>
> + /*
> + * We're using GuC work items for submitting work through GuC. Since
> + * we're coalescing multiple requests from a single context into a
> + * single work item prior to assigning it to execlist_port, we can
> + * never have more work items than the total number of ports (for all
> + * engines). The GuC firmware controls the HEAD of the work queue, and
> + * it is guaranteed to remove each work item from the queue before the
> + * corresponding request is completed.
> + */
> + BUILD_BUG_ON(ARRAY_SIZE(engine->execlist_port) *
> + sizeof(struct guc_wq_item) *
> + I915_NUM_ENGINES > GUC_WQ_SIZE);
> +
> if (!client) {
> client = guc_client_alloc(dev_priv,
> INTEL_INFO(dev_priv)->ring_mask,
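
For reference, the compile-time bound above in numbers, using the
values assumed in this series (2 execlist ports per engine, 16-byte
work items, and currently I915_NUM_ENGINES == 5):

	2 ports * 16 bytes * 5 engines = 160 bytes

which is comfortably below GUC_WQ_SIZE, so the BUILD_BUG_ON documents
the invariant rather than sitting close to firing.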
> @@ -1266,9 +1215,6 @@ int i915_guc_submission_enable(struct drm_i915_private *dev_priv)
> guc_interrupts_capture(dev_priv);
>
> for_each_engine(engine, dev_priv, id) {
> - const int wqi_size = sizeof(struct guc_wq_item);
> - struct drm_i915_gem_request *rq;
> -
> /* The tasklet was initialised by execlists, and may be in
> * a state of flux (across a reset) and so we just want to
> * take over the callback without changing any other state
> @@ -1276,14 +1222,7 @@ int i915_guc_submission_enable(struct drm_i915_private *dev_priv)
> */
> engine->irq_tasklet.func = i915_guc_irq_handler;
> clear_bit(ENGINE_IRQ_EXECLIST, &engine->irq_posted);
> -
> - /* Replay the current set of previously submitted requests */
> - spin_lock_irq(&engine->timeline->lock);
> - list_for_each_entry(rq, &engine->timeline->requests, link) {
> - guc_client_update_wq_rsvd(client, wqi_size);
> - __i915_guc_submit(rq);
> - }
> - spin_unlock_irq(&engine->timeline->lock);
> + tasklet_schedule(&engine->irq_tasklet);
> }
>
> return 0;
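
As I read it, scheduling the tasklet here is what replaces the manual
replay loop: after a reset, whatever the engine still has queued gets
resubmitted through the regular i915_guc_dequeue() path, so the replay
is coalesced exactly like first-time submission (the v2 change above).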
> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> index 94a89eff4dbd..86fed9f1f1ae 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/intel_lrc.c
> @@ -988,27 +988,14 @@ static int execlists_request_alloc(struct drm_i915_gem_request *request)
> */
> request->reserved_space += EXECLISTS_REQUEST_SIZE;
>
> - if (i915.enable_guc_submission) {
> - /*
> - * Check that the GuC has space for the request before
> - * going any further, as the i915_add_request() call
> - * later on mustn't fail ...
> - */
> - ret = i915_guc_wq_reserve(request);
> - if (ret)
> - goto err;
> - }
> -
> cs = intel_ring_begin(request, 0);
> - if (IS_ERR(cs)) {
> - ret = PTR_ERR(cs);
> - goto err_unreserve;
> - }
> + if (IS_ERR(cs))
> + return PTR_ERR(cs);
>
> if (!ce->initialised) {
> ret = engine->init_context(request);
> if (ret)
> - goto err_unreserve;
> + return ret;
>
> ce->initialised = true;
> }
> @@ -1022,12 +1009,6 @@ static int execlists_request_alloc(struct drm_i915_gem_request *request)
>
> request->reserved_space -= EXECLISTS_REQUEST_SIZE;
> return 0;
> -
> -err_unreserve:
> - if (i915.enable_guc_submission)
> - i915_guc_wq_unreserve(request);
> -err:
> - return ret;
> }
>
> /*
> diff --git a/drivers/gpu/drm/i915/intel_uc.h b/drivers/gpu/drm/i915/intel_uc.h
> index 69daf4c01cd0..d41051688221 100644
> --- a/drivers/gpu/drm/i915/intel_uc.h
> +++ b/drivers/gpu/drm/i915/intel_uc.h
> @@ -52,13 +52,6 @@ struct drm_i915_gem_request;
> * GuC). The subsequent pages of the client object constitute the work
> * queue (a circular array of work items), again described in the process
> * descriptor. Work queue pages are mapped momentarily as required.
> - *
> - * We also keep a few statistics on failures. Ideally, these should all
> - * be zero!
> - * no_wq_space: times that the submission pre-check found no space was
> - * available in the work queue (note, the queue is shared,
> - * not per-engine). It is OK for this to be nonzero, but
> - * it should not be huge!
> */
> struct i915_guc_client {
> struct i915_vma *vma;
> @@ -79,8 +72,6 @@ struct i915_guc_client {
> uint32_t wq_offset;
> uint32_t wq_size;
> uint32_t wq_tail;
> - uint32_t wq_rsvd;
> - uint32_t no_wq_space;
>
> /* Per-engine counts of GuC submissions */
> uint64_t submissions[I915_NUM_ENGINES];
> @@ -246,8 +237,6 @@ u32 intel_guc_wopcm_size(struct drm_i915_private *dev_priv);
> /* i915_guc_submission.c */
> int i915_guc_submission_init(struct drm_i915_private *dev_priv);
> int i915_guc_submission_enable(struct drm_i915_private *dev_priv);
> -int i915_guc_wq_reserve(struct drm_i915_gem_request *rq);
> -void i915_guc_wq_unreserve(struct drm_i915_gem_request *request);
> void i915_guc_submission_disable(struct drm_i915_private *dev_priv);
> void i915_guc_submission_fini(struct drm_i915_private *dev_priv);
> struct i915_vma *intel_guc_allocate_vma(struct intel_guc *guc, u32 size);