[Intel-gfx] [PATCH v3 3/8] drm/i915/guc: Submit GuC workitems containing coalesced requests

Daniele Ceraolo Spurio daniele.ceraolospurio at intel.com
Tue May 23 00:07:35 UTC 2017



On 22/05/17 15:07, Michał Winiarski wrote:
> To create an upper bound on number of GuC workitems, we need to change
> the way that requests are being submitted. Rather than submitting each
> request as an individual workitem, we can do coalescing in a similar way
> we're handling execlist submission ports. We also need to stop pretending
> that we're doing "lite-restore" in GuC submission (we would create a
> workitem each time we hit this condition).
>
> v2: Also coalesce when replaying on reset (Daniele)
> v3: Consistent wq_resv - per-request (Daniele)
>
> Cc: Chris Wilson <chris at chris-wilson.co.uk>
> Cc: Daniele Ceraolo Spurio <daniele.ceraolospurio at intel.com>
> Cc: Jeff McGee <jeff.mcgee at intel.com>
> Cc: Michal Wajdeczko <michal.wajdeczko at intel.com>
> Cc: Oscar Mateo <oscar.mateo at intel.com>
> Signed-off-by: Michał Winiarski <michal.winiarski at intel.com>
> ---
>  drivers/gpu/drm/i915/i915_guc_submission.c | 72 +++++++++++++++---------------
>  1 file changed, 37 insertions(+), 35 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_guc_submission.c b/drivers/gpu/drm/i915/i915_guc_submission.c
> index e6e0c6e..2a0c3161 100644
> --- a/drivers/gpu/drm/i915/i915_guc_submission.c
> +++ b/drivers/gpu/drm/i915/i915_guc_submission.c
> @@ -491,14 +491,12 @@ static void guc_wq_item_append(struct i915_guc_client *client,
>  	 * workqueue buffer dw by dw.
>  	 */
>  	BUILD_BUG_ON(wqi_size != 16);
> -	GEM_BUG_ON(client->wq_rsvd < wqi_size);
>
>  	/* postincrement WQ tail for next time */
>  	wq_off = client->wq_tail;
>  	GEM_BUG_ON(wq_off & (wqi_size - 1));
>  	client->wq_tail += wqi_size;
>  	client->wq_tail &= client->wq_size - 1;
> -	client->wq_rsvd -= wqi_size;
>
>  	/* WQ starts from the page after doorbell / process_desc */
>  	wqi = client->vaddr + wq_off + GUC_DB_SIZE;
> @@ -580,7 +578,7 @@ static int guc_ring_doorbell(struct i915_guc_client *client)
>  }
>
>  /**
> - * __i915_guc_submit() - Submit commands through GuC
> + * i915_guc_submit() - Submit commands through GuC
>   * @rq:		request associated with the commands
>   *
>   * The caller must have already called i915_guc_wq_reserve() above with
> @@ -594,7 +592,7 @@ static int guc_ring_doorbell(struct i915_guc_client *client)
>   * The only error here arises if the doorbell hardware isn't functioning
>   * as expected, which really shouldn't happen.
>   */
> -static void __i915_guc_submit(struct drm_i915_gem_request *rq)
> +static void i915_guc_submit(struct drm_i915_gem_request *rq)
>  {
>  	struct drm_i915_private *dev_priv = rq->i915;
>  	struct intel_engine_cs *engine = rq->engine;
> @@ -618,12 +616,6 @@ static void __i915_guc_submit(struct drm_i915_gem_request *rq)
>  	spin_unlock_irqrestore(&client->wq_lock, flags);
>  }
>
> -static void i915_guc_submit(struct drm_i915_gem_request *rq)
> -{
> -	__i915_gem_request_submit(rq);
> -	__i915_guc_submit(rq);
> -}
> -
>  static void nested_enable_signaling(struct drm_i915_gem_request *rq)
>  {
>  	/* If we use dma_fence_enable_sw_signaling() directly, lockdep
> @@ -659,12 +651,15 @@ static void port_assign(struct execlist_port *port,
>  	nested_enable_signaling(rq);
>  }
>
> -static bool i915_guc_dequeue(struct intel_engine_cs *engine)
> +static void i915_guc_dequeue(struct intel_engine_cs *engine)
>  {
>  	struct execlist_port *port = engine->execlist_port;
> -	struct drm_i915_gem_request *last = port_request(port);
> -	struct rb_node *rb;
> +	struct drm_i915_gem_request *last = NULL;
>  	bool submit = false;
> +	struct rb_node *rb;
> +
> +	if (port_request(port))
> +		port++;
>
>  	spin_lock_irq(&engine->timeline->lock);
>  	rb = engine->execlist_first;
> @@ -681,18 +676,22 @@ static bool i915_guc_dequeue(struct intel_engine_cs *engine)
>  					goto done;
>  				}
>
> -				if (submit)
> +				if (submit) {
>  					port_assign(port, last);
> +					i915_guc_submit(last);
> +					submit = false;
> +				}
>  				port++;
>  			}
>
>  			INIT_LIST_HEAD(&rq->priotree.link);
>  			rq->priotree.priority = INT_MAX;
>
> -			i915_guc_submit(rq);
> +			__i915_gem_request_submit(rq);
>  			trace_i915_gem_request_in(rq, port_index(port, engine));
>  			last = rq;
>  			submit = true;
> +			i915_guc_wq_unreserve(rq);

Bikeshed: doesn't moving the removal of the reservation to before the 
actual WQ tail update theoretically introduce a race that could lead to 
over-reserving? It can't hurt in practice because of the coalescing, but 
it might be worth squashing the next patch into this one to make things 
cleaner.

>  		}
>
>  		rb = rb_next(rb);
> @@ -703,11 +702,11 @@ static bool i915_guc_dequeue(struct intel_engine_cs *engine)
>  	}
>  done:
>  	engine->execlist_first = rb;
> -	if (submit)
> +	if (submit) {
>  		port_assign(port, last);
> +		i915_guc_submit(last);
> +	}
>  	spin_unlock_irq(&engine->timeline->lock);
> -
> -	return submit;
>  }
>
>  static void i915_guc_irq_handler(unsigned long data)
> @@ -715,24 +714,20 @@ static void i915_guc_irq_handler(unsigned long data)
>  	struct intel_engine_cs *engine = (struct intel_engine_cs *)data;
>  	struct execlist_port *port = engine->execlist_port;
>  	struct drm_i915_gem_request *rq;
> -	bool submit;
>
> -	do {
> -		rq = port_request(&port[0]);
> -		while (rq && i915_gem_request_completed(rq)) {
> -			trace_i915_gem_request_out(rq);
> -			i915_gem_request_put(rq);
> +	rq = port_request(&port[0]);
> +	while (rq && i915_gem_request_completed(rq)) {
> +		trace_i915_gem_request_out(rq);
> +		i915_gem_request_put(rq);
>
> -			port[0] = port[1];
> -			memset(&port[1], 0, sizeof(port[1]));
> +		port[0] = port[1];
> +		memset(&port[1], 0, sizeof(port[1]));
>
> -			rq = port_request(&port[0]);
> -		}
> +		rq = port_request(&port[0]);
> +	}
>
> -		submit = false;
> -		if (!port_count(&port[1]))
> -			submit = i915_guc_dequeue(engine);
> -	} while (submit);
> +	if (!port_isset(&port[1]))
> +		i915_guc_dequeue(engine);
>  }
>
>  /*
> @@ -1250,6 +1245,8 @@ int i915_guc_submission_enable(struct drm_i915_private *dev_priv)
>  	for_each_engine(engine, dev_priv, id) {
>  		const int wqi_size = sizeof(struct guc_wq_item);
>  		struct drm_i915_gem_request *rq;
> +		struct execlist_port *port = engine->execlist_port;
> +		int n;
>
>  		/* The tasklet was initialised by execlists, and may be in
>  		 * a state of flux (across a reset) and so we just want to
> @@ -1261,11 +1258,16 @@ int i915_guc_submission_enable(struct drm_i915_private *dev_priv)
>
>  		/* Replay the current set of previously submitted requests */
>  		spin_lock_irq(&engine->timeline->lock);
> -		list_for_each_entry(rq, &engine->timeline->requests, link) {
> +		list_for_each_entry(rq, &engine->timeline->requests, link)
>  			guc_client_update_wq_rsvd(client, wqi_size);

Can we just drop this entirely due to the new changes in this version of 
the patch? We don't require wq_rsvd to be > 0 in i915_guc_submit anymore 
and we're guaranteed to have space (the wq is empty and we're only going 
to submit up to 2 items). Otherwise, aren't we going to leak this rsvd 
space since we don't clean it in i915_guc_submit anymore?

Thanks,
Daniele

> -			__i915_guc_submit(rq);
> -		}
>  		spin_unlock_irq(&engine->timeline->lock);
> +
> +		for (n = 0; n < ARRAY_SIZE(engine->execlist_port); n++) {
> +			if (!port_isset(&port[n]))
> +				break;
> +
> +			i915_guc_submit(port_request(&port[n]));
> +		}
>  	}
>
>  	return 0;
>


More information about the Intel-gfx mailing list