[Intel-gfx] [PATCH 1/3] drm/i915/guc: Add non blocking CTB send function
Michal Wajdeczko
michal.wajdeczko at intel.com
Thu Nov 21 11:43:26 UTC 2019
On Thu, 21 Nov 2019 00:56:02 +0100, <John.C.Harrison at intel.com> wrote:
> From: Matthew Brost <matthew.brost at intel.com>
>
> Add non blocking CTB send fuction, intel_guc_send_nb. In order to
> support a non blocking CTB send fuction a spin lock is needed to
2x typos: "fuction" -> "function"
> protect the CTB descriptors fields. Also the non blocking call must not
> update the fence value as this value is owned by the blocking call
> (intel_guc_send).
You probably mean "intel_guc_send_ct", as intel_guc_send is just a wrapper
around guc->send.
>
> The blocking CTB now must have a flow control mechanism to ensure the
> buffer isn't overrun. A lazy spin wait is used as we believe the flow
> control condition should be rare with properly sized buffer. A retry
> counter is also implemented which fails H2G CTBs once a limit is
> reached to prevent deadlock.
>
> The function, intel_guc_send_nb, is exported in this patch but unused.
> Several patches later in the series make use of this function.
It's likely in yet another series
>
> Cc: John Harrison <john.c.harrison at intel.com>
> Signed-off-by: Matthew Brost <matthew.brost at intel.com>
> ---
> drivers/gpu/drm/i915/gt/uc/intel_guc.h | 2 +
> drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c | 97 +++++++++++++++++++----
> drivers/gpu/drm/i915/gt/uc/intel_guc_ct.h | 10 ++-
> 3 files changed, 91 insertions(+), 18 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc.h b/drivers/gpu/drm/i915/gt/uc/intel_guc.h
> index e6400204a2bd..77c5af919ace 100644
> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc.h
> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc.h
> @@ -94,6 +94,8 @@ intel_guc_send_and_receive(struct intel_guc *guc, const u32 *action, u32 len,
> return guc->send(guc, action, len, response_buf, response_buf_size);
> }
> +int intel_guc_send_nb(struct intel_guc_ct *ct, const u32 *action, u32 len);
> +
Hmm, this mismatch of guc/ct parameters breaks our layering.
But we can keep this layering intact by introducing some flags to
the existing guc_send() function. These flags could be passed as
high bits in action[0], like this:
#define GUC_ACTION_FLAG_DONT_WAIT	0x80000000

int intel_guc_auth_huc(struct intel_guc *guc, u32 rsa_offset)
{
	u32 action[] = {
		INTEL_GUC_ACTION_AUTHENTICATE_HUC | GUC_ACTION_FLAG_DONT_WAIT,
		rsa_offset
	};

	return intel_guc_send(guc, action, ARRAY_SIZE(action));
}
then the actual back-end of guc->send can take the proper steps based on this flag:
@@ int intel_guc_send_mmio(struct intel_guc *guc, const u32 *action, u32 len,
	GEM_BUG_ON(!len);
	GEM_BUG_ON(len > guc->send_regs.count);
+	if (*action & GUC_ACTION_FLAG_DONT_WAIT)
+		return -EINVAL;
+	*action &= ~GUC_ACTION_FLAG_DONT_WAIT;
+
	/* We expect only action code */
	GEM_BUG_ON(*action & ~INTEL_GUC_MSG_CODE_MASK);
@@ int intel_guc_send_ct(struct intel_guc *guc, const u32 *action, u32 len,
	u32 status = ~0; /* undefined */
	int ret;
+	if (*action & GUC_ACTION_FLAG_DONT_WAIT) {
+		GEM_BUG_ON(response_buf);
+		GEM_BUG_ON(response_buf_size);
+		return ctch_send_nb(ct, ctch, action, len);
+	}
+
	mutex_lock(&guc->send_mutex);
	ret = ctch_send(ct, ctch, action, len, response_buf, response_buf_size,
> static inline void intel_guc_notify(struct intel_guc *guc)
> {
> guc->notify(guc);
> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c
> index b49115517510..e50d968b15d5 100644
> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c
> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c
> @@ -3,6 +3,8 @@
> * Copyright © 2016-2019 Intel Corporation
> */
> +#include <linux/circ_buf.h>
> +
> #include "i915_drv.h"
> #include "intel_guc_ct.h"
> @@ -12,6 +14,8 @@
> #define CT_DEBUG_DRIVER(...) do { } while (0)
> #endif
> +#define MAX_RETRY 0x1000000
> +
> struct ct_request {
> struct list_head link;
> u32 fence;
> @@ -40,7 +44,8 @@ void intel_guc_ct_init_early(struct intel_guc_ct *ct)
> /* we're using static channel owners */
> ct->host_channel.owner = CTB_OWNER_HOST;
> - spin_lock_init(&ct->lock);
> + spin_lock_init(&ct->request_lock);
> + spin_lock_init(&ct->send_lock);
> INIT_LIST_HEAD(&ct->pending_requests);
> INIT_LIST_HEAD(&ct->incoming_requests);
> INIT_WORK(&ct->worker, ct_incoming_request_worker_func);
> @@ -291,7 +296,8 @@ static u32 ctch_get_next_fence(struct intel_guc_ct_channel *ctch)
> static int ctb_write(struct intel_guc_ct_buffer *ctb,
> const u32 *action,
> u32 len /* in dwords */,
> - u32 fence,
> + u32 fence_value,
> + bool enable_fence,
Maybe we can just guarantee that fence=0 will never be used as a valid
fence id; then this flag could be replaced with a (fence != 0) check.
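For illustration only, a minimal sketch of how the header could then be
derived from the fence value alone (ct_header() is a hypothetical helper,
not part of the posted patch, and assumes ctch_get_next_fence() never
returns 0 for a real blocking request):

static u32 ct_header(const u32 *action, u32 len, u32 fence, bool want_response)
{
	/* fence == 0 means "no fence", so no extra enable_fence flag needed */
	return (len << GUC_CT_MSG_LEN_SHIFT) |
	       (fence ? GUC_CT_MSG_WRITE_FENCE_TO_DESC : 0) |
	       (want_response ? GUC_CT_MSG_SEND_STATUS : 0) |
	       (action[0] << GUC_CT_MSG_ACTION_SHIFT);
}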
> bool want_response)
> {
> struct guc_ct_buffer_desc *desc = ctb->desc;
> @@ -328,18 +334,18 @@ static int ctb_write(struct intel_guc_ct_buffer *ctb,
> * DW2+: action data
> */
> header = (len << GUC_CT_MSG_LEN_SHIFT) |
> - (GUC_CT_MSG_WRITE_FENCE_TO_DESC) |
> + (enable_fence ? GUC_CT_MSG_WRITE_FENCE_TO_DESC : 0) |
Hmm, even if we ask the fw not to write the fence back to the descriptor,
IIRC the current firmware will unconditionally write back the return status
of this non-blocking call, possibly overwriting the status of a blocked
call.
> (want_response ? GUC_CT_MSG_SEND_STATUS : 0) |
Btw, if we switch all requests to expect a reply sent back over CTB,
then we can possibly drop the send_mutex in the CTB paths and block
only when there is no DONT_WAIT flag and we have to wait for a response.
> (action[0] << GUC_CT_MSG_ACTION_SHIFT);
> CT_DEBUG_DRIVER("CT: writing %*ph %*ph %*ph\n",
> - 4, &header, 4, &fence,
> + 4, &header, 4, &fence_value,
> 4 * (len - 1), &action[1]);
> cmds[tail] = header;
> tail = (tail + 1) % size;
> - cmds[tail] = fence;
> + cmds[tail] = fence_value;
> tail = (tail + 1) % size;
> for (i = 1; i < len; i++) {
> @@ -440,6 +446,47 @@ static int wait_for_ct_request_update(struct ct_request *req, u32 *status)
> return err;
> }
> +static inline bool ctb_has_room(struct guc_ct_buffer_desc *desc, u32 len)
> +{
> + u32 head = READ_ONCE(desc->head);
> + u32 space;
> +
> + space = CIRC_SPACE(desc->tail, head, desc->size);
> +
> + return space >= len;
> +}
> +
> +int intel_guc_send_nb(struct intel_guc_ct *ct,
> + const u32 *action,
> + u32 len)
> +{
> + struct intel_guc_ct_channel *ctch = &ct->host_channel;
> + struct intel_guc_ct_buffer *ctb = &ctch->ctbs[CTB_SEND];
> + struct guc_ct_buffer_desc *desc = ctb->desc;
> + int err;
> +
> + GEM_BUG_ON(!ctch->enabled);
> + GEM_BUG_ON(!len);
> + GEM_BUG_ON(len & ~GUC_CT_MSG_LEN_MASK);
> + lockdep_assert_held(&ct->send_lock);
Hmm, does this mean that it's now the caller's responsibility to take
the CT private spinlock? That is not how the other guc_send() functions work.
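For illustration, a rough sketch of the alternative, with the lock taken
inside the send path itself so callers stay unaware of the CT private lock
(retry/flow-control handling elided; this is an assumption, not the posted
code):

int intel_guc_send_nb(struct intel_guc_ct *ct, const u32 *action, u32 len)
{
	struct intel_guc_ct_channel *ctch = &ct->host_channel;
	struct intel_guc_ct_buffer *ctb = &ctch->ctbs[CTB_SEND];
	unsigned long flags;
	int err;

	/* take send_lock internally, matching how ctch_send() guards the ring */
	spin_lock_irqsave(&ct->send_lock, flags);
	if (!ctb_has_room(ctb->desc, (len + 1) * 4))
		err = -EBUSY;
	else
		err = ctb_write(ctb, action, len, 0, false, false);
	spin_unlock_irqrestore(&ct->send_lock, flags);

	if (!err)
		intel_guc_notify(ct_to_guc(ct));
	return err;
}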
> +
> + if (unlikely(!ctb_has_room(desc, (len + 1) * 4))) {
> + ct->retry++;
> + if (ct->retry >= MAX_RETRY)
> + return -EDEADLK;
> + else
> + return -EBUSY;
> + }
> +
> + ct->retry = 0;
> + err = ctb_write(ctb, action, len, 0, false, false);
> + if (unlikely(err))
> + return err;
> +
> + intel_guc_notify(ct_to_guc(ct));
> + return 0;
> +}
> +
> static int ctch_send(struct intel_guc_ct *ct,
> struct intel_guc_ct_channel *ctch,
> const u32 *action,
> @@ -460,17 +507,35 @@ static int ctch_send(struct intel_guc_ct *ct,
> GEM_BUG_ON(len & ~GUC_CT_MSG_LEN_MASK);
> GEM_BUG_ON(!response_buf && response_buf_size);
> + /*
> + * We use a lazy spin wait loop here as we believe that if the CT
> + * buffers are sized correctly the flow control condition should be
> + * rare.
> + */
> +retry:
> + spin_lock_irqsave(&ct->send_lock, flags);
> + if (unlikely(!ctb_has_room(desc, (len + 1) * 4))) {
> + spin_unlock_irqrestore(&ct->send_lock, flags);
> + ct->retry++;
> + if (ct->retry >= MAX_RETRY)
> + return -EDEADLK;
I'm not sure what's better: a hidden deadlock that is hard to reproduce,
or deadlocks that are easier to catch and thus help us make the code
deadlock-clean.
> + cpu_relax();
> + goto retry;
> + }
> +
> + ct->retry = 0;
> fence = ctch_get_next_fence(ctch);
> request.fence = fence;
> request.status = 0;
> request.response_len = response_buf_size;
> request.response_buf = response_buf;
> - spin_lock_irqsave(&ct->lock, flags);
> + spin_lock(&ct->request_lock);
> list_add_tail(&request.link, &ct->pending_requests);
> - spin_unlock_irqrestore(&ct->lock, flags);
> + spin_unlock(&ct->request_lock);
> - err = ctb_write(ctb, action, len, fence, !!response_buf);
> + err = ctb_write(ctb, action, len, fence, true, !!response_buf);
> + spin_unlock_irqrestore(&ct->send_lock, flags);
> if (unlikely(err))
> goto unlink;
> @@ -501,9 +566,9 @@ static int ctch_send(struct intel_guc_ct *ct,
> }
> unlink:
> - spin_lock_irqsave(&ct->lock, flags);
> + spin_lock_irqsave(&ct->request_lock, flags);
> list_del(&request.link);
> - spin_unlock_irqrestore(&ct->lock, flags);
> + spin_unlock_irqrestore(&ct->request_lock, flags);
> return err;
> }
> @@ -653,7 +718,7 @@ static int ct_handle_response(struct intel_guc_ct *ct, const u32 *msg)
> CT_DEBUG_DRIVER("CT: response fence %u status %#x\n", fence, status);
> - spin_lock(&ct->lock);
> + spin_lock(&ct->request_lock);
> list_for_each_entry(req, &ct->pending_requests, link) {
> if (unlikely(fence != req->fence)) {
> CT_DEBUG_DRIVER("CT: request %u awaits response\n",
> @@ -672,7 +737,7 @@ static int ct_handle_response(struct intel_guc_ct *ct, const u32 *msg)
> found = true;
> break;
> }
> - spin_unlock(&ct->lock);
> + spin_unlock(&ct->request_lock);
> if (!found)
> DRM_ERROR("CT: unsolicited response %*ph\n", 4 * msglen, msg);
> @@ -710,13 +775,13 @@ static bool ct_process_incoming_requests(struct intel_guc_ct *ct)
> u32 *payload;
> bool done;
> - spin_lock_irqsave(&ct->lock, flags);
> + spin_lock_irqsave(&ct->request_lock, flags);
> request = list_first_entry_or_null(&ct->incoming_requests,
> struct ct_incoming_request, link);
> if (request)
> list_del(&request->link);
> done = !!list_empty(&ct->incoming_requests);
> - spin_unlock_irqrestore(&ct->lock, flags);
> + spin_unlock_irqrestore(&ct->request_lock, flags);
> if (!request)
> return true;
> @@ -777,9 +842,9 @@ static int ct_handle_request(struct intel_guc_ct *ct, const u32 *msg)
> }
> memcpy(request->msg, msg, 4 * msglen);
> - spin_lock_irqsave(&ct->lock, flags);
> + spin_lock_irqsave(&ct->request_lock, flags);
> list_add_tail(&request->link, &ct->incoming_requests);
> - spin_unlock_irqrestore(&ct->lock, flags);
> + spin_unlock_irqrestore(&ct->request_lock, flags);
> queue_work(system_unbound_wq, &ct->worker);
> return 0;
> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.h b/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.h
> index 7c24d83f5c24..bc670a796bd8 100644
> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.h
> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.h
> @@ -62,8 +62,11 @@ struct intel_guc_ct {
> struct intel_guc_ct_channel host_channel;
> /* other channels are tbd */
> - /** @lock: protects pending requests list */
> - spinlock_t lock;
> + /** @request_lock: protects pending requests list */
> + spinlock_t request_lock;
> +
> + /** @send_lock: protects h2g channel */
> + spinlock_t send_lock;
> /** @pending_requests: list of requests waiting for response */
> struct list_head pending_requests;
> @@ -73,6 +76,9 @@ struct intel_guc_ct {
> /** @worker: worker for handling incoming requests */
> struct work_struct worker;
> +
> + /** @retry: the number of times a H2G CTB has been retried */
> + u32 retry;
> };
> void intel_guc_ct_init_early(struct intel_guc_ct *ct);