[Intel-gfx] [PATCH] drm/i915/execlists: Weak parallel submission support for execlists

Wed Oct 27 20:04:49 UTC 2021

On 10/27/2021 12:17, Matthew Brost wrote:
> On Tue, Oct 26, 2021 at 02:58:00PM -0700, John Harrison wrote:
>> On 10/20/2021 14:47, Matthew Brost wrote:
>>> A weak implementation of parallel submission (multi-bb execbuf IOCTL) for
>>> execlists. Doing as little as possible to support this interface for
>>> execlists - basically just passing submit fences between each request
>>> generated and virtual engines are not allowed. This is on par with what
>>> is there for the existing (hopefully soon deprecated) bonding interface.
>>>
>>> We perma-pin these execlists contexts to align with GuC implementation.
>>>
>>> v2:
>>>    (John Harrison)
>>>     - Drop siblings array as num_siblings must be 1
>>>
>>> Signed-off-by: Matthew Brost <matthew.brost at intel.com>
>>> ---
>>>    drivers/gpu/drm/i915/gem/i915_gem_context.c   | 10 +++--
>>>    drivers/gpu/drm/i915/gt/intel_context.c       |  4 +-
>>>    .../drm/i915/gt/intel_execlists_submission.c  | 44 ++++++++++++++++++-
>>>    drivers/gpu/drm/i915/gt/intel_lrc.c           |  2 +
>>>    .../gpu/drm/i915/gt/uc/intel_guc_submission.c |  2 -
>>>    5 files changed, 52 insertions(+), 10 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context.c b/drivers/gpu/drm/i915/gem/i915_gem_context.c
>>> index fb33d0322960..35e87a7d0ea9 100644
>>> --- a/drivers/gpu/drm/i915/gem/i915_gem_context.c
>>> +++ b/drivers/gpu/drm/i915/gem/i915_gem_context.c
>>> @@ -570,10 +570,6 @@ set_proto_ctx_engines_parallel_submit(struct i915_user_extension __user *base,
>>>    	struct intel_engine_cs **siblings = NULL;
>>>    	intel_engine_mask_t prev_mask;
>>> -	/* FIXME: This is NIY for execlists */
>>> -	if (!(intel_uc_uses_guc_submission(&i915->gt.uc)))
>>> -		return -ENODEV;
>>> -
>>>    	if (get_user(slot, &ext->engine_index))
>>>    		return -EFAULT;
>>> @@ -583,6 +579,12 @@ set_proto_ctx_engines_parallel_submit(struct i915_user_extension __user *base,
>>>    	if (get_user(num_siblings, &ext->num_siblings))
>>>    		return -EFAULT;
>>> +	if (!intel_uc_uses_guc_submission(&i915->gt.uc) && num_siblings != 1) {
>>> +		drm_dbg(&i915->drm, "Only 1 sibling (%d) supported in non-GuC mode\n",
>>> +			num_siblings);
>>> +		return -EINVAL;
>>> +	}
>>> +
>>>    	if (slot >= set->num_engines) {
>>>    		drm_dbg(&i915->drm, "Invalid placement value, %d >= %d\n",
>>>    			slot, set->num_engines);
>>> diff --git a/drivers/gpu/drm/i915/gt/intel_context.c b/drivers/gpu/drm/i915/gt/intel_context.c
>>> index 5634d14052bc..1bec92e1d8e6 100644
>>> --- a/drivers/gpu/drm/i915/gt/intel_context.c
>>> +++ b/drivers/gpu/drm/i915/gt/intel_context.c
>>> @@ -79,7 +79,8 @@ static int intel_context_active_acquire(struct intel_context *ce)
>>>    	__i915_active_acquire(&ce->active);
>>> -	if (intel_context_is_barrier(ce) || intel_engine_uses_guc(ce->engine))
>>> +	if (intel_context_is_barrier(ce) || intel_engine_uses_guc(ce->engine) ||
>>> +	    intel_context_is_parallel(ce))
>>>    		return 0;
>>>    	/* Preallocate tracking nodes */
>>> @@ -563,7 +564,6 @@ void intel_context_bind_parent_child(struct intel_context *parent,
>>>    	 * Callers responsibility to validate that this function is used
>>>    	 * correctly but we use GEM_BUG_ON here ensure that they do.
>>>    	 */
>>> -	GEM_BUG_ON(!intel_engine_uses_guc(parent->engine));
>>>    	GEM_BUG_ON(intel_context_is_pinned(parent));
>>>    	GEM_BUG_ON(intel_context_is_child(parent));
>>>    	GEM_BUG_ON(intel_context_is_pinned(child));
>>> diff --git a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
>>> index bedb80057046..2865b422300d 100644
>>> --- a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
>>> +++ b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
>>> @@ -927,8 +927,7 @@ static void execlists_submit_ports(struct intel_engine_cs *engine)
>>>    static bool ctx_single_port_submission(const struct intel_context *ce)
>>>    {
>>> -	return (IS_ENABLED(CONFIG_DRM_I915_GVT) &&
>>> -		intel_context_force_single_submission(ce));
>>> +	return intel_context_force_single_submission(ce);
>> I think this is actually going to break GVT.
>>
>> Not so much this change here but the whole use of single submission outside
>> of GVT. It looks like the GVT driver overloads the single submission flag to
>> tag requests that it owns. If we start using that flag elsewhere when GVT is
>> active, I think that will cause much confusion within the GVT code.
>>
>> The correct fix would be to create a new flag just for GVT usage alongside
>> the single submission one. GVT would then set both but only check for its
>> own private flag. The parallel code would obviously only set the existing
>> single submission flag.
>>
> Ok, see below.
>
>>>    }
>>>    static bool can_merge_ctx(const struct intel_context *prev,
>>> @@ -2598,6 +2597,46 @@ static void execlists_context_cancel_request(struct intel_context *ce,
>>>    				      current->comm);
>>>    }
>>> +static struct intel_context *
>>> +execlists_create_parallel(struct intel_engine_cs **engines,
>>> +			  unsigned int num_siblings,
>>> +			  unsigned int width)
>>> +{
>>> +	struct intel_context *parent = NULL, *ce, *err;
>>> +	int i;
>>> +
>>> +	GEM_BUG_ON(num_siblings != 1);
>>> +
>>> +	for (i = 0; i < width; ++i) {
>>> +		ce = intel_context_create(engines[i]);
>>> +		if (!ce) {
>>> +			err = ERR_PTR(-ENOMEM);
>>> +			goto unwind;
>>> +		}
>>> +
>>> +		if (i == 0)
>>> +			parent = ce;
>>> +		else
>>> +			intel_context_bind_parent_child(parent, ce);
>>> +	}
>>> +
>>> +	parent->parallel.fence_context = dma_fence_context_alloc(1);
>>> +
>>> +	intel_context_set_nopreempt(parent);
>>> +	intel_context_set_single_submission(parent);
>> Can you explain the need for setting single submission?
>>
> I think I can actually pull this out. This was needed when I tried to
> truly implement a guarantee that all the parallel requests would be
> running simultaneously. Couldn't ever get that working because of the
> mess that is the execlists scheduler - a simple wait at the head of the
> queue until everyone joined just blew up for whatever reason. I don't
> believe this serves a purpose anymore, so I'll just drop it.
>
> Matt
Is that not going to be a problem? I thought concurrent execution was a 
fundamental requirement?

John.

>
>> John.
>>
>>> +	for_each_child(parent, ce) {
>>> +		intel_context_set_nopreempt(ce);
>>> +		intel_context_set_single_submission(ce);
>>> +	}
>>> +
>>> +	return parent;
>>> +
>>> +unwind:
>>> +	if (parent)
>>> +		intel_context_put(parent);
>>> +	return err;
>>> +}
>>> +
>>>    static const struct intel_context_ops execlists_context_ops = {
>>>    	.flags = COPS_HAS_INFLIGHT,
>>> @@ -2616,6 +2655,7 @@ static const struct intel_context_ops execlists_context_ops = {
>>>    	.reset = lrc_reset,
>>>    	.destroy = lrc_destroy,
>>> +	.create_parallel = execlists_create_parallel,
>>>    	.create_virtual = execlists_create_virtual,
>>>    };
>>> diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c
>>> index 56156cf18c41..70f4b309522d 100644
>>> --- a/drivers/gpu/drm/i915/gt/intel_lrc.c
>>> +++ b/drivers/gpu/drm/i915/gt/intel_lrc.c
>>> @@ -1065,6 +1065,8 @@ lrc_pin(struct intel_context *ce,
>>>    void lrc_unpin(struct intel_context *ce)
>>>    {
>>> +	if (unlikely(ce->parallel.last_rq))
>>> +		i915_request_put(ce->parallel.last_rq);
>>>    	check_redzone((void *)ce->lrc_reg_state - LRC_STATE_OFFSET,
>>>    		      ce->engine);
>>>    }
>>> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
>>> index 1341752dc70e..ddc9a97fcc8f 100644
>>> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
>>> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
>>> @@ -2961,8 +2961,6 @@ static void guc_parent_context_unpin(struct intel_context *ce)
>>>    	GEM_BUG_ON(!intel_context_is_parent(ce));
>>>    	GEM_BUG_ON(!intel_engine_is_virtual(ce->engine));
>>> -	if (ce->parallel.last_rq)
>>> -		i915_request_put(ce->parallel.last_rq);
>>>    	unpin_guc_id(guc, ce);
>>>    	lrc_unpin(ce);
>>>    }