[Intel-gfx] [PATCH 1/2] drm/i915: Reorder await_execution before await_request
Tvrtko Ursulin
tvrtko.ursulin at linux.intel.com
Wed May 27 07:39:03 UTC 2020
On 26/05/2020 10:07, Chris Wilson wrote:
> Reorder the code so that await_execution can be reused from a special
> case in await_request in the next patch.
>
> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
> ---
> drivers/gpu/drm/i915/i915_request.c | 264 ++++++++++++++--------------
> 1 file changed, 132 insertions(+), 132 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c
> index c282719ad3ac..33bbad623e02 100644
> --- a/drivers/gpu/drm/i915/i915_request.c
> +++ b/drivers/gpu/drm/i915/i915_request.c
> @@ -1053,37 +1053,91 @@ emit_semaphore_wait(struct i915_request *to,
> I915_FENCE_GFP);
> }
>
> +static bool intel_timeline_sync_has_start(struct intel_timeline *tl,
> + struct dma_fence *fence)
> +{
> + return __intel_timeline_sync_is_later(tl,
> + fence->context,
> + fence->seqno - 1);
> +}
> +
> +static int intel_timeline_sync_set_start(struct intel_timeline *tl,
> + const struct dma_fence *fence)
> +{
> + return __intel_timeline_sync_set(tl, fence->context, fence->seqno - 1);
> +}
> +
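
The seqno - 1 trick in these two helpers is worth a note for readers:
breadcrumbs on a timeline complete strictly in order, so observing
seqno - 1 as signalled means request seqno has at least been picked up
by the engine, i.e. it has started. A toy model of the idea in
self-contained C (not driver code, names are illustrative):

#include <stdbool.h>
#include <stdint.h>

/* Toy timeline: breadcrumbs are written strictly in seqno order. */
struct toy_timeline {
        uint32_t last_completed;        /* most recent breadcrumb written */
};

/* Wraparound-safe seqno comparison, in the same spirit as the driver. */
static bool toy_seqno_passed(uint32_t a, uint32_t b)
{
        return (int32_t)(a - b) >= 0;
}

static bool toy_request_completed(const struct toy_timeline *tl,
                                  uint32_t seqno)
{
        return toy_seqno_passed(tl->last_completed, seqno);
}

/* The start of request 'seqno' is the completion of 'seqno - 1'. */
static bool toy_request_started(const struct toy_timeline *tl,
                                uint32_t seqno)
{
        return toy_request_completed(tl, seqno - 1);
}
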
> static int
> -i915_request_await_request(struct i915_request *to, struct i915_request *from)
> +__i915_request_await_execution(struct i915_request *to,
> + struct i915_request *from,
> + void (*hook)(struct i915_request *rq,
> + struct dma_fence *signal))
> {
> - int ret;
> + int err;
>
> - GEM_BUG_ON(to == from);
> - GEM_BUG_ON(to->timeline == from->timeline);
> + GEM_BUG_ON(intel_context_is_barrier(from->context));
>
> - if (i915_request_completed(from)) {
> - i915_sw_fence_set_error_once(&to->submit, from->fence.error);
> + /* Submit both requests at the same time */
> + err = __await_execution(to, from, hook, I915_FENCE_GFP);
> + if (err)
> + return err;
> +
> + /* Squash repeated dependencies to the same timelines */
> + if (intel_timeline_sync_has_start(i915_request_timeline(to),
> + &from->fence))
> return 0;
> +
> + /*
> + * Wait until the start of this request.
> + *
> + * The execution cb fires when we submit the request to HW. But in
> + * many cases this may be long before the request itself is ready to
> + * run (consider that we submit 2 requests for the same context, where
> + * the request of interest is behind an indefinite spinner). So we hook
> + * up to both to reduce our queues and keep the execution lag minimised
> + * in the worst case, though we hope that the await_start is elided.
> + */
> + err = i915_request_await_start(to, from);
> + if (err < 0)
> + return err;
> +
> + /*
> + * Ensure both start together [after all semaphores in signal]
> + *
> + * Now that we are queued to the HW at roughly the same time (thanks
> + * to the execute cb) and are ready to run at roughly the same time
> + * (thanks to the await start), our signaler may still be indefinitely
> + * delayed by waiting on a semaphore from a remote engine. If our
> + * signaler depends on a semaphore, so indirectly do we, and we do not
> + * want to start our payload until our signaler also starts theirs.
> + * So we wait.
> + *
> + * However, there is also a second condition for which we need to wait
> + * for the precise start of the signaler. Consider that the signaler
> + * was submitted in a chain of requests following another context
> + * (with just an ordinary intra-engine fence dependency between the
> + * two). In this case the signaler is queued to HW, but not for
> + * immediate execution, and so we must wait until it reaches the
> + * active slot.
> + */
> + if (intel_engine_has_semaphores(to->engine) &&
> + !i915_request_has_initial_breadcrumb(to)) {
> + err = __emit_semaphore_wait(to, from, from->fence.seqno - 1);
> + if (err < 0)
> + return err;
> }
>
> + /* Couple the dependency tree for PI on this exposed to->fence */
> if (to->engine->schedule) {
> - ret = i915_sched_node_add_dependency(&to->sched,
> + err = i915_sched_node_add_dependency(&to->sched,
> &from->sched,
> - I915_DEPENDENCY_EXTERNAL);
> - if (ret < 0)
> - return ret;
> + I915_DEPENDENCY_WEAK);
> + if (err < 0)
> + return err;
> }
>
> - if (to->engine == from->engine)
> - ret = i915_sw_fence_await_sw_fence_gfp(&to->submit,
> - &from->submit,
> - I915_FENCE_GFP);
> - else
> - ret = emit_semaphore_wait(to, from, I915_FENCE_GFP);
> - if (ret < 0)
> - return ret;
> -
> - return 0;
> + return intel_timeline_sync_set_start(i915_request_timeline(to),
> + &from->fence);
> }
>
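
The third coupling step above, __emit_semaphore_wait(to, from,
from->fence.seqno - 1), deserves unpacking: the GPU polls the
signaler's timeline until the predecessor breadcrumb appears, i.e.
until the signaler's own payload has actually started. A CPU-side
sketch of what that semaphore wait amounts to (purely illustrative,
the real wait is an MI_SEMAPHORE_WAIT executed by the command
streamer):

#include <stdint.h>

/* Toy model of a poll/greater-or-equal semaphore wait: spin until the
 * signaler's timeline shows seqno - 1, i.e. until the signaler's own
 * payload has started executing. Wraparound-safe comparison as before.
 */
static void toy_wait_for_start(const volatile uint32_t *hwsp,
                               uint32_t signaler_seqno)
{
        while ((int32_t)(*hwsp - (signaler_seqno - 1)) < 0)
                ; /* busy-wait; on HW this is done by the CS, not the CPU */
}
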
> static void mark_external(struct i915_request *rq)
> @@ -1136,23 +1190,20 @@ i915_request_await_external(struct i915_request *rq, struct dma_fence *fence)
> }
>
> int
> -i915_request_await_dma_fence(struct i915_request *rq, struct dma_fence *fence)
> +i915_request_await_execution(struct i915_request *rq,
> + struct dma_fence *fence,
> + void (*hook)(struct i915_request *rq,
> + struct dma_fence *signal))
> {
> struct dma_fence **child = &fence;
> unsigned int nchild = 1;
> int ret;
>
> - /*
> - * Note that if the fence-array was created in signal-on-any mode,
> - * we should *not* decompose it into its individual fences. However,
> - * we don't currently store which mode the fence-array is operating
> - * in. Fortunately, the only user of signal-on-any is private to
> - * amdgpu and we should not see any incoming fence-array from
> - * sync-file being in signal-on-any mode.
> - */
> if (dma_fence_is_array(fence)) {
> struct dma_fence_array *array = to_dma_fence_array(fence);
>
> + /* XXX Error for signal-on-any fence arrays */
> +
> child = array->fences;
> nchild = array->num_fences;
> GEM_BUG_ON(!nchild);
> @@ -1165,138 +1216,78 @@ i915_request_await_dma_fence(struct i915_request *rq, struct dma_fence *fence)
> continue;
> }
>
> - /*
> - * Requests on the same timeline are explicitly ordered, along
> - * with their dependencies, by i915_request_add() which ensures
> - * that requests are submitted in-order through each ring.
> - */
> if (fence->context == rq->fence.context)
> continue;
>
> - /* Squash repeated waits to the same timelines */
> - if (fence->context &&
> - intel_timeline_sync_is_later(i915_request_timeline(rq),
> - fence))
> - continue;
> + /*
> + * We don't squash repeated fence dependencies here as we
> + * want to run our callback in all cases.
> + */
>
> if (dma_fence_is_i915(fence))
> - ret = i915_request_await_request(rq, to_request(fence));
> + ret = __i915_request_await_execution(rq,
> + to_request(fence),
> + hook);
> else
> ret = i915_request_await_external(rq, fence);
> if (ret < 0)
> return ret;
> -
> - /* Record the latest fence used against each timeline */
> - if (fence->context)
> - intel_timeline_sync_set(i915_request_timeline(rq),
> - fence);
> } while (--nchild);
>
> return 0;
> }
>
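
Also worth highlighting: unlike await_dma_fence below, duplicate
dependencies are deliberately not squashed in this loop, because the
execution hook must fire once for every signaler. In toy form (again
illustrative C, not the driver's types):

/* Toy decompose-and-await: every child fence gets the hook, even when
 * several children share a timeline -- squashing would lose callbacks.
 */
struct toy_fence {
        uint64_t context;
        uint32_t seqno;
};

typedef void (*toy_hook)(struct toy_fence *rq, struct toy_fence *signal);

static void toy_await_execution_all(struct toy_fence *rq,
                                    struct toy_fence **children,
                                    unsigned int nchild,
                                    toy_hook hook)
{
        unsigned int i;

        for (i = 0; i < nchild; i++) {
                if (children[i]->context == rq->context)
                        continue;       /* same timeline: already ordered */
                hook(rq, children[i]);  /* must run once per signaler */
        }
}
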
> -static bool intel_timeline_sync_has_start(struct intel_timeline *tl,
> - struct dma_fence *fence)
> -{
> - return __intel_timeline_sync_is_later(tl,
> - fence->context,
> - fence->seqno - 1);
> -}
> -
> -static int intel_timeline_sync_set_start(struct intel_timeline *tl,
> - const struct dma_fence *fence)
> -{
> - return __intel_timeline_sync_set(tl, fence->context, fence->seqno - 1);
> -}
> -
> static int
> -__i915_request_await_execution(struct i915_request *to,
> - struct i915_request *from,
> - void (*hook)(struct i915_request *rq,
> - struct dma_fence *signal))
> +i915_request_await_request(struct i915_request *to, struct i915_request *from)
> {
> - int err;
> -
> - GEM_BUG_ON(intel_context_is_barrier(from->context));
> + int ret;
>
> - /* Submit both requests at the same time */
> - err = __await_execution(to, from, hook, I915_FENCE_GFP);
> - if (err)
> - return err;
> + GEM_BUG_ON(to == from);
> + GEM_BUG_ON(to->timeline == from->timeline);
>
> - /* Squash repeated depenendices to the same timelines */
> - if (intel_timeline_sync_has_start(i915_request_timeline(to),
> - &from->fence))
> + if (i915_request_completed(from)) {
> + i915_sw_fence_set_error_once(&to->submit, from->fence.error);
> return 0;
> -
> - /*
> - * Wait until the start of this request.
> - *
> - * The execution cb fires when we submit the request to HW. But in
> - * many cases this may be long before the request itself is ready to
> - * run (consider that we submit 2 requests for the same context, where
> - * the request of interest is behind an indefinite spinner). So we hook
> - * up to both to reduce our queues and keep the execution lag minimised
> - * in the worst case, though we hope that the await_start is elided.
> - */
> - err = i915_request_await_start(to, from);
> - if (err < 0)
> - return err;
> -
> - /*
> - * Ensure both start together [after all semaphores in signal]
> - *
> - * Now that we are queued to the HW at roughly the same time (thanks
> - * to the execute cb) and are ready to run at roughly the same time
> - * (thanks to the await start), our signaler may still be indefinitely
> - * delayed by waiting on a semaphore from a remote engine. If our
> - * signaler depends on a semaphore, so indirectly do we, and we do not
> - * want to start our payload until our signaler also starts theirs.
> - * So we wait.
> - *
> - * However, there is also a second condition for which we need to wait
> - * for the precise start of the signaler. Consider that the signaler
> - * was submitted in a chain of requests following another context
> - * (with just an ordinary intra-engine fence dependency between the
> - * two). In this case the signaler is queued to HW, but not for
> - * immediate execution, and so we must wait until it reaches the
> - * active slot.
> - */
> - if (intel_engine_has_semaphores(to->engine) &&
> - !i915_request_has_initial_breadcrumb(to)) {
> - err = __emit_semaphore_wait(to, from, from->fence.seqno - 1);
> - if (err < 0)
> - return err;
> }
>
> - /* Couple the dependency tree for PI on this exposed to->fence */
> if (to->engine->schedule) {
> - err = i915_sched_node_add_dependency(&to->sched,
> + ret = i915_sched_node_add_dependency(&to->sched,
> &from->sched,
> - I915_DEPENDENCY_WEAK);
> - if (err < 0)
> - return err;
> + I915_DEPENDENCY_EXTERNAL);
> + if (ret < 0)
> + return ret;
> }
>
> - return intel_timeline_sync_set_start(i915_request_timeline(to),
> - &from->fence);
> + if (to->engine == READ_ONCE(from->engine))
> + ret = i915_sw_fence_await_sw_fence_gfp(&to->submit,
> + &from->submit,
> + I915_FENCE_GFP);
> + else
> + ret = emit_semaphore_wait(to, from, I915_FENCE_GFP);
> + if (ret < 0)
> + return ret;
> +
> + return 0;
> }
>
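
One more note on the tail of await_request as it lands here:
same-engine dependencies are resolved purely in software at submit
time, since ring order already guarantees execution order, while
cross-engine dependencies go through emit_semaphore_wait, which itself
falls back to a software wait when semaphores are unavailable. A toy
dispatch of that decision (hypothetical names, not the driver's API):

enum toy_wait_kind {
        TOY_WAIT_SUBMIT_FENCE,  /* software: gate our submission */
        TOY_WAIT_SEMAPHORE,     /* hardware: semaphore poll on the GPU */
};

static enum toy_wait_kind toy_pick_wait(int to_engine, int from_engine,
                                        int has_semaphores)
{
        if (to_engine == from_engine)
                return TOY_WAIT_SUBMIT_FENCE;   /* ring order suffices */

        return has_semaphores ? TOY_WAIT_SEMAPHORE
                              : TOY_WAIT_SUBMIT_FENCE;
}
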
> int
> -i915_request_await_execution(struct i915_request *rq,
> - struct dma_fence *fence,
> - void (*hook)(struct i915_request *rq,
> - struct dma_fence *signal))
> +i915_request_await_dma_fence(struct i915_request *rq, struct dma_fence *fence)
> {
> struct dma_fence **child = &fence;
> unsigned int nchild = 1;
> int ret;
>
> + /*
> + * Note that if the fence-array was created in signal-on-any mode,
> + * we should *not* decompose it into its individual fences. However,
> + * we don't currently store which mode the fence-array is operating
> + * in. Fortunately, the only user of signal-on-any is private to
> + * amdgpu and we should not see any incoming fence-array from
> + * sync-file being in signal-on-any mode.
> + */
> if (dma_fence_is_array(fence)) {
> struct dma_fence_array *array = to_dma_fence_array(fence);
>
> - /* XXX Error for signal-on-any fence arrays */
> -
> child = array->fences;
> nchild = array->num_fences;
> GEM_BUG_ON(!nchild);
> @@ -1309,22 +1300,31 @@ i915_request_await_execution(struct i915_request *rq,
> continue;
> }
>
> + /*
> + * Requests on the same timeline are explicitly ordered, along
> + * with their dependencies, by i915_request_add() which ensures
> + * that requests are submitted in-order through each ring.
> + */
> if (fence->context == rq->fence.context)
> continue;
>
> - /*
> - * We don't squash repeated fence dependencies here as we
> - * want to run our callback in all cases.
> - */
> + /* Squash repeated waits to the same timelines */
> + if (fence->context &&
> + intel_timeline_sync_is_later(i915_request_timeline(rq),
> + fence))
> + continue;
>
> if (dma_fence_is_i915(fence))
> - ret = __i915_request_await_execution(rq,
> - to_request(fence),
> - hook);
> + ret = i915_request_await_request(rq, to_request(fence));
> else
> ret = i915_request_await_external(rq, fence);
> if (ret < 0)
> return ret;
> +
> + /* Record the latest fence used against each timeline */
> + if (fence->context)
> + intel_timeline_sync_set(i915_request_timeline(rq),
> + fence);
> } while (--nchild);
>
> return 0;
>
Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
Regards,
Tvrtko