[Intel-gfx] [PATCH 9/9] drm/i915/gt: Schedule request retirement when timeline idles
Tvrtko Ursulin
tvrtko.ursulin at linux.intel.com
Wed Nov 20 13:16:51 UTC 2019
On 20/11/2019 09:33, Chris Wilson wrote:
> The major drawback of commit 7e34f4e4aad3 ("drm/i915/gen8+: Add RC6 CTX
> corruption WA") is that it disables RC6 while Skylake (and friends) is
> active, and we do not consider the GPU idle until all outstanding
> requests have been retired and the engine switched over to the kernel
> context. If userspace is idle, this task falls onto our background idle
> worker, which only runs roughly once a second, meaning that userspace has
> to have been idle for a couple of seconds before we enable RC6 again.
> Naturally, this causes us to consume considerably more energy than
> before as powersaving is effectively disabled while a display server
> (here's looking at you Xorg) is running.
>
> As execlists will get a completion event as each context is completed,
> we can use this interrupt to queue a retire worker bound to this engine
> to cleanup idle timelines. We will then immediately notice the idle
> engine (without userspace intervention or the aid of the background
> retire worker) and start parking the GPU. Thus during light workloads,
> we will do much more work to idle the GPU faster... Hopefully with
> commensurate power saving!
>
> v2: Watch context completions and only look at those local to the engine
> when retiring to reduce the amount of excess work we perform.
>
> References: https://bugs.freedesktop.org/show_bug.cgi?id=112315
> References: 7e34f4e4aad3 ("drm/i915/gen8+: Add RC6 CTX corruption WA")
> References: 2248a28384fe ("drm/i915/gen8+: Add RC6 CTX corruption WA")
> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
> Cc: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
> ---
> drivers/gpu/drm/i915/gt/intel_engine_cs.c | 8 +-
> drivers/gpu/drm/i915/gt/intel_engine_types.h | 8 ++
> drivers/gpu/drm/i915/gt/intel_gt_requests.c | 74 +++++++++++++++++++
> drivers/gpu/drm/i915/gt/intel_gt_requests.h | 17 ++++-
> drivers/gpu/drm/i915/gt/intel_lrc.c | 9 +++
> drivers/gpu/drm/i915/gt/intel_timeline.c | 1 +
> .../gpu/drm/i915/gt/intel_timeline_types.h | 3 +
> 7 files changed, 116 insertions(+), 4 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/gt/intel_engine_cs.c b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
> index b9613d044393..8f6e353caa66 100644
> --- a/drivers/gpu/drm/i915/gt/intel_engine_cs.c
> +++ b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
> @@ -28,13 +28,13 @@
>
> #include "i915_drv.h"
>
> -#include "gt/intel_gt.h"
> -
> +#include "intel_context.h"
> #include "intel_engine.h"
> #include "intel_engine_pm.h"
> #include "intel_engine_pool.h"
> #include "intel_engine_user.h"
> -#include "intel_context.h"
> +#include "intel_gt.h"
> +#include "intel_gt_requests.h"
> #include "intel_lrc.h"
> #include "intel_reset.h"
> #include "intel_ring.h"
> @@ -617,6 +617,7 @@ static int intel_engine_setup_common(struct intel_engine_cs *engine)
> intel_engine_init_execlists(engine);
> intel_engine_init_cmd_parser(engine);
> intel_engine_init__pm(engine);
> + intel_engine_init_retire(engine);
>
> intel_engine_pool_init(&engine->pool);
>
> @@ -839,6 +840,7 @@ void intel_engine_cleanup_common(struct intel_engine_cs *engine)
>
> cleanup_status_page(engine);
>
> + intel_engine_fini_retire(engine);
> intel_engine_pool_fini(&engine->pool);
> intel_engine_fini_breadcrumbs(engine);
> intel_engine_cleanup_cmd_parser(engine);
> diff --git a/drivers/gpu/drm/i915/gt/intel_engine_types.h b/drivers/gpu/drm/i915/gt/intel_engine_types.h
> index 758f0e8ec672..17f1f1441efc 100644
> --- a/drivers/gpu/drm/i915/gt/intel_engine_types.h
> +++ b/drivers/gpu/drm/i915/gt/intel_engine_types.h
> @@ -451,6 +451,14 @@ struct intel_engine_cs {
>
> struct intel_engine_execlists execlists;
>
> + /*
> + * Keep track of completed timelines on this engine for early
> + * retirement with the goal of quickly enabling powersaving as
> + * soon as the engine is idle.
> + */
> + struct intel_timeline *retire;
> + struct work_struct retire_work;
> +
> /* status_notifier: list of callbacks for context-switch changes */
> struct atomic_notifier_head context_status_notifier;
>
> diff --git a/drivers/gpu/drm/i915/gt/intel_gt_requests.c b/drivers/gpu/drm/i915/gt/intel_gt_requests.c
> index 4dc3cbeb1b36..4a98fefdf915 100644
> --- a/drivers/gpu/drm/i915/gt/intel_gt_requests.c
> +++ b/drivers/gpu/drm/i915/gt/intel_gt_requests.c
> @@ -29,6 +29,80 @@ static void flush_submission(struct intel_gt *gt)
> intel_engine_flush_submission(engine);
> }
>
> +static void engine_retire(struct work_struct *work)
> +{
> + struct intel_engine_cs *engine =
> + container_of(work, typeof(*engine), retire_work);
> + struct intel_timeline *tl = xchg(&engine->retire, NULL);
Shouldn't this be atomic_xchg to avoid racing with add_retire?
> +
> + do {
> + struct intel_timeline *next = xchg(&tl->retire, NULL);
Here as well?
> +
> + /*
> + * Our goal here is to retire _idle_ timelines as soon as
> + * possible (as they are idle, we do not expect userspace
> + * to be cleaning up anytime soon).
> + *
> + * If the tl->active_count is already zero, someone else
> + * should have retired the timeline. Equally if the timeline
> + * is currently locked, either it is being retired elsewhere
> + * or about to be!
> + */
> + if (atomic_read(&tl->active_count) &&
> + mutex_trylock(&tl->mutex)) {
> + retire_requests(tl);
> + mutex_unlock(&tl->mutex);
> + }
> + intel_timeline_put(tl);
> +
> + GEM_BUG_ON(!next);
> + tl = ptr_mask_bits(next, 1);
You sometimes expect engine->retire to contain 0x1?
> + } while (tl);
> +}
> +
> +static bool add_retire(struct intel_engine_cs *engine,
> + struct intel_timeline *tl)
> +{
> + struct intel_timeline *first = READ_ONCE(engine->retire);
> +
> + /*
> + * We open-code a llist here to include the additional tag [BIT(0)]
> + * so that we know when the timeline is already on a
> + * retirement queue: either this engine or another.
> + *
> + * However, we rely on that a timeline can only be active on a single
> + * engine at any one time and that add_retire() is called before the
> + * engine releases the timeline and transferred to another to retire.
> + */
> +
> + if (READ_ONCE(tl->retire)) /* already queued */
> + return false;
Can't this go first in the function?
> +
> + intel_timeline_get(tl);
> + do
> + tl->retire = ptr_pack_bits(first, 1, 1);
Here you rely on the assignment being atomic, right?
> + while (!try_cmpxchg(&engine->retire, &first, tl));
So the loop is effectively creating a chain of timelines to retire on
this engine.
What happens with virtual engines (or with any single-timeline
context, for that matter) when a timeline moves to a different engine
before the retire worker runs? Ah okay, it gets re-assigned to the
most recent engine.
I am not sure about the BIT(0) business. It's always set on write so I
am not getting why it is useful.
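For my own understanding, here is a rough standalone userspace
analogue of the chain (pack()/unpack() and the C11 atomics are my
stand-ins for ptr_pack_bits()/ptr_mask_bits()/try_cmpxchg(), and the
worker/locking side is left out entirely). If I read it right, what
the tag buys is that even the tail entry, whose next pointer is NULL,
stores a non-NULL value, so the "already queued" check in add_retire()
still fires for it:

/* Rough userspace analogue of the tagged retire chain; pack()/unpack()
 * and the C11 atomics are my own stand-ins, not the kernel helpers.
 */
#include <assert.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct tl {
	struct tl *retire;	/* tagged next pointer; NULL == not queued */
};

static struct tl *pack(struct tl *p)	/* ~ptr_pack_bits(p, 1, 1) */
{
	return (struct tl *)((uintptr_t)p | 1);
}

static struct tl *unpack(struct tl *p)	/* ~ptr_mask_bits(p, 1) */
{
	return (struct tl *)((uintptr_t)p & ~(uintptr_t)1);
}

static _Atomic(struct tl *) retire_head;	/* ~engine->retire */

/* ~add_retire(): lock-free push; true means "chain was empty, kick worker" */
static bool add_retire(struct tl *tl)
{
	struct tl *first = atomic_load(&retire_head);

	if (tl->retire)		/* non-NULL (possibly just the tag) => queued */
		return false;

	do
		tl->retire = pack(first);
	while (!atomic_compare_exchange_weak(&retire_head, &first, tl));

	return !first;
}

/* ~engine_retire(): take the whole chain, walk it, clear the tags */
static void engine_retire(void)
{
	struct tl *tl = atomic_exchange(&retire_head, NULL);

	while (tl) {
		struct tl *next = tl->retire;

		tl->retire = NULL;
		assert(next);	/* the tag keeps this non-NULL even at the tail */
		printf("retiring %p\n", (void *)tl);
		tl = unpack(next);
	}
}

int main(void)
{
	struct tl a = { 0 }, b = { 0 };

	assert(add_retire(&a));			/* empty chain -> schedule */
	assert(a.retire == (struct tl *)1);	/* tail holds just the tag */
	assert(!add_retire(&a));		/* already queued */
	assert(!add_retire(&b));		/* chain non-empty, no re-kick */
	engine_retire();
	return 0;
}

At least in this simplified form the push/drain pairing seems to work
out; the xchg/assignment questions above are about the real concurrent
version, of course.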
Regards,
Tvrtko
> +
> + return !first;
> +}
> +
> +void intel_engine_add_retire(struct intel_engine_cs *engine,
> + struct intel_timeline *tl)
> +{
> + if (add_retire(engine, tl))
> + schedule_work(&engine->retire_work);
> +}
> +
> +void intel_engine_init_retire(struct intel_engine_cs *engine)
> +{
> + INIT_WORK(&engine->retire_work, engine_retire);
> +}
> +
> +void intel_engine_fini_retire(struct intel_engine_cs *engine)
> +{
> + flush_work(&engine->retire_work);
> +}
> +
> long intel_gt_retire_requests_timeout(struct intel_gt *gt, long timeout)
> {
> struct intel_gt_timelines *timelines = &gt->timelines;
> diff --git a/drivers/gpu/drm/i915/gt/intel_gt_requests.h b/drivers/gpu/drm/i915/gt/intel_gt_requests.h
> index fde546424c63..8de559b5a033 100644
> --- a/drivers/gpu/drm/i915/gt/intel_gt_requests.h
> +++ b/drivers/gpu/drm/i915/gt/intel_gt_requests.h
> @@ -7,7 +7,12 @@
> #ifndef INTEL_GT_REQUESTS_H
> #define INTEL_GT_REQUESTS_H
>
> -struct intel_gt;
> +#include <linux/workqueue.h>
> +
> +#include "intel_gt_types.h"
> +
> +struct intel_engine_cs;
> +struct intel_timeline;
>
> long intel_gt_retire_requests_timeout(struct intel_gt *gt, long timeout);
> static inline void intel_gt_retire_requests(struct intel_gt *gt)
> @@ -15,6 +20,16 @@ static inline void intel_gt_retire_requests(struct intel_gt *gt)
> intel_gt_retire_requests_timeout(gt, 0);
> }
>
> +static inline void intel_gt_schedule_retire_requests(struct intel_gt *gt)
> +{
> + mod_delayed_work(system_wq, &gt->requests.retire_work, 0);
> +}
This is unused in v2.
> +
> +void intel_engine_init_retire(struct intel_engine_cs *engine);
> +void intel_engine_add_retire(struct intel_engine_cs *engine,
> + struct intel_timeline *tl);
> +void intel_engine_fini_retire(struct intel_engine_cs *engine);
> +
> int intel_gt_wait_for_idle(struct intel_gt *gt, long timeout);
>
> void intel_gt_init_requests(struct intel_gt *gt);
> diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c
> index b65bc06855b0..2ceaa2f22996 100644
> --- a/drivers/gpu/drm/i915/gt/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/gt/intel_lrc.c
> @@ -142,6 +142,7 @@
> #include "intel_engine_pm.h"
> #include "intel_gt.h"
> #include "intel_gt_pm.h"
> +#include "intel_gt_requests.h"
> #include "intel_lrc_reg.h"
> #include "intel_mocs.h"
> #include "intel_reset.h"
> @@ -1170,6 +1171,14 @@ __execlists_schedule_out(struct i915_request *rq,
> * refrain from doing non-trivial work here.
> */
>
> + /*
> + * If we have just completed this context, the engine may now be
> + * idle and we want to re-enter powersaving.
> + */
> + if (list_is_last(&rq->link, &ce->timeline->requests) &&
> + i915_request_completed(rq))
> + intel_engine_add_retire(engine, ce->timeline);
> +
> intel_engine_context_out(engine);
> execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_OUT);
> intel_gt_pm_put_async(engine->gt);
> diff --git a/drivers/gpu/drm/i915/gt/intel_timeline.c b/drivers/gpu/drm/i915/gt/intel_timeline.c
> index b190a5d9ab02..c1d2419444f8 100644
> --- a/drivers/gpu/drm/i915/gt/intel_timeline.c
> +++ b/drivers/gpu/drm/i915/gt/intel_timeline.c
> @@ -277,6 +277,7 @@ void intel_timeline_fini(struct intel_timeline *timeline)
> {
> GEM_BUG_ON(atomic_read(&timeline->pin_count));
> GEM_BUG_ON(!list_empty(&timeline->requests));
> + GEM_BUG_ON(timeline->retire);
>
> if (timeline->hwsp_cacheline)
> cacheline_free(timeline->hwsp_cacheline);
> diff --git a/drivers/gpu/drm/i915/gt/intel_timeline_types.h b/drivers/gpu/drm/i915/gt/intel_timeline_types.h
> index 5244615ed1cb..aaf15cbe1ce1 100644
> --- a/drivers/gpu/drm/i915/gt/intel_timeline_types.h
> +++ b/drivers/gpu/drm/i915/gt/intel_timeline_types.h
> @@ -66,6 +66,9 @@ struct intel_timeline {
> */
> struct i915_active_fence last_request;
>
> + /** A chain of completed timelines ready for early retirement. */
> + struct intel_timeline *retire;
> +
> /**
> * We track the most recent seqno that we wait on in every context so
> * that we only have to emit a new await and dependency on a more
>