[Intel-gfx] [PATCH v4] drm/i915/gt: Poison residual state [HWSP] across resume.
Tvrtko Ursulin
tvrtko.ursulin at linux.intel.com
Tue Apr 21 14:47:10 UTC 2020
On 21/04/2020 10:25, Chris Wilson wrote:
> Since we may lose the content of any buffer when we relinquish control
> of the system (e.g. suspend/resume), we have to be careful not to rely
> on regaining control. A good method to detect when we might be using
> garbage is by always injecting that garbage prior to first use on
> load/resume/etc.
>
> v2: Drop sanitize callback on cleanup
> v3: Move seqno reset to timeline enter, so we reset all timelines.
> However, this is done on every activation during runtime and not reset.
> The similar level of paranoia we apply to correcting context state after
> a period of inactivity.
>
> Suggested-by: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
> Cc: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
> Cc: Venkata Ramana Nayana <venkata.ramana.nayana at intel.com>
> Cc: Daniele Ceraolo Spurio <daniele.ceraolospurio at intel.com>
> ---
> Reset in sanitize, for we may attempt to park the engine before using
> any timelines.
> ---
> drivers/gpu/drm/i915/gt/intel_lrc.c | 23 ++++++++++++++++++++++-
> drivers/gpu/drm/i915/gt/intel_timeline.c | 17 ++++++++++++++++-
> drivers/gpu/drm/i915/gt/intel_timeline.h | 2 ++
> 3 files changed, 40 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c
> index 34f67eb9bfa1..d42a9d6767d4 100644
> --- a/drivers/gpu/drm/i915/gt/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/gt/intel_lrc.c
> @@ -3649,7 +3649,26 @@ static void reset_csb_pointers(struct intel_engine_cs *engine)
>
> static void execlists_sanitize(struct intel_engine_cs *engine)
> {
> + /*
> + * Poison residual state on resume, in case the suspend didn't!
> + *
> + * We have to assume that across suspend/resume (or other loss
> + * of control) that the contents of our pinned buffers has been
> + * lost, replaced by garbage. Since this doesn't always happen,
> + * let's poison such state so that we more quickly spot when
> + * we falsely assume it has been preserved.
> + */
> + if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
> + memset(engine->status_page.addr, POISON_INUSE, PAGE_SIZE);
> +
> reset_csb_pointers(engine);
> +
> + /*
> + * The kernel_context HWSP is stored in the status_page. As above,
> + * that may be lost on resume/initialisation, and so we need to
> + * reset the value in the HWSP.
> + */
> + intel_timeline_reset_seqno(engine->kernel_context->timeline);
> }
>
> static void enable_error_interrupt(struct intel_engine_cs *engine)
> @@ -4539,6 +4558,8 @@ static void execlists_shutdown(struct intel_engine_cs *engine)
>
> static void execlists_release(struct intel_engine_cs *engine)
> {
> + engine->sanitize = NULL; /* no longer in control, nothing to sanitize */
> +
> execlists_shutdown(engine);
>
> intel_engine_cleanup_common(engine);
> @@ -4550,7 +4571,6 @@ logical_ring_default_vfuncs(struct intel_engine_cs *engine)
> {
> /* Default vfuncs which can be overriden by each engine. */
>
> - engine->sanitize = execlists_sanitize;
> engine->resume = execlists_resume;
>
> engine->cops = &execlists_context_ops;
> @@ -4666,6 +4686,7 @@ int intel_execlists_submission_setup(struct intel_engine_cs *engine)
> execlists->csb_size = GEN11_CSB_ENTRIES;
>
> /* Finally, take ownership and responsibility for cleanup! */
> + engine->sanitize = execlists_sanitize;
> engine->release = execlists_release;
>
> return 0;
> diff --git a/drivers/gpu/drm/i915/gt/intel_timeline.c b/drivers/gpu/drm/i915/gt/intel_timeline.c
> index 3779c2ae0d65..29a39e44fa36 100644
> --- a/drivers/gpu/drm/i915/gt/intel_timeline.c
> +++ b/drivers/gpu/drm/i915/gt/intel_timeline.c
> @@ -337,6 +337,13 @@ int intel_timeline_pin(struct intel_timeline *tl)
> return 0;
> }
>
> +void intel_timeline_reset_seqno(const struct intel_timeline *tl)
> +{
> + /* Must be pinned to be writable, and no requests in flight. */
> + GEM_BUG_ON(!atomic_read(&tl->pin_count));
> + WRITE_ONCE(*(u32 *)tl->hwsp_seqno, tl->seqno);
> +}
> +
> void intel_timeline_enter(struct intel_timeline *tl)
> {
> struct intel_gt_timelines *timelines = &tl->gt->timelines;
> @@ -365,8 +372,16 @@ void intel_timeline_enter(struct intel_timeline *tl)
> return;
>
> spin_lock(&timelines->lock);
> - if (!atomic_fetch_inc(&tl->active_count))
> + if (!atomic_fetch_inc(&tl->active_count)) {
> + /*
> + * The HWSP is volatile, and may have been lost while inactive,
> + * e.g. across suspend/resume. Be paranoid, and ensure that
> + * the HWSP value matches our seqno so we don't proclaim
> + * the next request as already complete.
> + */
> + intel_timeline_reset_seqno(tl);
> list_add_tail(&tl->link, &timelines->active_list);
> + }
> spin_unlock(&timelines->lock);
> }
>
> diff --git a/drivers/gpu/drm/i915/gt/intel_timeline.h b/drivers/gpu/drm/i915/gt/intel_timeline.h
> index f5b7eade3809..c8e59a333182 100644
> --- a/drivers/gpu/drm/i915/gt/intel_timeline.h
> +++ b/drivers/gpu/drm/i915/gt/intel_timeline.h
> @@ -84,6 +84,8 @@ int intel_timeline_get_seqno(struct intel_timeline *tl,
> void intel_timeline_exit(struct intel_timeline *tl);
> void intel_timeline_unpin(struct intel_timeline *tl);
>
> +void intel_timeline_reset_seqno(const struct intel_timeline *tl);
> +
> int intel_timeline_read_hwsp(struct i915_request *from,
> struct i915_request *until,
> u32 *hwsp_offset);
>
Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
Regards,
Tvrtko
More information about the Intel-gfx
mailing list