[Intel-gfx] [PATCH v3 3/6] drm/i915 Implement LMEM backup and restore for suspend / resume
Matthew Auld
matthew.auld at intel.com
Fri Sep 17 12:03:54 UTC 2021
On 14/09/2021 20:31, Thomas Hellström wrote:
> Just evict unpinned objects to system. For pinned LMEM objects,
> make a backup system object and blit the contents to that.
>
> Backup is performed in three steps,
> 1: Opportunistically evict evictable objects using the gpu blitter.
> 2: After gt idle, evict evictable objects using the gpu blitter. This will
> be modified in an upcoming patch to backup pinned objects that are not used
> by the blitter itself.
> 3: Backup remaining pinned objects using memcpy.
>
> Also move uC suspend to after 2) to make sure we have a functional GuC
> during 2) if using GuC submission.
>
> v2:
> - Major refactor to make sure gem_exec_suspend@hang-SX subtests work, and
> suspend / resume works with a slightly modified GuC submission enabling
> patch series.
>
> v3:
> - Fix a potential use-after-free (Matthew Auld)
> - Use i915_gem_object_create_shmem() instead of
> i915_gem_object_create_region (Matthew Auld)
> - Minor simplifications (Matthew Auld)
> - Fix up kerneldoc for i915_ttm_restore_region().
> - Final lmem_suspend() call moved to i915_gem_backup_suspend from
> i915_gem_suspend_late, since the latter gets called at driver unload
> and we don't unnecessarily want to run it at that time.
>
> Signed-off-by: Thomas Hellström <thomas.hellstrom at linux.intel.com>
> ---
> drivers/gpu/drm/i915/Makefile | 1 +
> .../gpu/drm/i915/gem/i915_gem_object_types.h | 1 +
> drivers/gpu/drm/i915/gem/i915_gem_pm.c | 92 +++++++-
> drivers/gpu/drm/i915/gem/i915_gem_pm.h | 3 +-
> drivers/gpu/drm/i915/gem/i915_gem_ttm.c | 29 ++-
> drivers/gpu/drm/i915/gem/i915_gem_ttm.h | 10 +
> drivers/gpu/drm/i915/gem/i915_gem_ttm_pm.c | 203 ++++++++++++++++++
> drivers/gpu/drm/i915/gem/i915_gem_ttm_pm.h | 24 +++
> drivers/gpu/drm/i915/gt/intel_gt_pm.c | 4 +-
> drivers/gpu/drm/i915/i915_drv.c | 10 +-
> drivers/gpu/drm/i915/i915_drv.h | 2 +-
> 11 files changed, 362 insertions(+), 17 deletions(-)
> create mode 100644 drivers/gpu/drm/i915/gem/i915_gem_ttm_pm.c
> create mode 100644 drivers/gpu/drm/i915/gem/i915_gem_ttm_pm.h
>
> diff --git a/drivers/gpu/drm/i915/Makefile b/drivers/gpu/drm/i915/Makefile
> index 9d371be7dc5c..f9b69492a56c 100644
> --- a/drivers/gpu/drm/i915/Makefile
> +++ b/drivers/gpu/drm/i915/Makefile
> @@ -154,6 +154,7 @@ gem-y += \
> gem/i915_gem_throttle.o \
> gem/i915_gem_tiling.o \
> gem/i915_gem_ttm.o \
> + gem/i915_gem_ttm_pm.o \
> gem/i915_gem_userptr.o \
> gem/i915_gem_wait.o \
> gem/i915_gemfs.o
> diff --git a/drivers/gpu/drm/i915/gem/i915_gem_object_types.h b/drivers/gpu/drm/i915/gem/i915_gem_object_types.h
> index 2471f36aaff3..734cc8e16481 100644
> --- a/drivers/gpu/drm/i915/gem/i915_gem_object_types.h
> +++ b/drivers/gpu/drm/i915/gem/i915_gem_object_types.h
> @@ -534,6 +534,7 @@ struct drm_i915_gem_object {
> struct {
> struct sg_table *cached_io_st;
> struct i915_gem_object_page_iter get_io_page;
> + struct drm_i915_gem_object *backup;
> bool created:1;
> } ttm;
>
> diff --git a/drivers/gpu/drm/i915/gem/i915_gem_pm.c b/drivers/gpu/drm/i915/gem/i915_gem_pm.c
> index 8b9d7d14c4bd..8736ae1dfbb2 100644
> --- a/drivers/gpu/drm/i915/gem/i915_gem_pm.c
> +++ b/drivers/gpu/drm/i915/gem/i915_gem_pm.c
> @@ -5,6 +5,7 @@
> */
>
> #include "gem/i915_gem_pm.h"
> +#include "gem/i915_gem_ttm_pm.h"
> #include "gt/intel_gt.h"
> #include "gt/intel_gt_pm.h"
> #include "gt/intel_gt_requests.h"
> @@ -39,7 +40,86 @@ void i915_gem_suspend(struct drm_i915_private *i915)
> i915_gem_drain_freed_objects(i915);
> }
>
> -void i915_gem_suspend_late(struct drm_i915_private *i915)
> +static int lmem_restore(struct drm_i915_private *i915, bool allow_gpu)
> +{
> + struct intel_memory_region *mr;
> + int ret = 0, id;
> +
> + for_each_memory_region(mr, i915, id) {
> + if (mr->type == INTEL_MEMORY_LOCAL) {
> + ret = i915_ttm_restore_region(mr, allow_gpu);
> + if (ret)
> + break;
> + }
> + }
> +
> + return ret;
> +}
> +
> +static int lmem_suspend(struct drm_i915_private *i915, bool allow_gpu,
> + bool backup_pinned)
> +{
> + struct intel_memory_region *mr;
> + int ret = 0, id;
> +
> + for_each_memory_region(mr, i915, id) {
> + if (mr->type == INTEL_MEMORY_LOCAL) {
> + ret = i915_ttm_backup_region(mr, allow_gpu, backup_pinned);
> + if (ret)
> + break;
> + }
> + }
> +
> + return ret;
> +}
> +
> +static void lmem_recover(struct drm_i915_private *i915)
> +{
> + struct intel_memory_region *mr;
> + int id;
> +
> + for_each_memory_region(mr, i915, id)
> + if (mr->type == INTEL_MEMORY_LOCAL)
> + i915_ttm_recover_region(mr);
> +}
> +
> +int i915_gem_backup_suspend(struct drm_i915_private *i915)
> +{
> + int ret;
> +
> + /* Opportunistically try to evict unpinned objects */
> + ret = lmem_suspend(i915, true, false);
> + if (ret)
> + goto out_recover;
> +
> + i915_gem_suspend(i915);
> +
> + /*
> + * More objects may have become unpinned as requests were
> + * retired. Now try to evict again. The gt may be wedged here
> + * in which case we automatically fall back to memcpy.
> + */
> + ret = lmem_suspend(i915, true, false);
> + if (ret)
> + goto out_recover;
> +
> + /*
> + * Remaining objects are backed up using memcpy once we've stopped
> + * using the migrate context.
> + */
> + ret = lmem_suspend(i915, false, true);
> + if (ret)
> + goto out_recover;
> +
> + return 0;
> +
> +out_recover:
> + lmem_recover(i915);
> +
> + return ret;
> +}
> +
> +int i915_gem_suspend_late(struct drm_i915_private *i915)
> {
> struct drm_i915_gem_object *obj;
> struct list_head *phases[] = {
> @@ -83,6 +163,8 @@ void i915_gem_suspend_late(struct drm_i915_private *i915)
> spin_unlock_irqrestore(&i915->mm.obj_lock, flags);
> if (flush)
> wbinvd_on_all_cpus();
> +
> + return 0;
We can drop this change now?
I guess the only slight concern is all the GEM_WARN_ON() instead of proper
error handling in some places, but hopefully these should never be hit
in practice.
Reviewed-by: Matthew Auld <matthew.auld at intel.com>
More information about the Intel-gfx
mailing list