[Intel-gfx] [PATCH v2 7/9] drm/i915/gt: Pipelined page migration

Matthew Auld matthew.auld at intel.com
Wed Jun 9 12:14:56 UTC 2021


On 09/06/2021 07:34, Thomas Hellström wrote:
> From: Chris Wilson <chris at chris-wilson.co.uk>
> 
> If we pipeline the PTE updates and then do the copy of those pages
> within a single unpreemptible command packet, we can submit the copies
> and leave them to be scheduled without having to synchronously wait
> under a global lock. In order to manage migration, we need to
> preallocate the page tables (and keep them pinned and available for use
> at any time), causing a bottleneck for migrations as all clients must
> contend on the limited resources. By inlining the ppGTT updates and
> performing the blit atomically, each client only owns the PTE while in
> use, and so we can reschedule individual operations however we see fit.
> And most importantly, we do not need to take a global lock on the shared
> vm, and wait until the operation is complete before releasing the lock
> for others to claim the PTE for themselves.
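
If I restate the idea to check my understanding: each chunk's PTE
writes and the blit that consumes them are emitted back-to-back inside
an MI_ARB_ON_OFF bracket, so no other context can be scheduled in
between and repoint the shared fixed addresses. Just to illustrate the
bracket (hypothetical helper, not quoting the patch):

static int emit_arb(struct i915_request *rq, bool enable)
{
	u32 *cs;

	cs = intel_ring_begin(rq, 2);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	/* open/close the non-preemptible window around PTE writes + blit */
	*cs++ = MI_ARB_ON_OFF | (enable ? MI_ARB_ENABLE : MI_ARB_DISABLE);
	*cs++ = MI_NOOP;

	intel_ring_advance(rq, cs);
	return 0;
}
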
> 
> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
> Co-developed-by: Thomas Hellström <thomas.hellstrom at linux.intel.com>
> Signed-off-by: Thomas Hellström <thomas.hellstrom at linux.intel.com>
> ---
> v2:
> - Add a TODO for huge LMEM ptes (Pointed out by Matthew Auld)
> - Use intel_engine_destroy_pinned_context() to properly take the pinned
>    context timeline off the engine list. (CI warning).
> ---
>   drivers/gpu/drm/i915/Makefile                 |   1 +
>   drivers/gpu/drm/i915/gt/intel_engine.h        |   1 +
>   drivers/gpu/drm/i915/gt/intel_gpu_commands.h  |   2 +
>   drivers/gpu/drm/i915/gt/intel_migrate.c       | 544 ++++++++++++++++++
>   drivers/gpu/drm/i915/gt/intel_migrate.h       |  45 ++
>   drivers/gpu/drm/i915/gt/intel_migrate_types.h |  15 +
>   drivers/gpu/drm/i915/gt/intel_ring.h          |   1 +
>   drivers/gpu/drm/i915/gt/selftest_migrate.c    | 291 ++++++++++
>   .../drm/i915/selftests/i915_live_selftests.h  |   1 +
>   9 files changed, 901 insertions(+)
>   create mode 100644 drivers/gpu/drm/i915/gt/intel_migrate.c
>   create mode 100644 drivers/gpu/drm/i915/gt/intel_migrate.h
>   create mode 100644 drivers/gpu/drm/i915/gt/intel_migrate_types.h
>   create mode 100644 drivers/gpu/drm/i915/gt/selftest_migrate.c
> 
> diff --git a/drivers/gpu/drm/i915/Makefile b/drivers/gpu/drm/i915/Makefile
> index 16a5a006cf7c..95bd38e84625 100644
> --- a/drivers/gpu/drm/i915/Makefile
> +++ b/drivers/gpu/drm/i915/Makefile
> @@ -108,6 +108,7 @@ gt-y += \
>   	gt/intel_gtt.o \
>   	gt/intel_llc.o \
>   	gt/intel_lrc.o \
> +	gt/intel_migrate.o \
>   	gt/intel_mocs.o \
>   	gt/intel_ppgtt.o \
>   	gt/intel_rc6.o \
> diff --git a/drivers/gpu/drm/i915/gt/intel_engine.h b/drivers/gpu/drm/i915/gt/intel_engine.h
> index 36ea9eb52bb5..62f7440bc111 100644
> --- a/drivers/gpu/drm/i915/gt/intel_engine.h
> +++ b/drivers/gpu/drm/i915/gt/intel_engine.h
> @@ -188,6 +188,7 @@ intel_write_status_page(struct intel_engine_cs *engine, int reg, u32 value)
>   #define I915_GEM_HWS_PREEMPT_ADDR	(I915_GEM_HWS_PREEMPT * sizeof(u32))
>   #define I915_GEM_HWS_SEQNO		0x40
>   #define I915_GEM_HWS_SEQNO_ADDR		(I915_GEM_HWS_SEQNO * sizeof(u32))
> +#define I915_GEM_HWS_MIGRATE		(0x42 * sizeof(u32))
>   #define I915_GEM_HWS_SCRATCH		0x80
>   
>   #define I915_HWS_CSB_BUF0_INDEX		0x10
> diff --git a/drivers/gpu/drm/i915/gt/intel_gpu_commands.h b/drivers/gpu/drm/i915/gt/intel_gpu_commands.h
> index 2694dbb9967e..1c3af0fc0456 100644
> --- a/drivers/gpu/drm/i915/gt/intel_gpu_commands.h
> +++ b/drivers/gpu/drm/i915/gt/intel_gpu_commands.h
> @@ -123,8 +123,10 @@
>   #define   MI_SEMAPHORE_SAD_NEQ_SDD	(5 << 12)
>   #define   MI_SEMAPHORE_TOKEN_MASK	REG_GENMASK(9, 5)
>   #define   MI_SEMAPHORE_TOKEN_SHIFT	5
> +#define MI_STORE_DATA_IMM	MI_INSTR(0x20, 0)
>   #define MI_STORE_DWORD_IMM	MI_INSTR(0x20, 1)
>   #define MI_STORE_DWORD_IMM_GEN4	MI_INSTR(0x20, 2)
> +#define MI_STORE_QWORD_IMM_GEN8 (MI_INSTR(0x20, 3) | REG_BIT(21))
>   #define   MI_MEM_VIRTUAL	(1 << 22) /* 945,g33,965 */
>   #define   MI_USE_GGTT		(1 << 22) /* g4x+ */
>   #define MI_STORE_DWORD_INDEX	MI_INSTR(0x21, 1)
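
Not a comment on this hunk, just noting for readers: a store with the
new MI_STORE_QWORD_IMM_GEN8 opcode is header + 2 address dwords + 2
data dwords. Presumably the emit code later in the patch packs many
PTEs per instruction, but a single qword write would look roughly like
this (made-up helper name):

static int emit_store_qword(struct i915_request *rq, u64 addr, u64 value)
{
	u32 *cs;

	cs = intel_ring_begin(rq, 6);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	*cs++ = MI_STORE_QWORD_IMM_GEN8;
	*cs++ = lower_32_bits(addr);	/* qword-aligned ppGTT address */
	*cs++ = upper_32_bits(addr);
	*cs++ = lower_32_bits(value);
	*cs++ = upper_32_bits(value);
	*cs++ = MI_NOOP;		/* pad to an even number of dwords */

	intel_ring_advance(rq, cs);
	return 0;
}
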
> diff --git a/drivers/gpu/drm/i915/gt/intel_migrate.c b/drivers/gpu/drm/i915/gt/intel_migrate.c
> new file mode 100644
> index 000000000000..70776316863d
> --- /dev/null
> +++ b/drivers/gpu/drm/i915/gt/intel_migrate.c
> @@ -0,0 +1,544 @@
> +// SPDX-License-Identifier: MIT
> +/*
> + * Copyright © 2020 Intel Corporation
> + */
> +
> +#include "i915_drv.h"
> +#include "intel_context.h"
> +#include "intel_gpu_commands.h"
> +#include "intel_gt.h"
> +#include "intel_gtt.h"
> +#include "intel_migrate.h"
> +#include "intel_ring.h"
> +
> +struct insert_pte_data {
> +	u64 offset;
> +	bool is_lmem;
> +};
> +
> +#define CHUNK_SZ SZ_8M /* ~1ms at 8GiB/s preemption delay */
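
(Sanity checking the comment: copying CHUNK_SZ = 8 MiB at ~8 GiB/s is
about 1 ms, so the chunk size is what bounds how long the copy engine
stays non-preemptible per blit.)
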
> +
> +static bool engine_supports_migration(struct intel_engine_cs *engine)
> +{
> +	if (!engine)
> +		return false;
> +
> +	/*
> +	 * We need the ability to prevent arbitration (MI_ARB_ON_OFF),
> +	 * the ability to write PTE using inline data (MI_STORE_DATA)
> +	 * and of course the ability to do the block transfer (blits).
> +	 */
> +	GEM_BUG_ON(engine->class != COPY_ENGINE_CLASS);
> +
> +	return true;
> +}
> +
> +static void insert_pte(struct i915_address_space *vm,
> +		       struct i915_page_table *pt,
> +		       void *data)
> +{
> +	struct insert_pte_data *d = data;
> +
> +	vm->insert_page(vm, px_dma(pt), d->offset, I915_CACHE_NONE,
> +			d->is_lmem ? PTE_LM : 0);
> +	d->offset += PAGE_SIZE;
> +}
> +
> +static struct i915_address_space *migrate_vm(struct intel_gt *gt)
> +{
> +	struct i915_vm_pt_stash stash = {};
> +	struct i915_ppgtt *vm;
> +	int err;
> +	int i;
> +
> +	/*
> +	 * We construct a very special VM for use by all migration contexts,
> +	 * it is kept pinned so that it can be used at any time. As we need
> +	 * to pre-allocate the page directories for the migration VM, this
> +	 * limits us to only using a small number of prepared vma.
> +	 *
> +	 * To be able to pipeline and reschedule migration operations while
> +	 * avoiding unnecessary contention on the vm itself, the PTE updates
> +	 * are inline with the blits. All the blits use the same fixed
> +	 * addresses, with the backing store redirection being updated on the
> +	 * fly. Only 2 implicit vma are used for all migration operations.
> +	 *
> +	 * We lay the ppGTT out as:
> +	 *
> +	 *	[0, CHUNK_SZ) -> first object
> +	 *	[CHUNK_SZ, 2 * CHUNK_SZ) -> second object
> +	 *	[2 * CHUNK_SZ, 2 * CHUNK_SZ + 2 * CHUNK_SZ >> 9] -> PTE
> +	 *
> +	 * By exposing the dma addresses of the page directories themselves
> +	 * within the ppGTT, we are then able to rewrite the PTE prior to use.
> +	 * But the PTE update and subsequent migration operation must be atomic,
> +	 * i.e. within the same non-preemptible window so that we do not switch
> +	 * to another migration context that overwrites the PTE.
> +	 *
> +	 * TODO: Add support for huge LMEM PTEs
> +	 */
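
Maybe worth a note on the PTE window maths here: with 4KiB pages and
8-byte PTEs the window is (2 * CHUNK_SZ) >> 9 in size, and assuming
insert_pte() + the foreach() call below map the leaf page tables
contiguously in address order, the qword backing a given address in an
engine's chunk pair would sit at (purely illustrative, not from the
patch):

static u64 pte_window_addr(u64 base, u64 addr)
{
	/* one u64 PTE per 4KiB page, window placed after the two chunks */
	return base + 2 * CHUNK_SZ + ((addr - base) >> PAGE_SHIFT) * sizeof(u64);
}
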
> +
> +	vm = i915_ppgtt_create(gt);
> +	if (IS_ERR(vm))
> +		return ERR_CAST(vm);
> +
> +	if (!vm->vm.allocate_va_range || !vm->vm.foreach) {
> +		err = -ENODEV;
> +		goto err_vm;
> +	}
> +
> +	/*
> +	 * Each engine instance is assigned its own chunk in the VM, so
> +	 * that we can run multiple instances concurrently
> +	 */
> +	for (i = 0; i < ARRAY_SIZE(gt->engine_class[COPY_ENGINE_CLASS]); i++) {
> +		struct intel_engine_cs *engine;
> +		u64 base = (u64)i << 32;
> +		struct insert_pte_data d = {};
> +		struct i915_gem_ww_ctx ww;
> +		u64 sz;
> +
> +		engine = gt->engine_class[COPY_ENGINE_CLASS][i];
> +		if (!engine_supports_migration(engine))
> +			continue;
> +
> +		/*
> +		 * We copy in 8MiB chunks. Each PDE covers 2MiB, so we need
> +		 * 4x2 page directories for source/destination.
> +		 */
> +		sz = 2 * CHUNK_SZ;
> +		d.offset = base + sz;
> +
> +		/*
> +		 * We need another page directory setup so that we can write
> +		 * the 8x512 PTE in each chunk.
> +		 */
> +		sz += (sz >> 12) * sizeof(u64);

That was quite the mind warp. Spelling it out for my own sanity:
sz = 2 * CHUNK_SZ is 16MiB of VA for the two windows, which needs
(sz >> 12) = 4096 leaf PTEs of sizeof(u64) each, so the extra term is
the 32KiB of VA in which the page tables themselves get mapped (the
PTE window), letting the GPU rewrite the PTEs.

> +
> +		err = i915_vm_alloc_pt_stash(&vm->vm, &stash, sz);
> +		if (err)
> +			goto err_vm;
> +
> +		for_i915_gem_ww(&ww, err, true) {
> +			err = i915_vm_lock_objects(&vm->vm, &ww);
> +			if (err)
> +				continue;
> +			err = i915_vm_map_pt_stash(&vm->vm, &stash);
> +			if (err)
> +				continue;
> +
> +			vm->vm.allocate_va_range(&vm->vm, &stash, base, base + sz);

I think here we meant:
allocate_va_range(.., base, sz)

Otherwise this might explode for instance > 0, since allocate_va_range()
takes a start and a length, and the stash was only preallocated for sz
worth of VA (it only happens to work for instance 0 because base is 0
there).
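
i.e. something like (untested):

	vm->vm.allocate_va_range(&vm->vm, &stash, base, sz);
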

> +		}
> +		i915_vm_free_pt_stash(&vm->vm, &stash);
> +		if (err)
> +			goto err_vm;
> +
> +		/* Now allow the GPU to rewrite the PTE via its own ppGTT */
> +		d.is_lmem = i915_gem_object_is_lmem(vm->vm.scratch[0]);
> +		vm->vm.foreach(&vm->vm, base, base + sz, insert_pte, &d);
> +	}
> +
> +	return &vm->vm;
> +
> +err_vm:
> +	i915_vm_put(&vm->vm);
> +	return ERR_PTR(err);
> +}
> +

