[igt-dev] [PATCH i-g-t v3] tests/i915/gem_ppgtt: verify GTT eviction with contended locks

Wed Dec 21 15:59:54 UTC 2022

On 12/20/2022 12:46 PM, Matthew Auld wrote:
> We should still be able to GTT evict objects during execbuf (old
> bindings can linger around), even if there is object lock contention. In
> the worst case the execbuf should just wait on the contented locks.
> Returning -ENOSPC smells like a regression from past behaviour, and
> seems to break userspace.
>
> v2:
>    - Add coverage for explicit softpin
>    - Add timeout for the spinner
> v3:
>    - Improve the test description
>
> References: https://gitlab.freedesktop.org/drm/intel/-/issues/7570
> Signed-off-by: Matthew Auld <matthew.auld at intel.com>
> Cc: Andrzej Hajda <andrzej.hajda at intel.com>
> Cc: Nirmoy Das <nirmoy.das at intel.com>
> Cc: Mani Milani <mani at chromium.org>
> ---
>   tests/i915/gem_ppgtt.c | 133 +++++++++++++++++++++++++++++++++++++++++
>   1 file changed, 133 insertions(+)
>
> diff --git a/tests/i915/gem_ppgtt.c b/tests/i915/gem_ppgtt.c
> index 9673ce22..024e8d47 100644
> --- a/tests/i915/gem_ppgtt.c
> +++ b/tests/i915/gem_ppgtt.c
> @@ -255,6 +255,131 @@ static void flink_and_close(void)
>   	close(fd2);
>   }
>   
> +#define PAGE_SIZE 4096
> +
> +static uint32_t batch_create_size(int fd, uint64_t size)
> +{
> +	const uint32_t bbe = MI_BATCH_BUFFER_END;
> +	uint32_t handle;
> +
> +	handle = gem_create(fd, size);
> +	gem_write(fd, handle, 0, &bbe, sizeof(bbe));
> +
> +	return handle;
> +}
> +
> +#define IGT_USE_ANY	0x1
> +#define IGT_USE_PINNED	0x2
> +static void upload(int fd, uint32_t handle, uint32_t in_fence, uint32_t ctx_id,
> +		   unsigned int flags)
> +{
> +	struct drm_i915_gem_exec_object2 exec[2] = {};
> +	struct drm_i915_gem_execbuffer2 execbuf = {
> +		.buffers_ptr = to_user_pointer(&exec),
> +		.buffer_count = 1,
> +		.rsvd1 = ctx_id,
> +	};
> +
> +	if (in_fence) {
> +		execbuf.rsvd2 = in_fence;
> +		execbuf.flags = I915_EXEC_FENCE_IN;
> +	}
> +
> +	exec[0].handle = handle;
> +	exec[0].flags = EXEC_OBJECT_SUPPORTS_48B_ADDRESS;
> +
> +	if (flags & IGT_USE_PINNED)
> +		exec[0].flags |= EXEC_OBJECT_PINNED; /* offset = 0 */
> +
> +	if (flags & IGT_USE_ANY) {
> +		exec[0].flags |= EXEC_OBJECT_PAD_TO_SIZE;
> +		exec[0].pad_to_size = gem_aperture_size(fd);
> +	}
> +
> +	gem_execbuf(fd, &execbuf);
> +}
> +
> +static void shrink_vs_evict(unsigned int flags)
> +{
> +	const unsigned int nproc = sysconf(_SC_NPROCESSORS_ONLN) + 1;
> +	const uint64_t timeout_5s = 5000000000LL;

5*NSEC_PER_SEC would be nice.

> +	int fd = drm_open_driver(DRIVER_INTEL);
> +	uint64_t ahnd = get_reloc_ahnd(fd, 0);
> +	const intel_ctx_t *ctx_arr[nproc];
> +	igt_spin_t *spinner;
> +	uint32_t handle1;
> +	int i;
> +
> +	/*
> +	 * Try to simulate some nasty object lock contention during GTT
> +	 * eviction. Create a BO and bind across several different VMs.  Invoke
> +	 * the shrinker on that shared BO, followed by triggering GTT eviction
> +	 * across all VMs.  Both require the object lock to make forward
> +	 * progress when trying to unbind the BO, but the shrinker will be
> +	 * blocked by the spinner (until killed).  Once the spinner is killed
> +	 * the shrinker should be able to unbind the object and drop the object
> +	 * lock, and GTT eviction should eventually succeed. At no point should
> +	 * we see -ENOSPC from the execbuf, even if we can't currently grab the
> +	 * object lock.
> +	 */
> +
> +	igt_require(gem_uses_full_ppgtt(fd));
> +
> +	igt_drop_caches_set(fd, DROP_ALL);
> +
> +	handle1 = gem_create(fd, PAGE_SIZE);
> +
> +	spinner = igt_spin_new(fd,
> +			       .ahnd = ahnd,
> +			       .flags = IGT_SPIN_FENCE_OUT);
> +	igt_spin_set_timeout(spinner, timeout_5s);
> +
> +	/*
> +	 * Create several VMs to ensure we don't block on the same vm lock. The
> +	 * goal of the test is to ensure that object lock contention doesn't
> +	 * somehow result in -ENOSPC from execbuf, if we need to trigger GTT
> +	 * eviction.
> +	 */
> +	for (i = 0; i < nproc; i++) {
> +		ctx_arr[i] = intel_ctx_create(fd, NULL);
> +
> +		upload(fd, handle1, spinner->execbuf.rsvd2 >> 32,
> +		       ctx_arr[i]->id, flags);
> +	}
> +
> +	igt_fork(child, 1)
> +		igt_drop_caches_set(fd, DROP_ALL);
> +
> +	sleep(2); /* Give the shrinker time to find handle1 */
> +
> +	igt_fork(child, nproc) {
> +		uint32_t handle2 = gem_create(fd, PAGE_SIZE);
> +
> +		/*
> +		 * One of these forks will be stuck on the vm mutex, since the
> +		 * shrinker is holding it (along with the object lock) while
> +		 * trying to unbind the chosen vma, but is blocked by the
> +		 * spinner. The rest should only block waiting to grab the
> +		 * object lock for handle1, before then trying to GTT evict it
> +		 * from their respective vm. In either case the contention of
> +		 * the vm->mutex or object lock should never result in -ENOSPC
> +		 * or some other error.
> +		 */
> +		handle2 = batch_create_size(fd, PAGE_SIZE);

This can be

uint32_t handle2 = batch_create_size(fd, PAGE_SIZE);

Above gem_create seems unnecessary. With those

Reviewed-by: Nirmoy Das <nirmoy.das at intel.com>

> +
> +		upload(fd, handle2, 0, ctx_arr[child]->id, flags);
> +		gem_close(fd, handle2);
> +	}
> +
> +	igt_waitchildren();
> +	igt_spin_free(fd, spinner);
> +
> +	for (i = 0; i < nproc; i++)
> +		intel_ctx_destroy(fd, ctx_arr[i]);
> +
> +	gem_close(fd, handle1);
> +}
> +
>   static bool has_contexts(void)
>   {
>   	bool result;
> @@ -331,4 +456,12 @@ igt_main
>   
>   	igt_subtest("flink-and-close-vma-leak")
>   		flink_and_close();
> +
> +	igt_describe("Regression test to verify GTT eviction can't randomly fail due to object lock contention");
> +	igt_subtest_group {
> +		igt_subtest("shrink-vs-evict-any")
> +			shrink_vs_evict(IGT_USE_ANY);
> +		igt_subtest("shrink-vs-evict-pinned")
> +			shrink_vs_evict(IGT_USE_PINNED);
> +	}
>   }