[Intel-gfx] [PATCH 18/23] drm/i915/selftests: huge page tests

Chris Wilson chris at chris-wilson.co.uk
Mon Aug 28 14:36:41 UTC 2017


Excuse the piggy-backing, the original never arrived!

+static int cpu_check(struct drm_i915_gem_object *obj, u32 dword, u32 val)
+{
+	enum i915_map_type level;
+	int err;
+
+	for (level = I915_MAP_WB; level <= I915_MAP_WC; level++) {
+		u32 *map, offset;
+
+		if (level == I915_MAP_WB)
+			err = i915_gem_object_set_to_cpu_domain(obj, false);
+		else
+			err = i915_gem_object_set_to_wc_domain(obj, false);
+		if (err)
+			return err;
+
+		unmap_mapping(obj);
+		map = i915_gem_object_pin_map(obj, level);
+		if (IS_ERR(map))
+			return PTR_ERR(map);
+
+		for (offset = dword; offset < obj->base.size/sizeof(u32);
+		     offset += DWORDS_PER_PAGE) {
+			if (map[offset] != val) {
+				pr_err("map[%u] = %u, expected %u\n",
+				       offset, map[offset], val);
+				err = -EINVAL;
+				goto out_close;
+			}
+		}
+
+		i915_gem_object_unpin_map(obj);
+	}
+
+	return 0;
+
+out_close:
+	i915_gem_object_unpin_map(obj);
+
+	return err;
+}

We are testing the GTT layout, so we only need a single CPU check. Using
WC is going to be terrible for performance, and we really don't need to
vmap the whole object either, as we are just reading one dword per page.
Just a kmap and clflush (with a wait-for-rendering) is going to be a lot
quicker.
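
An untested sketch of what I mean (prepare_shmem_read does the
wait-for-rendering for us and reports whether we need the clflush):

static int cpu_check(struct drm_i915_gem_object *obj, u32 dword, u32 val)
{
	unsigned int needs_flush;
	unsigned long n;
	int err;

	err = i915_gem_obj_prepare_shmem_read(obj, &needs_flush);
	if (err)
		return err;

	for (n = 0; n < obj->base.size >> PAGE_SHIFT; n++) {
		u32 *ptr = kmap_atomic(i915_gem_object_get_page(obj, n));

		if (needs_flush & CLFLUSH_BEFORE)
			drm_clflush_virt_range(ptr, PAGE_SIZE);

		if (ptr[dword] != val) {
			pr_err("page[%lu].dword[%u] = %u, expected %u\n",
			       n, dword, ptr[dword], val);
			kunmap_atomic(ptr);
			err = -EINVAL;
			break;
		}

		kunmap_atomic(ptr);
	}

	i915_gem_obj_finish_shmem_access(obj);
	return err;
}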

+static struct i915_vma *
+gpu_write_dw(struct i915_vma *vma, u64 offset, u32 val)
+{
+	struct drm_i915_private *i915 = to_i915(vma->obj->base.dev);
+	const int gen = INTEL_GEN(vma->vm->i915);
+	unsigned int count = vma->size >> PAGE_SHIFT;
+	struct drm_i915_gem_object *obj;
+	struct i915_vma *batch;
+	unsigned int size;
+	u32 *cmd;
+	int n;
+	int err;
+
+	size = 1 + 4 * count * sizeof(u32);

Should be (1 + 4 * count) * sizeof(u32): four dwords for each per-page
write plus one for the terminating MI_BATCH_BUFFER_END. Likelihood of it
mattering: zero, as the allocation will be rounded up to a page anyway.

+static int gpu_write(struct i915_vma *vma,
+		     struct i915_gem_context *ctx,
+		     u32 dword,
+		     u32 value)
+{
...
+	err = rq->engine->emit_bb_start(rq,
+			batch->node.start, batch->node.size,
+			flags);
+	if (err)
+		goto err_request;
+
+	i915_vma_move_to_active(vma, rq, 0);

Should pass EXEC_OBJECT_WRITE. 
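i.e.

	i915_vma_move_to_active(vma, rq, EXEC_OBJECT_WRITE);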

+	reservation_object_lock(vma->resv, NULL);
+	reservation_object_add_excl_fence(vma->resv, &rq->fence);
+	reservation_object_unlock(vma->resv);

And remind me to finish moving this export into i915_vma_move_to_active();
I just need to be sure that we can't create any dependency loops.
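
The shape I have in mind is roughly (just a sketch, glossing over the
existing activity tracking and the shared-fence path for reads):

void i915_vma_move_to_active(struct i915_vma *vma,
			     struct drm_i915_gem_request *rq,
			     unsigned int flags)
{
	/* ... existing request/activity tracking ... */

	if (flags & EXEC_OBJECT_WRITE) {
		reservation_object_lock(vma->resv, NULL);
		reservation_object_add_excl_fence(vma->resv, &rq->fence);
		reservation_object_unlock(vma->resv);
	}
}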

+
+err_request:
+	__i915_add_request(rq, err == 0);
+
+	return err;
+}

+static int igt_write_huge(struct drm_i915_gem_object *obj)
+{
+	struct drm_i915_private *i915 = to_i915(obj->base.dev);
+	unsigned long supported = INTEL_INFO(i915)->page_size_mask;
+	struct i915_gem_context *ctx = i915->kernel_context;
+	struct i915_address_space *vm = ctx->ppgtt ? &ctx->ppgtt->base : &i915->ggtt.base;
+	unsigned int flags = PIN_USER | PIN_OFFSET_FIXED;
+	struct i915_vma *vma;
+	int bit, last;
+	int err;
+
+	GEM_BUG_ON(obj->base.size != SZ_2M);
+
+	err = i915_gem_object_pin_pages(obj);
+	if (err)
+		return err;
+
+	/* We want to run the test even if the platform doesn't support huge gtt
+	 * pages -- our only requirement is that we were able to allocate a
+	 * "huge-page".
+	 */
+	if (obj->mm.page_sizes.phys < I915_GTT_PAGE_SIZE_2M) {
+		pr_info("Unable to allocate huge-page, finishing test early\n");
+		goto out_unpin;
+	}
+
+	vma = i915_vma_instance(obj, vm, NULL);
+	if (IS_ERR(vma)) {
+		err = PTR_ERR(vma);
+		goto out_unpin;
+	}
+
+	last = ilog2(I915_GTT_PAGE_SIZE_2M);
+
+	for_each_set_bit(bit, &supported, last + 1) {
+		IGT_TIMEOUT(end_time);
+		unsigned int page_size = BIT(bit);
+		u32 max = vm->total / I915_GTT_PAGE_SIZE_2M - 1;
+		u32 num;
+
+		/* Force the page size */
+		vma->page_sizes.sg = obj->mm.page_sizes.sg = page_size;
+
+		/* Try various offsets until we timeout -- we want to avoid
+		 * issues hidden by effectively always using offset = 0.
+		 */
+		for_each_prime_number_from(num, 0, max) {
...
+
+			for (dword = 0; dword < DWORDS_PER_PAGE; ++dword) {

We want to focus on the GTT testing, so checking every offset in the page
is less important than checking the various GTT offsets.

If you use dword = offset_in_page(num) / 4 (rather than the loop), we
keep the useful check that a write into a different page through the GTT
ends up where we expect, whilst exercising a different cacheline on each
pass (and so are able to check 1024 times more GTT offsets).

+				err = gpu_write(vma, ctx, dword, num + 1);
+				if (err) {
+					pr_err("gpu_write failed with page-size %x\n",
+					       page_size);
+					i915_vma_unpin(vma);
+					goto out_close;
+				}
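
i.e. replacing the inner loop with something like:

	u32 dword = offset_in_page(num) / 4;

	/* Still one write per GTT offset, but a different dword (and
	 * cacheline) on every pass.
	 */
	err = gpu_write(vma, ctx, dword, num + 1);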


It looks like there is still a preponderance of GEM_BUG_ON -- these
should only be used for test bugs; we do want to clearly report errors
in the driver without oopsing. That also means we need plenty of pr_*()
calls giving all the vital information.
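
e.g. for reporting a driver error, something of this shape (a made-up
check against the new vma->page_sizes tracking):

	if (vma->page_sizes.gtt != page_size) {
		pr_err("bound vma with gtt page-size %x, expected %x\n",
		       vma->page_sizes.gtt, page_size);
		err = -EINVAL;
		goto out_close;
	}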

I'm still missing live testing that mixes page sizes within an object,
or at least I couldn't see any that quell my qualms over how we
populate the GTT (with an eye to later optimisations). The sketch I have
in my head is that we stitch together our objects using alloc_huge_page
given a permutation mask. Basically, I want to know that 1G, 1G + 4K,
4K + 1G, 1G, 64K*32, 64K*31 + 16*4K etc. all work.
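
Rough sketch of how the mock side could build those layouts (invented
names; no real backing store is needed to exercise the GTT insertion
paths, just dma segments of the right sizes and alignments):

static int fake_get_pages_mixed(struct drm_i915_gem_object *obj,
				const unsigned int *sizes,
				unsigned int n)
{
	struct sg_table *st;
	struct scatterlist *sg;
	unsigned int i;

	st = kmalloc(sizeof(*st), GFP_KERNEL);
	if (!st)
		return -ENOMEM;

	if (sg_alloc_table(st, n, GFP_KERNEL)) {
		kfree(st);
		return -ENOMEM;
	}

	/* One dma segment per requested page size, each "aligned" to
	 * its own size, so the GTT insertion sees exactly the layout
	 * we asked for (1G + 4K, 64K*31 + 16*4K, ...).
	 */
	for_each_sg(st->sgl, sg, n, i) {
		sg->offset = 0;
		sg->length = sizes[i];
		sg_dma_len(sg) = sizes[i];
		sg_dma_address(sg) = sizes[i];
	}

	obj->mm.madv = I915_MADV_DONTNEED;
	__i915_gem_object_set_pages(obj, st); /* plus the page-size bookkeeping */

	return 0;
}

Feed that each permutation, bind, and then check that vma->page_sizes.gtt
(reporting with pr_err, as above) is what we expect.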
-Chris

