[Intel-gfx] [PATCH] drm/i915/selftests: Exercise CS TLB invalidation

Thu Sep 19 12:57:45 UTC 2019

Chris Wilson <chris at chris-wilson.co.uk> writes:

> Check that we are correctly invalidating the TLB at the start of a
> batch after updating the GTT.
>
> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
> Cc: Mika Kuoppala <mika.kuoppala at linux.intel.com>
> Cc: Daniele Ceraolo Spurio <daniele.ceraolospurio at intel.com>
> ---
>  drivers/gpu/drm/i915/selftests/i915_gem_gtt.c | 227 ++++++++++++++++++
>  1 file changed, 227 insertions(+)
>
> diff --git a/drivers/gpu/drm/i915/selftests/i915_gem_gtt.c b/drivers/gpu/drm/i915/selftests/i915_gem_gtt.c
> index 598c18d10640..f8709b332bd7 100644
> --- a/drivers/gpu/drm/i915/selftests/i915_gem_gtt.c
> +++ b/drivers/gpu/drm/i915/selftests/i915_gem_gtt.c
> @@ -25,13 +25,16 @@
>  #include <linux/list_sort.h>
>  #include <linux/prime_numbers.h>
>  
> +#include "gem/i915_gem_context.h"
>  #include "gem/selftests/mock_context.h"
> +#include "gt/intel_context.h"
>  
>  #include "i915_random.h"
>  #include "i915_selftest.h"
>  
>  #include "mock_drm.h"
>  #include "mock_gem_device.h"
> +#include "igt_flush_test.h"
>  
>  static void cleanup_freed_objects(struct drm_i915_private *i915)
>  {
> @@ -1705,6 +1708,229 @@ int i915_gem_gtt_mock_selftests(void)
>  	return err;
>  }
>  
> +static int context_sync(struct intel_context *ce)
> +{
> +	struct i915_request *rq;
> +	long timeout;
> +
> +	rq = intel_context_create_request(ce);
> +	if (IS_ERR(rq))
> +		return PTR_ERR(rq);
> +
> +	i915_request_get(rq);
> +	i915_request_add(rq);
> +
> +	timeout = i915_request_wait(rq, 0, HZ / 5);
> +	i915_request_put(rq);
> +
> +	return timeout < 0 ? -EIO : 0;
> +}
> +
> +static int submit_batch(struct intel_context *ce, u64 addr)
> +{
> +	struct i915_request *rq;
> +	int err;
> +
> +	rq = intel_context_create_request(ce);
> +	if (IS_ERR(rq))
> +		return PTR_ERR(rq);
> +
> +	err = 0;
> +	if (rq->engine->emit_init_breadcrumb) /* detect a hang */

for seqno write?

> +		err = rq->engine->emit_init_breadcrumb(rq);
> +	if (err == 0)
> +		err = rq->engine->emit_bb_start(rq, addr, 0, 0);
> +

In context_sync part we grabbed a reference. In here we
don't. I don't see how we can get away without it even
if we don't wait in here.

> +	i915_request_add(rq);
> +
> +	return err;
> +}
> +
> +static int igt_cs_tlb(void *arg)
> +{
> +	const unsigned int count = PAGE_SIZE / 64;
> +	const unsigned int chunk_size = count * PAGE_SIZE;
> +	struct drm_i915_private *i915 = arg;
> +	struct drm_i915_gem_object *bbe, *out;
> +	struct i915_gem_engines_iter it;
> +	struct i915_address_space *vm;
> +	struct i915_gem_context *ctx;
> +	struct intel_context *ce;
> +	struct drm_file *file;
> +	struct i915_vma *vma;
> +	unsigned int i;
> +	u32 *result;
> +	u32 *batch;
> +	int err = 0;
> +
> +	file = mock_file(i915);
> +	if (IS_ERR(file))
> +		return PTR_ERR(file);
> +
> +	mutex_lock(&i915->drm.struct_mutex);
> +	ctx = live_context(i915, file);
> +	if (IS_ERR(ctx)) {
> +		err = PTR_ERR(ctx);
> +		goto out_unlock;
> +	}
> +
> +	vm = ctx->vm;
> +	if (!vm)
> +		goto out_unlock;
> +
> +	bbe = i915_gem_object_create_internal(i915, PAGE_SIZE);
> +	if (IS_ERR(bbe)) {
> +		err = PTR_ERR(bbe);
> +		goto out_unlock;
> +	}
> +
> +	batch = i915_gem_object_pin_map(bbe, I915_MAP_WC);
> +	if (IS_ERR(batch)) {
> +		err = PTR_ERR(batch);
> +		goto out_bbe;
> +	}
> +	for (i = 0; i < count; i++) {
> +		u32 *cs = batch + i * 64 / sizeof(*cs);
> +		u64 addr = (vm->total - PAGE_SIZE) + i * sizeof(u32);
> +
> +		GEM_BUG_ON(INTEL_GEN(i915) < 6);
> +		cs[0] = MI_STORE_DWORD_IMM_GEN4;
> +		if (INTEL_GEN(i915) >= 8) {
> +			cs[1] = lower_32_bits(addr);
> +			cs[2] = upper_32_bits(addr);
> +			cs[3] = i;
> +			cs[4] = MI_NOOP;
> +			cs[5] = MI_BATCH_BUFFER_START_GEN8;
> +		} else {
> +			cs[1] = 0;
> +			cs[2] = lower_32_bits(addr);
> +			cs[3] = i;
> +			cs[4] = MI_NOOP;
> +			cs[5] = MI_BATCH_BUFFER_START;
> +		}
> +	}
> +
> +	out = i915_gem_object_create_internal(i915, PAGE_SIZE);
> +	if (IS_ERR(out)) {
> +		err = PTR_ERR(out);
> +		goto out_batch;
> +	}
> +	i915_gem_object_set_cache_coherency(out, I915_CACHING_CACHED);
> +
> +	vma = i915_vma_instance(out, vm, NULL);
> +	if (IS_ERR(vma)) {
> +		err = PTR_ERR(vma);
> +		goto out_batch;
> +	}
> +
> +	err = i915_vma_pin(vma, 0, 0,
> +			   PIN_USER |
> +			   PIN_OFFSET_FIXED |
> +			   (vm->total - PAGE_SIZE));
> +	if (err)
> +		goto out_out;

out_put?

Oh and we don't have to do anything with the instance on
error paths?

> +	GEM_BUG_ON(vma->node.start != vm->total - PAGE_SIZE);
> +
> +	result = i915_gem_object_pin_map(out, I915_MAP_WB);
> +	if (IS_ERR(result)) {
> +		err = PTR_ERR(result);
> +		goto out_out;
> +	}
> +
> +	for_each_gem_engine(ce, i915_gem_context_lock_engines(ctx), it) {
> +		IGT_TIMEOUT(end_time);
> +		unsigned long pass = 0;
> +
> +		if (!intel_engine_can_store_dword(ce->engine))
> +			continue;
> +
> +		while (!__igt_timeout(end_time, NULL)) {
> +			u64 offset;
> +
> +			offset = random_offset(0, vm->total - PAGE_SIZE,
> +					       chunk_size, PAGE_SIZE);
> +
> +			err = vm->allocate_va_range(vm, offset, chunk_size);
> +			if (err)
> +				goto end;
> +
> +			memset32(result, STACK_MAGIC, PAGE_SIZE / sizeof(u32));
> +
> +			vma = i915_vma_instance(bbe, vm, NULL);
> +			if (IS_ERR(vma)) {
> +				err = PTR_ERR(vma);
> +				goto end;
> +			}
> +
> +			err = vma->ops->set_pages(vma);
> +			if (err)
> +				goto end;
> +
> +			/* Replace the TLB with target batches */
> +			for (i = 0; i < count; i++) {
> +				u32 *cs = batch + i * 64 / sizeof(*cs);
> +				u64 addr;
> +
> +				vma->node.start = offset + i * PAGE_SIZE;

on previous loop, we have now primed the pte to tlb in here?

> +				vm->insert_entries(vm, vma, I915_CACHE_NONE, 0);

..now changing in it here...

> +
> +				addr = vma->node.start + i * 64;
> +				cs[4] = MI_NOOP;
> +				cs[6] = lower_32_bits(addr);
> +				cs[7] = upper_32_bits(addr);
> +				wmb();
> +
> +				err = submit_batch(ce, addr);

in hope that with this submission hardware will see the old one and
completely miss the store+bb start?

Perhaps rewiring a more dummy emit only to prove this case
is pushing it.

But I am curious to know if you did try it out by removing
the invalidate on the emits and managed to bring
out the missing writes?

-Mika

> +				if (err)
> +					goto end;
> +			}
> +
> +			yield();
> +			for (i = 0; i < count; i++)
> +				batch[i * 64 / sizeof(*batch) + 4] =
> +					MI_BATCH_BUFFER_END;
> +			wmb();
> +
> +			vma->ops->clear_pages(vma);
> +
> +			err = context_sync(ce);
> +			if (err) {
> +				pr_err("%s: writes timed out\n",
> +				       ce->engine->name);
> +				goto end;
> +			}
> +
> +			for (i = 0; i < count; i++) {
> +				if (result[i] != i) {
> +					pr_err("%s: Write lost on pass %lu, at offset %llx, index %d, found %x, expected %x\n",
> +					       ce->engine->name, pass,
> +					       offset, i, result[i], i);
> +					err = -EINVAL;
> +					goto end;
> +				}
> +			}
> +
> +			vm->clear_range(vm, offset, chunk_size);
> +			pass++;
> +		}
> +	}
> +end:
> +	if (igt_flush_test(i915, I915_WAIT_LOCKED))
> +		err = -EIO;
> +	i915_gem_context_unlock_engines(ctx);
> +	i915_gem_object_unpin_map(out);
> +out_out:
> +	i915_gem_object_put(out);
> +out_batch:
> +	i915_gem_object_unpin_map(bbe);
> +out_bbe:
> +	i915_gem_object_put(bbe);
> +out_unlock:
> +	mutex_unlock(&i915->drm.struct_mutex);
> +	mock_file_free(i915, file);
> +	return err;
> +}
> +
>  int i915_gem_gtt_live_selftests(struct drm_i915_private *i915)
>  {
>  	static const struct i915_subtest tests[] = {
> @@ -1722,6 +1948,7 @@ int i915_gem_gtt_live_selftests(struct drm_i915_private *i915)
>  		SUBTEST(igt_ggtt_pot),
>  		SUBTEST(igt_ggtt_fill),
>  		SUBTEST(igt_ggtt_page),
> +		SUBTEST(igt_cs_tlb),
>  	};
>  
>  	GEM_BUG_ON(offset_in_page(i915->ggtt.vm.total));
> -- 
> 2.23.0