[igt-dev] [PATCH i-g-t v4] i915/i915_pm_rps: Check impact of fence ordering on waitboosting

Thu Jul 28 14:04:59 UTC 2022

Hi,

On 28.07.2022 07:29, Kamil Konieczny wrote:
> Hi,
> 
> On 2022-07-27 at 14:27:02 +0200, Karolina Drobnik wrote:
>> Hi,
>>
>> On 26.07.2022 12:49, Kamil Konieczny wrote:
>>> Hi Karolina,
>>>
>>> few more nits, see below.
>>>
>>> On 2022-07-13 at 12:53:42 +0200, Karolina Drobnik wrote:
>>>> From: Chris Wilson <chris at chris-wilson.co.uk>
>>>>
>>>> Currently the wait boost heuristic is evaluated at the start of each
>>>> fence wait for a series within dma-resv. There is no strict ordering of
>>>> fences defined by dma-resv, and so it turns out that the same operation
>>>> under different circumstances can result in different heuristics being
>>>> applied, and dramatic performance variations in user applications.
>>>>
>>>> Link: https://gitlab.freedesktop.org/drm/intel/-/issues/6284
>>>> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
>>>> Signed-off-by: Karolina Drobnik <karolina.drobnik at intel.com>
>>>> ---
>>>> v4:
>>>>     - Added test descriptions with igt_describe()
>>>> v3:
>>>>     - Removed unnecessary calls to gem_write
>>>> v2:
>>>>     - Introduced batch_create() with arbitration points to prevent hangs
>>>>     - Added engine-order subtest
>>>>     - Added reporting of GPU frequency
>>>>     - Changed runtimes and switched prints from us to ms
>>>>
>>>>    tests/i915/i915_pm_rps.c | 261 ++++++++++++++++++++++++++++++++++++++-
>>>>    1 file changed, 255 insertions(+), 6 deletions(-)
>>>>
>>>> diff --git a/tests/i915/i915_pm_rps.c b/tests/i915/i915_pm_rps.c
>>>> index b37f15c2..6125ea60 100644
>>>> --- a/tests/i915/i915_pm_rps.c
>>>> +++ b/tests/i915/i915_pm_rps.c
>>>> @@ -37,8 +37,10 @@
>>>>    #include <sys/wait.h>
>>>>
>>>>    #include "i915/gem.h"
>>>> +#include "i915/gem_create.h"
>>>>    #include "igt.h"
>>>>    #include "igt_dummyload.h"
>>>> +#include "igt_perf.h"
>>>>    #include "igt_sysfs.h"
>>>>
>>>>    IGT_TEST_DESCRIPTION("Render P-States tests - verify GPU frequency changes");
>>>> @@ -614,14 +616,253 @@ static void waitboost(int fd, bool reset)
>>>>    	igt_assert_lt(post_freqs[CUR], post_freqs[MAX]);
>>>>    }
>>>>
>>>> +static uint32_t batch_create(int i915, uint64_t sz)
>>>> +{
>>>> +	const uint32_t bbe = MI_BATCH_BUFFER_END;
>>>> +	const uint32_t chk = 0x5 << 23;
>>>> +	uint32_t handle = gem_create(i915, sz);
>>>> +
>>>> +	for (uint64_t pg = 4096; pg + 4096 < sz; pg += 4096)
>>>> +		gem_write(i915, handle, pg, &chk, sizeof(chk));
>>>> +
>>>> +	gem_write(i915, handle, sz - sizeof(bbe), &bbe, sizeof(bbe));
>>>> +
>>>> +	return handle;
>>>> +}
>>>> +
>>>> +static uint64_t __fence_order(int i915,
>>>> +			      struct drm_i915_gem_exec_object2 *obj,
>>>> +			      struct drm_i915_gem_execbuffer2 *eb,
>>>> +			      uint64_t flags0, uint64_t flags1,
>>>> +			      double *outf)
>>>> +{
>>>> +	uint64_t before[2], after[2];
>>>> +	struct timespec tv;
>>>> +	int fd;
>>>> +
>>>> +	gem_quiescent_gpu(i915);
>>>> +	fd = perf_i915_open(i915, I915_PMU_ACTUAL_FREQUENCY);
>>>> +
>>>> +	igt_gettime(&tv);
>>>> +
>>>> +	obj->flags = flags0;
>>>> +	gem_execbuf(i915, eb);
>>>> +
>>>> +	obj->flags = flags1;
>>>> +	gem_execbuf(i915, eb);
>>>> +
>>>> +	read(fd, before, sizeof(before));
>>>> +	gem_sync(i915, obj->handle);
>>>> +	read(fd, after, sizeof(after));
>>>> +	close(fd);
>>>> +
>>>> +	after[0] -= before[0];
>>>> +	after[1] -= before[1];
>>>
>>> Put igt_assert here (check for zero before divide).
>>
>> Hmm, isn't that more of a belts and suspenders approach? It's unlikely we'd
>> get zero here. Also, don't we put asserts towards the end of the test
>> whenever possible?
> 
> It is userspace so we can afford some extra checks, so maybe
> 	if (after[1] == 0)
> 		return 0;

I agree that it won't cost much, but I just wonder if we're going to end 
up in such situation.

>>>> +
>>>> +	*outf = 1e9 * after[0] / after[1];
>>>> +	return igt_nsec_elapsed(&tv);
>>>> +}
>>>> +
>>>> +static void fence_order(int i915)
>>>> +{
>>>> +	const uint64_t sz = 512ull << 20;
>>>> +	struct drm_i915_gem_exec_object2 obj[2] = {
>>>> +		{ .handle = gem_create(i915, 4096) },
>>>> +		{ .handle = batch_create(i915, sz) },
>>>> +	};
>>>> +	struct drm_i915_gem_execbuffer2 execbuf = {
>>>> +		.buffers_ptr = to_user_pointer(obj),
>>>> +		.buffer_count = ARRAY_SIZE(obj),
>>>> +	};
>>>> +	uint64_t wr, rw;
>>>> +	int min, max;
>>>> +	double freq;
>>>> +	int sysfs;
>>>> +
>>>> +	/*
>>>> +	 * Check the order of fences found during GEM_WAIT does not affect
>>>> +	 * waitboosting.
>>>> +	 *
>>>> +	 * Internally, implicit fences are tracked within a dma-resv which
>>>> +	 * imposes no order on the individually fences tracked within. Since
>>>> +	 * there is no defined order, the sequence of waits (and the associated
>>>> +	 * waitboosts) is also undefined, undermining the consistency of the
>>>> +	 * waitboost heuristic.
>>>> +	 *
>>>> +	 * In particular, we can influence the sequence of fence storage
>>>> +	 * within dma-resv by mixing read/write semantics for implicit fences.
>>>> +	 * We can exploit this property of dma-resv to exercise that no matter
>>>> +	 * the stored order, the heuristic is applied consistently for the
>>>> +	 * user's GEM_WAIT ioctl.
>>>> +	 */
>>>> +
>>>> +	sysfs = igt_sysfs_open(i915);
>>>> +	min = igt_sysfs_get_u32(sysfs, "gt_RPn_freq_mhz");
>>>> +	max = igt_sysfs_get_u32(sysfs, "gt_RP0_freq_mhz");
>>>> +	igt_require(max > min);
>>>> +
>>>> +	/* Only allow ourselves to upclock via waitboosting */
>>>> +	igt_sysfs_printf(sysfs, "gt_min_freq_mhz", "%d", min);
>>>> +	igt_sysfs_printf(sysfs, "gt_max_freq_mhz", "%d", min);
>>>> +	igt_sysfs_printf(sysfs, "gt_boost_freq_mhz", "%d", max);
>>>> +
>>>> +	/* Warm up to bind the vma */
>>>> +	__fence_order(i915, &obj[0], &execbuf, 0, 0, &freq);
>>>> +
>>>> +	wr = __fence_order(i915, &obj[0], &execbuf, EXEC_OBJECT_WRITE, 0, &freq);
>>>> +	igt_info("Write-then-read: %.2fms @ %.3fMHz\n", wr * 1e-6, freq);
>>>> +
>>>> +	rw = __fence_order(i915, &obj[0], &execbuf, 0, EXEC_OBJECT_WRITE, &freq);
>>>> +	igt_info("Read-then-write: %.2fms @ %.3fMHz\n", rw * 1e-6, freq);
>>>> +
>>>> +	gem_close(i915, obj[0].handle);
>>>> +	gem_close(i915, obj[1].handle);
>>>
>>> Close sysfs here.
>>
>> Good catch. I'll close it before asserts (as agreed).
>>
> 
> My bad, this close should be after resetting max freq, two lines
> below. Btw why not reuse code from other tests, reading old freqs
> at beginnig of test or reusing freqs readed at fixture in
> igt_main ?

No worries. Hm, I'd say it's because of how the fixture is likely to 
change in the future.

>>>> +
>>>> +	igt_sysfs_printf(sysfs, "gt_min_freq_mhz", "%d", min);
>>>> +	igt_sysfs_printf(sysfs, "gt_max_freq_mhz", "%d", max);
>>>
>>> Here we should put previous values, not those readed from RPx,
>>> but if they are always the same it is ok.
>>
>> We restore the values we read from sysfs
>>
> 
> But we readed RPx values, not gt_min|gt_max freq ones, so here
> you can use origfreqs[MIN] and origfreqs[MAX] or declare in test
> 
> int old_freqs[NUMFREQ];
> 
> and read them with read_freqs(old_freqs) and then use for restore.
I'd keep it as it is

Many thanks,
Karolina

> Regards,
> Kamil
> 
>>>> +
>>>> +	igt_assert(4 * rw > 3 * wr && 4 * wr > 3 * rw);
>>>> +}
>>>> +
>>>> +static uint64_t __engine_order(int i915,
>>>> +			       struct drm_i915_gem_exec_object2 *obj,
>>>> +			       struct drm_i915_gem_execbuffer2 *eb,
>>>> +			       unsigned int *engines0,
>>>> +			       unsigned int *engines1,
>>>> +			       unsigned int num_engines,
>>>> +			       double *outf)
>>>> +{
>>>> +	uint64_t before[2], after[2];
>>>> +	struct timespec tv;
>>>> +	int fd;
>>>> +
>>>> +	gem_quiescent_gpu(i915);
>>>> +	fd = perf_i915_open(i915, I915_PMU_ACTUAL_FREQUENCY);
>>>> +
>>>> +	igt_gettime(&tv);
>>>> +
>>>> +	obj->flags = EXEC_OBJECT_WRITE;
>>>> +	for (unsigned int n = 0; n < num_engines; n++) {
>>>> +		eb->flags &= ~63ull;
>>>> +		eb->flags |= engines0[n];
>>>> +		gem_execbuf_wr(i915, eb);
>>>> +	}
>>>> +
>>>> +	obj->flags = 0;
>>>> +	for (unsigned int n = 0; n < num_engines; n++) {
>>>> +		eb->flags &= ~63ull;
>>>> +		eb->flags |= engines1[n];
>>>> +		gem_execbuf(i915, eb);
>>>> +	}
>>>> +
>>>> +	read(fd, before, sizeof(before));
>>>> +	gem_sync(i915, obj->handle);
>>>> +	read(fd, after, sizeof(after));
>>>> +	close(fd);
>>>> +
>>>> +	after[0] -= before[0];
>>>> +	after[1] -= before[1];
>>>
>>> Put igt_assert here (check for zero before divide).
>>>
>>>> +
>>>> +	*outf = 1e9 * after[0] / after[1];
>>>> +	return igt_nsec_elapsed(&tv);
>>>> +}
>>>> +
>>>> +static void engine_order(int i915)
>>>> +{
>>>> +	const uint64_t sz = 512ull << 20;
>>>> +	struct drm_i915_gem_exec_object2 obj[2] = {
>>>> +		{ .handle = gem_create(i915, 4096) },
>>>> +		{ .handle = batch_create(i915, sz) },
>>>> +	};
>>>> +	struct drm_i915_gem_execbuffer2 execbuf = {
>>>> +		.buffers_ptr = to_user_pointer(obj),
>>>> +		.buffer_count = ARRAY_SIZE(obj),
>>>> +	};
>>>> +	const struct intel_execution_engine2 *e;
>>>> +	unsigned int engines[2], reverse[2];
>>>> +	uint64_t forward, backward, both;
>>>> +	unsigned int num_engines;
>>>> +	const intel_ctx_t *ctx;
>>>> +	int min, max;
>>>> +	double freq;
>>>> +	int sysfs;
>>>> +
>>>> +	/*
>>>> +	 * Check the order of fences found during GEM_WAIT does not affect
>>>> +	 * waitboosting. (See fence_order())
>>>> +	 *
>>>> +	 * Another way we can manipulate the order of fences within the
>>>> +	 * dma-resv is through repeated use of the same contexts.
>>>> +	 */
>>>> +
>>>> +	num_engines = 0;
>>>> +	ctx = intel_ctx_create_all_physical(i915);
>>>> +	for_each_ctx_engine(i915, ctx, e) {
>>>> +		/*
>>>> +		 * Avoid using the cmdparser as it will try to allocate
>>>> +		 * a new shadow batch for each submission -> oom
>>>> +		 */
>>>> +		if (!gem_engine_has_mutable_submission(i915, e->class))
>>>> +			continue;
>>>> +
>>>> +		engines[num_engines++] = e->flags;
>>>> +		if (num_engines == ARRAY_SIZE(engines))
>>>> +			break;
>>>> +	}
>>>> +	igt_require(num_engines > 1);
>>>> +	for (unsigned int n = 0; n < num_engines; n++)
>>>> +		reverse[n] = engines[num_engines - n - 1];
>>>> +	execbuf.rsvd1 = ctx->id;
>>>> +
>>>> +	sysfs = igt_sysfs_open(i915);
>>>> +	min = igt_sysfs_get_u32(sysfs, "gt_RPn_freq_mhz");
>>>> +	max = igt_sysfs_get_u32(sysfs, "gt_RP0_freq_mhz");
>>>> +	igt_require(max > min);
>>>> +
>>>> +	/* Only allow ourselves to upclock via waitboosting */
>>>> +	igt_sysfs_printf(sysfs, "gt_min_freq_mhz", "%d", min);
>>>> +	igt_sysfs_printf(sysfs, "gt_max_freq_mhz", "%d", min);
>>>> +	igt_sysfs_printf(sysfs, "gt_boost_freq_mhz", "%d", max);
>>>> +
>>>> +	/* Warm up to bind the vma */
>>>> +	gem_execbuf(i915, &execbuf);
>>>> +
>>>> +	forward = __engine_order(i915, &obj[0], &execbuf,
>>>> +				 engines, engines, num_engines,
>>>> +				 &freq);
>>>> +	igt_info("Forwards: %.2fms @ %.3fMhz\n", forward * 1e-6, freq);
>>>> +
>>>> +	backward = __engine_order(i915, &obj[0], &execbuf,
>>>> +				  reverse, reverse, num_engines,
>>>> +				  &freq);
>>>> +	igt_info("Backwards: %.2fms @ %.3fMhz\n", backward * 1e-6, freq);
>>>> +
>>>> +	both = __engine_order(i915, &obj[0], &execbuf,
>>>> +			      engines, reverse, num_engines,
>>>> +			      &freq);
>>>> +	igt_info("Bidirectional: %.2fms @ %.3fMhz\n", both * 1e-6, freq);
>>>> +
>>>> +	gem_close(i915, obj[0].handle);
>>>> +	gem_close(i915, obj[1].handle);
>>>
>>> Close sysfs here.
>>
>> Ack
>>
>>>
>>>> +	intel_ctx_destroy(i915, ctx);
>>>> +
>>>> +	igt_sysfs_printf(sysfs, "gt_min_freq_mhz", "%d", min);
>>>> +	igt_sysfs_printf(sysfs, "gt_max_freq_mhz", "%d", max);
>>>> +
>>>> +	igt_assert(4 * forward > 3 * backward && 4 * backward > 3 * forward);
>>>> +	igt_assert(4 * forward > 3 * both && 4 * both > 3 * forward);
>>>> +}
>>>> +
>>>>    static void pm_rps_exit_handler(int sig)
>>>>    {
>>>> -	if (origfreqs[MIN] > readval(sysfs_files[MAX].filp)) {
>>>> -		writeval(sysfs_files[MAX].filp, origfreqs[MAX]);
>>>> -		writeval(sysfs_files[MIN].filp, origfreqs[MIN]);
>>>> -	} else {
>>>> -		writeval(sysfs_files[MIN].filp, origfreqs[MIN]);
>>>> -		writeval(sysfs_files[MAX].filp, origfreqs[MAX]);
>>>> +	if (sysfs_files[MAX].filp) {
>>>> +		if (origfreqs[MIN] > readval(sysfs_files[MAX].filp)) {
>>>> +			writeval(sysfs_files[MAX].filp, origfreqs[MAX]);
>>>> +			writeval(sysfs_files[MIN].filp, origfreqs[MIN]);
>>>> +		} else {
>>>> +			writeval(sysfs_files[MIN].filp, origfreqs[MIN]);
>>>> +			writeval(sysfs_files[MAX].filp, origfreqs[MAX]);
>>>> +		}
>>>
>>> This change looks unrelated. Please split this as separate patch
>>> and post as a fix.
>>
>> Will do so
>>
>> Many thanks,
>> Karolina
>>
>>> Regards,
>>> Kamil
>>>
>>>>    	}
>>>>
>>>>    	if (lh.igt_proc.running)
>>>> @@ -683,6 +924,14 @@ igt_main
>>>>    	igt_subtest("waitboost")
>>>>    		waitboost(drm_fd, false);
>>>>
>>>> +	igt_describe("Check if the order of fences does not affect waitboosting");
>>>> +	igt_subtest("fence-order")
>>>> +		fence_order(drm_fd);
>>>> +
>>>> +	igt_describe("Check if context reuse does not affect waitboosting");
>>>> +	igt_subtest("engine-order")
>>>> +		engine_order(drm_fd);
>>>> +
>>>>    	/* Test boost frequency after GPU reset */
>>>>    	igt_subtest("reset") {
>>>>    		igt_hang_t hang = igt_allow_hang(drm_fd, 0, 0);
>>>> --
>>>> 2.25.1