[igt-dev] [PATCH i-g-t 2/3] test/perf: Add test for TGL OAR unit

Mon Nov 18 13:15:22 UTC 2019

Hey Umesh,

Just a few changes but otherwise it looks good.

Thanks,

-Lionel

On 12/11/2019 00:16, Umesh Nerlige Ramappa wrote:
> Add a test that measures work using MI-RPC for the specific context
> without using reports from the OA buffer.
>
> Tigerlake introduces an OA unit that measures work specific to render
> workloads. This means we do not have to rely on reports from the OA
> buffer to normalize the reports obtained from MI REPORT PERF COUNT
> anymore.
>
> Signed-off-by: Umesh Nerlige Ramappa <umesh.nerlige.ramappa at intel.com>
> ---
>   tests/perf.c | 353 +++++++++++++++++++++++++++++++++++++++++++++++++++
>   1 file changed, 353 insertions(+)
>
> diff --git a/tests/perf.c b/tests/perf.c
> index 2b6be134..b439e5bd 100644
> --- a/tests/perf.c
> +++ b/tests/perf.c
> @@ -210,6 +210,19 @@ static uint32_t (*read_report_ticks)(uint32_t *report,
>   static void (*sanity_check_reports)(uint32_t *oa_report0, uint32_t *oa_report1,
>   				    enum drm_i915_oa_format format);
>   
> +static void
> +dump_report(const uint32_t *report, uint32_t size, const char *message) {
> +	uint32_t i;
> +	igt_debug("%s\n", message);
> +	for (i = 0; i < size; i += 4) {
> +		igt_debug("%08x %08x %08x %08x\n",
> +				report[i],
> +				report[i + 1],
> +				report[i + 2],
> +				report[i + 3]);
> +	}
> +}
> +

I would put all the debug traces in a separate patch preceding this one.

>   static struct oa_format
>   get_oa_format(enum drm_i915_oa_format format)
>   {
> @@ -874,6 +887,7 @@ init_sys_info(void)
>   	igt_assert_neq(devid, 0);
>   
>   	timestamp_frequency = get_cs_timestamp_frequency();
> +	igt_debug("timestamp_frequency = %lu\n", timestamp_frequency);
>   	igt_assert_neq(timestamp_frequency, 0);
>   
>   	if (IS_HASWELL(devid)) {
> @@ -1289,6 +1303,7 @@ read_2_oa_reports(int format_id,
>   			igt_assert_eq(header->size, sample_size);
>   
>   			report = (const void *)(header + 1);
> +			dump_report(report, 64, "oa-formats");
>   
>   			igt_debug("read report: reason = %x, timestamp = %x, exponent mask=%x\n",
>   				  report[0], report[1], exponent_mask);
> @@ -2856,6 +2871,7 @@ test_mi_rpc(void)
>   	igt_assert_eq(ret, 0);
>   
>   	report32 = bo->virtual;
> +	dump_report(report32, 64, "mi-rpc");
>   	igt_assert_eq(report32[0], 0xdeadbeef); /* report ID */
>   	igt_assert_neq(report32[1], 0); /* timestamp */
>   
> @@ -3335,11 +3351,13 @@ gen8_test_single_ctx_render_target_writes_a_counter(void)
>   			prev = report0_32;
>   			ctx_id = prev[2];
>   			igt_debug("MI_RPC(start) CTX ID: %u\n", ctx_id);
> +			dump_report(report0_32, 64, "report0_32");
>   
>   			report1_32 = report0_32 + 64; /* 64 uint32_t = 256bytes offset */
>   			igt_assert_eq(report1_32[0], 0xbeefbeef); /* report ID */
>   			igt_assert_neq(report1_32[1], 0); /* timestamp */
>   			ctx1_id = report1_32[2];
> +			dump_report(report1_32, 64, "report1_32");
>   
>   			memset(accumulator.deltas, 0, sizeof(accumulator.deltas));
>   			accumulate_reports(&accumulator, report0_32, report1_32);
> @@ -3434,6 +3452,7 @@ gen8_test_single_ctx_render_target_writes_a_counter(void)
>   				igt_assert_eq(header->size, sample_size);
>   
>   				report = (void *)(header + 1);
> +				dump_report(report, 64, "OA report");
>   
>   				/* Don't expect zero for timestamps */
>   				igt_assert_neq(report[1], 0);
> @@ -3581,6 +3600,335 @@ gen8_test_single_ctx_render_target_writes_a_counter(void)
>   	} while (WEXITSTATUS(child_ret) == EAGAIN);
>   }
>   
> +static void gen12_single_ctx_helper(struct drm_i915_perf_open_param *param)
> +{
> +	uint64_t *ctx_id_ptr;
> +	drm_intel_bufmgr *bufmgr;
> +	drm_intel_context *context0, *context1;
> +	struct intel_batchbuffer *batch;
> +	struct igt_buf src[3], dst[3];
> +	drm_intel_bo *bo;
> +	uint32_t *report0_32, *report1_32, *report2_32, *report3_32;
> +	uint64_t timestamp0_64, timestamp1_64;
> +	uint32_t delta_ts64, delta_oa32;
> +	uint64_t delta_ts64_ns, delta_oa32_ns;
> +	uint32_t delta_delta;
> +	int width = 800;
> +	int height = 600;
> +#define INVALID_CTX_ID 0xffffffff
> +	uint32_t ctx0_id = INVALID_CTX_ID;
> +	uint32_t ctx1_id = INVALID_CTX_ID;
> +	int ret;
> +	struct accumulator accumulator = {
> +		.format = test_oa_format
> +	};
> +
> +	bufmgr = drm_intel_bufmgr_gem_init(drm_fd, 4096);
> +	drm_intel_bufmgr_gem_enable_reuse(bufmgr);
> +
> +	for (int i = 0; i < ARRAY_SIZE(src); i++) {
> +		scratch_buf_init(bufmgr, &src[i], width, height, 0xff0000ff);
> +		scratch_buf_init(bufmgr, &dst[i], width, height, 0x00ff00ff);
> +	}
> +
> +	batch = intel_batchbuffer_alloc(bufmgr, devid);
> +
> +	context0 = drm_intel_gem_context_create(bufmgr);
> +	igt_assert(context0);
> +
> +	context1 = drm_intel_gem_context_create(bufmgr);
> +	igt_assert(context1);
> +
> +	igt_debug("submitting warm up render_copy\n");
> +
> +	/* Submit some early, unmeasured, work to the context we want
> +	 * to measure to try and catch issues with i915-perf
> +	 * initializing the HW context ID for filtering.
> +	 *
> +	 * We do this because i915-perf single context filtering had
> +	 * previously only relied on a hook into context pinning to
> +	 * initialize the HW context ID, instead of also trying to
> +	 * determine the HW ID while opening the stream, in case it
> +	 * has already been pinned.
> +	 *
> +	 * This wasn't noticed by the previous unit test because we
> +	 * were opening the stream while the context hadn't been
> +	 * touched or pinned yet and so it worked out correctly to wait
> +	 * for the pinning hook.
> +	 *
> +	 * Now a buggy version of i915-perf will fail to measure
> +	 * anything for context0 once this initial render_copy() ends
> +	 * up pinning the context since there won't ever be a pinning
> +	 * hook callback.
> +	 */
> +	render_copy(batch, context0,
> +		    &src[0], 0, 0, width, height,
> +		    &dst[0], 0, 0);
> +
> +	/* Initialize the context parameter to the perf open ioctl here */
> +	ret = drm_intel_gem_context_get_id(context0, &ctx0_id);
> +	igt_assert_eq(ret, 0);
> +	igt_assert_neq(ctx0_id, 0xffffffff);
> +	ctx_id_ptr = (uint64_t *) from_user_pointer(param->properties_ptr);
> +	ctx_id_ptr[1] = ctx0_id;

This is a bit complicated; just move the struct drm_i915_perf_open_param 
param, etc. into this function.

> +
> +	igt_debug("opening i915-perf stream\n");
> +	stream_fd = __perf_open(drm_fd, param, false);
> +
> +	bo = drm_intel_bo_alloc(bufmgr, "mi_rpc dest bo", 4096, 64);
> +
> +	/* Set write domain to cpu briefly to fill the buffer with 80s */
> +	ret = drm_intel_bo_map(bo, true);
> +	igt_assert_eq(ret, 0);
> +	memset(bo->virtual, 0x80, 2048);
> +	memset(bo->virtual + 2048, 0, 2048);
> +	drm_intel_bo_unmap(bo);
> +
> +	/* Submit an mi-rpc to context0 before measurable work */
> +#define BO_TIMESTAMP_OFFSET0 1024
> +#define BO_REPORT_OFFSET0 0
> +#define BO_REPORT_ID0 0xdeadbeef
> +	emit_stall_timestamp_and_rpc(batch,
> +				     bo,
> +				     BO_TIMESTAMP_OFFSET0,
> +				     BO_REPORT_OFFSET0,
> +				     BO_REPORT_ID0);
> +	intel_batchbuffer_flush_with_context(batch, context0);
> +
> +	/* This is the work/context that is measured for counter increments */
> +	render_copy(batch, context0,
> +		    &src[0], 0, 0, width, height,
> +		    &dst[0], 0, 0);
> +	intel_batchbuffer_flush_with_context(batch, context0);
> +
> +	/* Submit an mi-rpc to context1 before work
> +	 *
> +	 * On gen12, this measurement should just yield counters that are
> +	 * all zeroes, since the counters will only increment for the
> +	 * context passed to perf open ioctl
> +	 */
> +#define BO_TIMESTAMP_OFFSET2 1040
> +#define BO_REPORT_OFFSET2 512
> +#define BO_REPORT_ID2 0x00c0ffee
> +	emit_stall_timestamp_and_rpc(batch,
> +				     bo,
> +				     BO_TIMESTAMP_OFFSET2,
> +				     BO_REPORT_OFFSET2,
> +				     BO_REPORT_ID2);
> +	intel_batchbuffer_flush_with_context(batch, context1);
> +
> +	/* Submit two copies on the other context to avoid a false
> +	 * positive in case the driver somehow ended up filtering for
> +	 * context1
> +	 */
> +	render_copy(batch, context1,
> +		    &src[1], 0, 0, width, height,
> +		    &dst[1], 0, 0);
> +	ret = drm_intel_gem_context_get_id(context1, &ctx1_id);
> +	igt_assert_eq(ret, 0);
> +	igt_assert_neq(ctx1_id, 0xffffffff);
> +
> +	render_copy(batch, context1,
> +		    &src[2], 0, 0, width, height,
> +		    &dst[2], 0, 0);
> +	intel_batchbuffer_flush_with_context(batch, context1);
> +
> +	/* Submit an mi-rpc to context1 after all work */
> +#define BO_TIMESTAMP_OFFSET3 1048
> +#define BO_REPORT_OFFSET3 768
> +#define BO_REPORT_ID3 0x01c0ffee
> +	emit_stall_timestamp_and_rpc(batch,
> +				     bo,
> +				     BO_TIMESTAMP_OFFSET3,
> +				     BO_REPORT_OFFSET3,
> +				     BO_REPORT_ID3);
> +	intel_batchbuffer_flush_with_context(batch, context1);
> +
> +	/* Submit an mi-rpc to context0 after all measurable work */
> +#define BO_TIMESTAMP_OFFSET1 1032
> +#define BO_REPORT_OFFSET1 256
> +#define BO_REPORT_ID1 0xbeefbeef
> +	emit_stall_timestamp_and_rpc(batch,
> +				     bo,
> +				     BO_TIMESTAMP_OFFSET1,
> +				     BO_REPORT_OFFSET1,
> +				     BO_REPORT_ID1);
> +	intel_batchbuffer_flush_with_context(batch, context0);
> +
> +	/* Set write domain to none */
> +	ret = drm_intel_bo_map(bo, false);
> +	igt_assert_eq(ret, 0);
> +
> +	/* Sanity check reports
> +	 * reportX_32[0]: report id passed with mi-rpc
> +	 * reportX_32[1]: timestamp
> +	 * reportX_32[2]: context id
> +	 *
> +	 * report0_32: start of measurable work
> +	 * report1_32: end of measurable work
> +	 * report2_32: start of other work
> +	 * report3_32: end of other work
> +	 */
> +	report0_32 = bo->virtual;
> +	igt_assert_eq(report0_32[0], 0xdeadbeef);
> +	igt_assert_neq(report0_32[1], 0);
> +	ctx0_id = report0_32[2];
> +	igt_debug("MI_RPC(start) CTX ID: %u\n", ctx0_id);
> +	dump_report(report0_32, 64, "report0_32");
> +
> +	report1_32 = report0_32 + 64;
> +	igt_assert_eq(report1_32[0], 0xbeefbeef);
> +	igt_assert_neq(report1_32[1], 0);
> +	ctx1_id = report1_32[2];
> +	dump_report(report1_32, 64, "report1_32");
> +
> +	/* Verify that counters in context1 are all zeroes */
> +	report2_32 = report0_32 + 128;
> +	igt_assert_eq(report2_32[0], 0x00c0ffee);
> +	igt_assert_neq(report2_32[1], 0);
> +	dump_report(report2_32, 64, "report2_32");
> +	igt_assert_eq(0, memcmp(&report2_32[4],
> +				bo->virtual + 2048,
> +				240));
> +
> +	report3_32 = report0_32 + 192;
> +	igt_assert_eq(report3_32[0], 0x01c0ffee);
> +	igt_assert_neq(report3_32[1], 0);
> +	dump_report(report3_32, 64, "report3_32");
> +	igt_assert_eq(0, memcmp(&report3_32[4],
> +				bo->virtual + 2048,
> +				240));
> +
> +	/* Accumulate deltas for counters - A0, A21 and A26 */
> +	memset(accumulator.deltas, 0, sizeof(accumulator.deltas));
> +	accumulate_reports(&accumulator, report0_32, report1_32);
> +	igt_debug("total: A0 = %"PRIu64", A21 = %"PRIu64", A26 = %"PRIu64"\n",
> +			accumulator.deltas[2 + 0],
> +			accumulator.deltas[2 + 21],
> +			accumulator.deltas[2 + 26]);
> +
> +	igt_debug("oa_timestamp32 0 = %u\n", report0_32[1]);
> +	igt_debug("oa_timestamp32 1 = %u\n", report1_32[1]);
> +	igt_debug("ctx_id 0 = %u\n", report0_32[2]);
> +	igt_debug("ctx_id 1 = %u\n", report1_32[2]);
> +
> +	/* The delta as calculated via the PIPE_CONTROL timestamp or
> +	 * the OA report timestamps should be almost identical but
> +	 * allow a 500 nanoseconds margin.
> +	 */
> +	timestamp0_64 = *(uint64_t *)(((uint8_t *)bo->virtual) + BO_TIMESTAMP_OFFSET0);
> +	timestamp1_64 = *(uint64_t *)(((uint8_t *)bo->virtual) + BO_TIMESTAMP_OFFSET1);
> +
> +	igt_debug("ts_timestamp64 0 = %"PRIu64"\n", timestamp0_64);
> +	igt_debug("ts_timestamp64 1 = %"PRIu64"\n", timestamp1_64);
> +
> +	delta_ts64 = timestamp1_64 - timestamp0_64;
> +	delta_oa32 = report1_32[1] - report0_32[1];
> +
> +	/* Sanity check that we can pass the delta to timebase_scale */
> +	igt_assert(delta_ts64 < UINT32_MAX);
> +	delta_oa32_ns = timebase_scale(delta_oa32);
> +	delta_ts64_ns = timebase_scale(delta_ts64);
> +
> +	igt_debug("oa32 delta = %u, = %uns\n",
> +			delta_oa32, (unsigned)delta_oa32_ns);
> +	igt_debug("ts64 delta = %u, = %uns\n",
> +			delta_ts64, (unsigned)delta_ts64_ns);
> +
> +	delta_delta = delta_ts64_ns > delta_oa32_ns ?
> +		      (delta_ts64_ns - delta_oa32_ns) :
> +		      (delta_oa32_ns - delta_ts64_ns);
> +	if (delta_delta > 500) {
> +		igt_debug("delta_delta exceeds margin, skipping..\n");
> +		exit(EAGAIN);
> +	}
> +
> +	igt_debug("n samples written = %"PRIu64"/%"PRIu64" (%ix%i)\n",
> +		  accumulator.deltas[2 + 21],
> +		  accumulator.deltas[2 + 26],
> +		  width, height);
> +	accumulator_print(&accumulator, "filtered");
> +
> +	/* Verify that the work actually happened by comparing the src
> +	 * and dst buffers
> +	 */
> +	ret = drm_intel_bo_map(src[0].bo, false);
> +	igt_assert_eq(ret, 0);
> +	ret = drm_intel_bo_map(dst[0].bo, false);
> +	igt_assert_eq(ret, 0);
> +
> +	ret = memcmp(src[0].bo->virtual, dst[0].bo->virtual, 4 * width * height);
> +	if (ret != 0) {
> +		accumulator_print(&accumulator, "total");
> +		exit(EAGAIN);
> +	}
> +
> +	drm_intel_bo_unmap(src[0].bo);
> +	drm_intel_bo_unmap(dst[0].bo);
> +
> +	/* Check that this test passed. The test measures the number of 2x2
> +	 * samples written to the render target using the counter A26. For
> +	 * OAR, this counter will only have increments relevant to this specific
> +	 * context. The value equals the width * height of the rendered work.
> +	 */
> +	igt_assert_eq(accumulator.deltas[2 + 26], width * height);
> +
> +	/* Clean up */
> +	for (int i = 0; i < ARRAY_SIZE(src); i++) {
> +		drm_intel_bo_unreference(src[i].bo);
> +		drm_intel_bo_unreference(dst[i].bo);
> +	}
> +
> +	drm_intel_bo_unmap(bo);
> +	drm_intel_bo_unreference(bo);
> +	intel_batchbuffer_free(batch);
> +	drm_intel_gem_context_destroy(context0);
> +	drm_intel_gem_context_destroy(context1);
> +	drm_intel_bufmgr_destroy(bufmgr);
> +	__perf_close(stream_fd);
> +}
> +
> +static void
> +gen12_test_single_ctx_render_target_writes_a_counter(void)
> +{
> +	uint64_t properties[] = {
> +		/* Have a random value here for the context id, but initialize
> +		 * it once you figure out the context ID for the work to be
> +		 * measured
> +		 */
> +		DRM_I915_PERF_PROP_CTX_HANDLE, UINT64_MAX,
> +
> +		/* OA unit configuration:
> +		 * DRM_I915_PERF_PROP_SAMPLE_OA is no longer required for Gen12
> +		 * because the OAR unit increments counters only for the
> +		 * relevant context. No other parameters are needed since we do
> +		 * not rely on the OA buffer anymore to normalize the counter
> +		 * values.
> +		 */
> +		DRM_I915_PERF_PROP_OA_METRICS_SET, test_metric_set_id,
> +		DRM_I915_PERF_PROP_OA_FORMAT, test_oa_format,
> +	};
> +	struct drm_i915_perf_open_param param = {
> +		.flags = I915_PERF_FLAG_FD_CLOEXEC,
> +		.num_properties = ARRAY_SIZE(properties) / 2,
> +		.properties_ptr = to_user_pointer(properties),
> +	};
> +	int child_ret;
> +	struct igt_helper_process child = {};
> +
> +	/* Ensure perf_stream_paranoid is set to 1 by default */
> +	write_u64_file("/proc/sys/dev/i915/perf_stream_paranoid", 1);
> +
> +	do {
> +		igt_fork_helper(&child) {

You need a call to igt_drop_root() here if you're checking the paranoid 
mode.

Otherwise the child process still runs as root, so the unprivileged path 
is never actually exercised.

> +			gen12_single_ctx_helper(&param);
> +		}
> +		child_ret = igt_wait_helper(&child);
> +		igt_assert(WEXITSTATUS(child_ret) == EAGAIN ||
> +			   WEXITSTATUS(child_ret) == 0);
> +	} while (WEXITSTATUS(child_ret) == EAGAIN);
> +}
> +
>   static unsigned long rc6_residency_ms(void)
>   {
>   	return sysfs_read("power/rc6_residency_ms");
> @@ -4206,6 +4554,11 @@ igt_main
>   		gen8_test_single_ctx_render_target_writes_a_counter();
>   	}
>   
> +	igt_subtest("gen12-unprivileged-single-ctx-counters") {
> +		igt_require(intel_gen(devid) >= 12);
> +		gen12_test_single_ctx_render_target_writes_a_counter();
> +	}
> +
>   	igt_subtest("rc6-disable")
>   		test_rc6_disable();
>