[igt-dev] [PATCH i-g-t v4 3/4] test/perf: Add test for TGL OAR unit
Umesh Nerlige Ramappa
umesh.nerlige.ramappa at intel.com
Thu Nov 21 17:26:25 UTC 2019
On Thu, Nov 21, 2019 at 03:26:58PM +0200, Lionel Landwerlin wrote:
>On 20/11/2019 02:07, Umesh Nerlige Ramappa wrote:
>>Add a test that measures work using MI-RPC for the specific context
>>without using reports from the OA buffer.
>>
>>Tigerlake introduces an OA unit that measures work specific to render
>>workloads. This means we do not have to rely on reports from the OA
>>buffer to normalize the reports obtained from MI REPORT PERF COUNT
>>anymore.
>>
>>v2:
>>- Add igt_drop_root to make the test run in non-privileged mode (Lionel)
>>- Move parameter to __perf_open inside the same function (Lionel)
>>
>>v3, v4:
>>- Add igt test description
>>
>>Signed-off-by: Umesh Nerlige Ramappa <umesh.nerlige.ramappa at intel.com>
>>---
>> tests/perf.c | 334 +++++++++++++++++++++++++++++++++++++++++++++++++++
>> 1 file changed, 334 insertions(+)
>>
>>diff --git a/tests/perf.c b/tests/perf.c
>>index 506b64ec..b80a95c7 100644
>>--- a/tests/perf.c
>>+++ b/tests/perf.c
>>@@ -3600,6 +3600,334 @@ gen8_test_single_ctx_render_target_writes_a_counter(void)
>> } while (WEXITSTATUS(child_ret) == EAGAIN);
>> }
>>+static void gen12_single_ctx_helper(void)
>>+{
>>+ uint64_t properties[] = {
>>+ /* Have a random value here for the context id, but initialize
>>+ * it once you figure out the context ID for the work to be
>>+ * measured
>>+ */
>>+ DRM_I915_PERF_PROP_CTX_HANDLE, UINT64_MAX,
>>+
>>+ /* OA unit configuration:
>>+ * DRM_I915_PERF_PROP_SAMPLE_OA is no longer required for Gen12
>>+ * because the OAR unit increments counters only for the
>>+ * relevant context. No other parameters are needed since we do
>>+ * not rely on the OA buffer anymore to normalize the counter
>>+ * values.
>>+ */
>>+ DRM_I915_PERF_PROP_OA_METRICS_SET, test_metric_set_id,
>>+ DRM_I915_PERF_PROP_OA_FORMAT, test_oa_format,
>>+ };
>>+ struct drm_i915_perf_open_param param = {
>>+ .flags = I915_PERF_FLAG_FD_CLOEXEC,
>>+ .num_properties = ARRAY_SIZE(properties) / 2,
>>+ .properties_ptr = to_user_pointer(properties),
>>+ };
>>+ drm_intel_bufmgr *bufmgr;
>>+ drm_intel_context *context0, *context1;
>>+ struct intel_batchbuffer *batch;
>>+ struct igt_buf src[3], dst[3];
>>+ drm_intel_bo *bo;
>>+ uint32_t *report0_32, *report1_32, *report2_32, *report3_32;
>>+ uint64_t timestamp0_64, timestamp1_64;
>>+ uint32_t delta_ts64, delta_oa32;
>>+ uint64_t delta_ts64_ns, delta_oa32_ns;
>>+ uint32_t delta_delta;
>>+ int width = 800;
>>+ int height = 600;
>>+#define INVALID_CTX_ID 0xffffffff
>>+ uint32_t ctx0_id = INVALID_CTX_ID;
>>+ uint32_t ctx1_id = INVALID_CTX_ID;
>>+ int ret;
>>+ struct accumulator accumulator = {
>>+ .format = test_oa_format
>>+ };
>>+
>>+ bufmgr = drm_intel_bufmgr_gem_init(drm_fd, 4096);
>>+ drm_intel_bufmgr_gem_enable_reuse(bufmgr);
>>+
>>+ for (int i = 0; i < ARRAY_SIZE(src); i++) {
>>+ scratch_buf_init(bufmgr, &src[i], width, height, 0xff0000ff);
>>+ scratch_buf_init(bufmgr, &dst[i], width, height, 0x00ff00ff);
>>+ }
>>+
>>+ batch = intel_batchbuffer_alloc(bufmgr, devid);
>>+
>>+ context0 = drm_intel_gem_context_create(bufmgr);
>>+ igt_assert(context0);
>>+
>>+ context1 = drm_intel_gem_context_create(bufmgr);
>>+ igt_assert(context1);
>>+
>>+ igt_debug("submitting warm up render_copy\n");
>>+
>>+ /* Submit some early, unmeasured, work to the context we want
>>+ * to measure to try and catch issues with i915-perf
>>+ * initializing the HW context ID for filtering.
>>+ *
>>+ * We do this because i915-perf single context filtering had
>>+ * previously only relied on a hook into context pinning to
>>+ * initialize the HW context ID, instead of also trying to
>>+ * determine the HW ID while opening the stream, in case it
>>+ * has already been pinned.
>>+ *
>>+ * This wasn't noticed by the previous unit test because we
>>+ * were opening the stream while the context hadn't been
>>+ * touched or pinned yet and so it worked out correctly to wait
>>+ * for the pinning hook.
>>+ *
>>+ * Now a buggy version of i915-perf will fail to measure
>>+ * anything for context0 once this initial render_copy() ends
>>+ * up pinning the context since there won't ever be a pinning
>>+ * hook callback.
>>+ */
>>+ render_copy(batch, context0,
>>+ &src[0], 0, 0, width, height,
>>+ &dst[0], 0, 0);
>>+
>>+ /* Initialize the context parameter to the perf open ioctl here */
>>+ ret = drm_intel_gem_context_get_id(context0, &ctx0_id);
>>+ igt_assert_eq(ret, 0);
>>+ igt_assert_neq(ctx0_id, 0xffffffff);
>>+ properties[1] = ctx0_id;
>>+
>>+ igt_debug("opening i915-perf stream\n");
>>+ stream_fd = __perf_open(drm_fd, &param, false);
>>+
>>+ bo = drm_intel_bo_alloc(bufmgr, "mi_rpc dest bo", 4096, 64);
>>+
>>+ /* Set write domain to cpu briefly to fill the buffer with 80s */
>>+ ret = drm_intel_bo_map(bo, true);
>>+ igt_assert_eq(ret, 0);
>>+ memset(bo->virtual, 0x80, 2048);
>>+ memset(bo->virtual + 2048, 0, 2048);
>>+ drm_intel_bo_unmap(bo);
>>+
>>+ /* Submit an mi-rpc to context0 before measurable work */
>>+#define BO_TIMESTAMP_OFFSET0 1024
>>+#define BO_REPORT_OFFSET0 0
>>+#define BO_REPORT_ID0 0xdeadbeef
>>+ emit_stall_timestamp_and_rpc(batch,
>>+ bo,
>>+ BO_TIMESTAMP_OFFSET0,
>>+ BO_REPORT_OFFSET0,
>>+ BO_REPORT_ID0);
>>+ intel_batchbuffer_flush_with_context(batch, context0);
>>+
>>+ /* This is the work/context that is measured for counter increments */
>>+ render_copy(batch, context0,
>>+ &src[0], 0, 0, width, height,
>>+ &dst[0], 0, 0);
>>+ intel_batchbuffer_flush_with_context(batch, context0);
>>+
>>+ /* Submit an mi-rpc to context1 before work
>>+ *
>>+ * On gen12, this measurement should just yield counters that are
>>+ * all zeroes, since the counters will only increment for the
>>+ * context passed to perf open ioctl
>>+ */
>>+#define BO_TIMESTAMP_OFFSET2 1040
>>+#define BO_REPORT_OFFSET2 512
>>+#define BO_REPORT_ID2 0x00c0ffee
>>+ emit_stall_timestamp_and_rpc(batch,
>>+ bo,
>>+ BO_TIMESTAMP_OFFSET2,
>>+ BO_REPORT_OFFSET2,
>>+ BO_REPORT_ID2);
>>+ intel_batchbuffer_flush_with_context(batch, context1);
>>+
>>+ /* Submit two copies on the other context to avoid a false
>>+ * positive in case the driver somehow ended up filtering for
>>+ * context1
>>+ */
>>+ render_copy(batch, context1,
>>+ &src[1], 0, 0, width, height,
>>+ &dst[1], 0, 0);
>>+ ret = drm_intel_gem_context_get_id(context1, &ctx1_id);
>>+ igt_assert_eq(ret, 0);
>>+ igt_assert_neq(ctx1_id, 0xffffffff);
>>+
>>+ render_copy(batch, context1,
>>+ &src[2], 0, 0, width, height,
>>+ &dst[2], 0, 0);
>>+ intel_batchbuffer_flush_with_context(batch, context1);
>>+
>>+ /* Submit an mi-rpc to context1 after all work */
>>+#define BO_TIMESTAMP_OFFSET3 1048
>>+#define BO_REPORT_OFFSET3 768
>>+#define BO_REPORT_ID3 0x01c0ffee
>>+ emit_stall_timestamp_and_rpc(batch,
>>+ bo,
>>+ BO_TIMESTAMP_OFFSET3,
>>+ BO_REPORT_OFFSET3,
>>+ BO_REPORT_ID3);
>>+ intel_batchbuffer_flush_with_context(batch, context1);
>>+
>>+ /* Submit an mi-rpc to context0 after all measurable work */
>>+#define BO_TIMESTAMP_OFFSET1 1032
>>+#define BO_REPORT_OFFSET1 256
>>+#define BO_REPORT_ID1 0xbeefbeef
>>+ emit_stall_timestamp_and_rpc(batch,
>>+ bo,
>>+ BO_TIMESTAMP_OFFSET1,
>>+ BO_REPORT_OFFSET1,
>>+ BO_REPORT_ID1);
>>+ intel_batchbuffer_flush_with_context(batch, context0);
>>+
>>+ /* Set write domain to none */
>>+ ret = drm_intel_bo_map(bo, false);
>>+ igt_assert_eq(ret, 0);
>>+
>>+ /* Sanity check reports
>>+ * reportX_32[0]: report id passed with mi-rpc
>>+ * reportX_32[1]: timestamp
>>+ * reportX_32[2]: context id
>>+ *
>>+ * report0_32: start of measurable work
>>+ * report1_32: end of measurable work
>>+ * report2_32: start of other work
>>+ * report3_32: end of other work
>>+ */
>>+ report0_32 = bo->virtual;
>>+ igt_assert_eq(report0_32[0], 0xdeadbeef);
>>+ igt_assert_neq(report0_32[1], 0);
>>+ ctx0_id = report0_32[2];
>>+ igt_debug("MI_RPC(start) CTX ID: %u\n", ctx0_id);
>>+ dump_report(report0_32, 64, "report0_32");
>>+
>>+ report1_32 = report0_32 + 64;
>>+ igt_assert_eq(report1_32[0], 0xbeefbeef);
>>+ igt_assert_neq(report1_32[1], 0);
>>+ ctx1_id = report1_32[2];
>>+ dump_report(report1_32, 64, "report1_32");
>>+
>>+ /* Verify that counters in context1 are all zeroes */
>>+ report2_32 = report0_32 + 128;
>>+ igt_assert_eq(report2_32[0], 0x00c0ffee);
>>+ igt_assert_neq(report2_32[1], 0);
>>+ dump_report(report2_32, 64, "report2_32");
>>+ igt_assert_eq(0, memcmp(&report2_32[4],
>>+ bo->virtual + 2048,
>>+ 240));
memcmp here ^
>>+
>>+ report3_32 = report0_32 + 192;
>>+ igt_assert_eq(report3_32[0], 0x01c0ffee);
>>+ igt_assert_neq(report3_32[1], 0);
>>+ dump_report(report3_32, 64, "report3_32");
>>+ igt_assert_eq(0, memcmp(&report3_32[4],
>>+ bo->virtual + 2048,
>>+ 240));
memcmp here ^
>>+
>>+ /* Accumulate deltas for counters - A0, A21 and A26 */
>>+ memset(accumulator.deltas, 0, sizeof(accumulator.deltas));
>>+ accumulate_reports(&accumulator, report0_32, report1_32);
>>+ igt_debug("total: A0 = %"PRIu64", A21 = %"PRIu64", A26 = %"PRIu64"\n",
>>+ accumulator.deltas[2 + 0],
>>+ accumulator.deltas[2 + 21],
>>+ accumulator.deltas[2 + 26]);
>
>
>What delta do you see for the context that is not given to i915-perf?
No delta, and all counters are zeroes as well. The two memcmp checks marked
above verify that all counters are zeroes.
Thanks,
Umesh
>
>Asking because I want to verify that we can actually allow non
>privileged context to open without other context seeing global values
>(i.e. non saved/restored) in OAR.
>
>
>-Lionel
>
>>+
>>+ igt_debug("oa_timestamp32 0 = %u\n", report0_32[1]);
>>+ igt_debug("oa_timestamp32 1 = %u\n", report1_32[1]);
>>+ igt_debug("ctx_id 0 = %u\n", report0_32[2]);
>>+ igt_debug("ctx_id 1 = %u\n", report1_32[2]);
>>+
>>+ /* The delta as calculated via the PIPE_CONTROL timestamp or
>>+ * the OA report timestamps should be almost identical but
>>+ * allow a 500 nanoseconds margin.
>>+ */
>>+ timestamp0_64 = *(uint64_t *)(((uint8_t *)bo->virtual) + BO_TIMESTAMP_OFFSET0);
>>+ timestamp1_64 = *(uint64_t *)(((uint8_t *)bo->virtual) + BO_TIMESTAMP_OFFSET1);
>>+
>>+ igt_debug("ts_timestamp64 0 = %"PRIu64"\n", timestamp0_64);
>>+ igt_debug("ts_timestamp64 1 = %"PRIu64"\n", timestamp1_64);
>>+
>>+ delta_ts64 = timestamp1_64 - timestamp0_64;
>>+ delta_oa32 = report1_32[1] - report0_32[1];
>>+
>>+ /* Sanity check that we can pass the delta to timebase_scale */
>>+ igt_assert(delta_ts64 < UINT32_MAX);
>>+ delta_oa32_ns = timebase_scale(delta_oa32);
>>+ delta_ts64_ns = timebase_scale(delta_ts64);
>>+
>>+ igt_debug("oa32 delta = %u, = %uns\n",
>>+ delta_oa32, (unsigned)delta_oa32_ns);
>>+ igt_debug("ts64 delta = %u, = %uns\n",
>>+ delta_ts64, (unsigned)delta_ts64_ns);
>>+
>>+ delta_delta = delta_ts64_ns > delta_oa32_ns ?
>>+ (delta_ts64_ns - delta_oa32_ns) :
>>+ (delta_oa32_ns - delta_ts64_ns);
>>+ if (delta_delta > 500) {
>>+ igt_debug("delta_delta exceeds margin, skipping..\n");
>>+ exit(EAGAIN);
>>+ }
>>+
>>+ igt_debug("n samples written = %"PRIu64"/%"PRIu64" (%ix%i)\n",
>>+ accumulator.deltas[2 + 21],
>>+ accumulator.deltas[2 + 26],
>>+ width, height);
>>+ accumulator_print(&accumulator, "filtered");
>>+
>>+ /* Verify that the work actually happened by comparing the src
>>+ * and dst buffers
>>+ */
>>+ ret = drm_intel_bo_map(src[0].bo, false);
>>+ igt_assert_eq(ret, 0);
>>+ ret = drm_intel_bo_map(dst[0].bo, false);
>>+ igt_assert_eq(ret, 0);
>>+
>>+ ret = memcmp(src[0].bo->virtual, dst[0].bo->virtual, 4 * width * height);
>>+ if (ret != 0) {
>>+ accumulator_print(&accumulator, "total");
>>+ exit(EAGAIN);
>>+ }
>>+
>>+ drm_intel_bo_unmap(src[0].bo);
>>+ drm_intel_bo_unmap(dst[0].bo);
>>+
>>+ /* Check that this test passed. The test measures the number of 2x2
>>+ * samples written to the render target using the counter A26. For
>>+ * OAR, this counter will only have increments relevant to this specific
>>+ * context. The value equals the width * height of the rendered work.
>>+ */
>>+ igt_assert_eq(accumulator.deltas[2 + 26], width * height);
>>+
>>+ /* Clean up */
>>+ for (int i = 0; i < ARRAY_SIZE(src); i++) {
>>+ drm_intel_bo_unreference(src[i].bo);
>>+ drm_intel_bo_unreference(dst[i].bo);
>>+ }
>>+
>>+ drm_intel_bo_unmap(bo);
>>+ drm_intel_bo_unreference(bo);
>>+ intel_batchbuffer_free(batch);
>>+ drm_intel_gem_context_destroy(context0);
>>+ drm_intel_gem_context_destroy(context1);
>>+ drm_intel_bufmgr_destroy(bufmgr);
>>+ __perf_close(stream_fd);
>>+}
>>+
>>+static void
>>+gen12_test_single_ctx_render_target_writes_a_counter(void)
>>+{
>>+ int child_ret;
>>+ struct igt_helper_process child = {};
>>+
>>+ /* Ensure perf_stream_paranoid is set to 1 by default */
>>+ write_u64_file("/proc/sys/dev/i915/perf_stream_paranoid", 1);
>>+
>>+ do {
>>+ igt_fork_helper(&child) {
>>+ igt_drop_root();
>>+ gen12_single_ctx_helper();
>>+ }
>>+ child_ret = igt_wait_helper(&child);
>>+ igt_assert(WEXITSTATUS(child_ret) == EAGAIN ||
>>+ WEXITSTATUS(child_ret) == 0);
>>+ } while (WEXITSTATUS(child_ret) == EAGAIN);
>>+}
>>+
>> static unsigned long rc6_residency_ms(void)
>> {
>> return sysfs_read("power/rc6_residency_ms");
>>@@ -4225,6 +4553,12 @@ igt_main
>> gen8_test_single_ctx_render_target_writes_a_counter();
>> }
>>+ igt_describe("Measure performance for a specific context using OAR in Gen 12");
>>+ igt_subtest("gen12-unprivileged-single-ctx-counters") {
>>+ igt_require(intel_gen(devid) >= 12);
>>+ gen12_test_single_ctx_render_target_writes_a_counter();
>>+ }
>>+
>> igt_subtest("rc6-disable")
>> test_rc6_disable();
>
>
More information about the igt-dev
mailing list