[PATCH i-g-t 16/28] tests/intel/xe_oa: OAR/OAC tests
Umesh Nerlige Ramappa
umesh.nerlige.ramappa at intel.com
Thu Jun 20 23:46:36 UTC 2024
On Thu, Jun 20, 2024 at 01:00:41PM -0700, Ashutosh Dixit wrote:
>"mi-rpc", "oa-tlb-invalidate" and "unprivileged-single-ctx-counters" tests.
>
>Signed-off-by: Ashutosh Dixit <ashutosh.dixit at intel.com>
Reviewed-by: Umesh Nerlige Ramappa <umesh.nerlige.ramappa at intel.com>
>---
> tests/intel/xe_oa.c | 698 ++++++++++++++++++++++++++++++++++++++++++++
> 1 file changed, 698 insertions(+)
>
>diff --git a/tests/intel/xe_oa.c b/tests/intel/xe_oa.c
>index d7f00e78b9..c4a45568aa 100644
>--- a/tests/intel/xe_oa.c
>+++ b/tests/intel/xe_oa.c
>@@ -597,6 +597,18 @@ read_report_reason(const uint32_t *report)
> return "unknown";
> }
>
>+static uint32_t
>+cs_timestamp_frequency(int fd)
>+{
>+ return xe_gt_list(drm_fd)->gt_list[0].reference_clock;
>+}
>+
>+static uint64_t
>+cs_timebase_scale(uint32_t u32_delta)
>+{
>+ return ((uint64_t)u32_delta * NSEC_PER_SEC) / cs_timestamp_frequency(drm_fd);
>+}
>+
> static uint64_t
> oa_timestamp(const uint32_t *report, enum intel_xe_oa_format_name format)
> {
>@@ -663,6 +675,15 @@ oa_report_get_ctx_id(uint32_t *report)
> return report[2];
> }
>
>+static int
>+oar_unit_default_format(void)
>+{
>+ if (IS_DG2(devid) || IS_METEORLAKE(devid))
>+ return XE_OAR_FORMAT_A32u40_A4u32_B8_C8;
>+
>+ return default_test_set->perf_oa_format;
>+}
>+
> static void *buf_map(int fd, struct intel_buf *buf, bool write)
> {
> void *p;
>@@ -701,6 +722,21 @@ scratch_buf_init(struct buf_ops *bops,
> scratch_buf_memset(buf, width, height, color);
> }
>
>+static void
>+emit_report_perf_count(struct intel_bb *ibb,
>+ struct intel_buf *dst,
>+ int dst_offset,
>+ uint32_t report_id)
>+{
>+ intel_bb_add_intel_buf(ibb, dst, true);
>+
>+ intel_bb_out(ibb, OA_MI_REPORT_PERF_COUNT);
>+ intel_bb_emit_reloc(ibb, dst->handle,
>+ I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
>+ dst_offset, dst->addr.offset);
>+ intel_bb_out(ibb, report_id);
>+}
>+
> static bool
> oa_report_is_periodic(uint32_t oa_exponent, const uint32_t *report)
> {
>@@ -741,6 +777,31 @@ get_40bit_a_delta(uint64_t value0, uint64_t value1)
> return value1 - value0;
> }
>
>+static void
>+accumulate_uint32(size_t offset,
>+ uint32_t *report0,
>+ uint32_t *report1,
>+ uint64_t *delta)
>+{
>+ uint32_t value0 = *(uint32_t *)(((uint8_t *)report0) + offset);
>+ uint32_t value1 = *(uint32_t *)(((uint8_t *)report1) + offset);
>+
>+ *delta += (uint32_t)(value1 - value0);
>+}
>+
>+static void
>+accumulate_uint40(int a_index,
>+ uint32_t *report0,
>+ uint32_t *report1,
>+ enum intel_xe_oa_format_name format,
>+ uint64_t *delta)
>+{
>+ uint64_t value0 = read_40bit_a_counter(report0, format, a_index),
>+ value1 = read_40bit_a_counter(report1, format, a_index);
>+
>+ *delta += get_40bit_a_delta(value0, value1);
>+}
>+
> static void
> accumulate_uint64(int a_index,
> const uint32_t *report0,
>@@ -754,6 +815,78 @@ accumulate_uint64(int a_index,
> *delta += (value1 - value0);
> }
>
>+static void
>+accumulate_reports(struct accumulator *accumulator,
>+ uint32_t *start,
>+ uint32_t *end)
>+{
>+ struct oa_format format = get_oa_format(accumulator->format);
>+ uint64_t *deltas = accumulator->deltas;
>+ int idx = 0;
>+
>+ /* timestamp */
>+ deltas[idx] += oa_timestamp_delta(end, start, accumulator->format);
>+ idx++;
>+
>+ /* clock cycles */
>+ deltas[idx] += oa_tick_delta(end, start, accumulator->format);
>+ idx++;
>+
>+ for (int i = 0; i < format.n_a40; i++) {
>+ accumulate_uint40(i, start, end, accumulator->format,
>+ deltas + idx++);
>+ }
>+
>+ for (int i = 0; i < format.n_a64; i++) {
>+ accumulate_uint64(i, start, end, accumulator->format,
>+ deltas + idx++);
>+ }
>+
>+ for (int i = 0; i < format.n_a; i++) {
>+ accumulate_uint32(format.a_off + 4 * i,
>+ start, end, deltas + idx++);
>+ }
>+
>+ for (int i = 0; i < format.n_b; i++) {
>+ accumulate_uint32(format.b_off + 4 * i,
>+ start, end, deltas + idx++);
>+ }
>+
>+ for (int i = 0; i < format.n_c; i++) {
>+ accumulate_uint32(format.c_off + 4 * i,
>+ start, end, deltas + idx++);
>+ }
>+}
>+
>+static void
>+accumulator_print(struct accumulator *accumulator, const char *title)
>+{
>+ struct oa_format format = get_oa_format(accumulator->format);
>+ uint64_t *deltas = accumulator->deltas;
>+ int idx = 0;
>+
>+ igt_debug("%s:\n", title);
>+ igt_debug("\ttime delta = %"PRIu64"\n", deltas[idx++]);
>+ igt_debug("\tclock cycle delta = %"PRIu64"\n", deltas[idx++]);
>+
>+ for (int i = 0; i < format.n_a40; i++)
>+ igt_debug("\tA%u = %"PRIu64"\n", i, deltas[idx++]);
>+
>+ for (int i = 0; i < format.n_a64; i++)
>+ igt_debug("\tA64_%u = %"PRIu64"\n", i, deltas[idx++]);
>+
>+ for (int i = 0; i < format.n_a; i++) {
>+ int a_id = format.first_a + i;
>+ igt_debug("\tA%u = %"PRIu64"\n", a_id, deltas[idx++]);
>+ }
>+
>+ for (int i = 0; i < format.n_a; i++)
>+ igt_debug("\tB%u = %"PRIu64"\n", i, deltas[idx++]);
>+
>+ for (int i = 0; i < format.n_c; i++)
>+ igt_debug("\tC%u = %"PRIu64"\n", i, deltas[idx++]);
>+}
>+
> /* The TestOa metric set is designed so */
> static void
> sanity_check_reports(const uint32_t *oa_report0, const uint32_t *oa_report1,
>@@ -2165,6 +2298,92 @@ static void test_polling_small_buf(void)
> 0.20 * n_expect_read_bytes);
> }
>
>+static int
>+num_valid_reports_captured(struct intel_xe_oa_open_prop *param,
>+ int64_t *duration_ns, int fmt)
>+{
>+ uint8_t buf[1024 * 1024];
>+ int64_t start, end;
>+ int num_reports = 0;
>+ size_t format_size = get_oa_format(fmt).size;
>+
>+ igt_debug("Expected duration = %"PRId64"\n", *duration_ns);
>+
>+ stream_fd = __perf_open(drm_fd, param, true);
>+
>+ start = get_time();
>+ do_ioctl(stream_fd, DRM_XE_PERF_IOCTL_ENABLE, 0);
>+ for (/* nop */; ((end = get_time()) - start) < *duration_ns; /* nop */) {
>+ int ret;
>+
>+ while ((ret = read(stream_fd, buf, sizeof(buf))) < 0 &&
>+ (errno == EINTR || errno == EIO))
>+ ;
>+
>+ igt_assert(ret > 0);
>+
>+ for (int offset = 0; offset < ret; offset += format_size) {
>+ uint32_t *report = (void *)(buf + offset);
>+
>+ if (report_reason(report) & OAREPORT_REASON_TIMER)
>+ num_reports++;
>+ }
>+ }
>+ __perf_close(stream_fd);
>+
>+ *duration_ns = end - start;
>+
>+ igt_debug("Actual duration = %"PRIu64"\n", *duration_ns);
>+
>+ return num_reports;
>+}
>+
>+/**
>+ * SUBTEST: oa-tlb-invalidate
>+ * Description: Open OA stream twice to verify OA TLB invalidation
>+ */
>+static void
>+test_oa_tlb_invalidate(const struct drm_xe_engine_class_instance *hwe)
>+{
>+ int oa_exponent = max_oa_exponent_for_period_lte(30000000);
>+ struct intel_xe_perf_metric_set *test_set = metric_set(hwe);
>+ uint64_t properties[] = {
>+ DRM_XE_OA_PROPERTY_OA_UNIT_ID, 0,
>+ DRM_XE_OA_PROPERTY_SAMPLE_OA, true,
>+
>+ DRM_XE_OA_PROPERTY_OA_METRIC_SET, test_set->perf_oa_metrics_set,
>+ DRM_XE_OA_PROPERTY_OA_FORMAT, __ff(test_set->perf_oa_format),
>+ DRM_XE_OA_PROPERTY_OA_PERIOD_EXPONENT, oa_exponent,
>+ DRM_XE_OA_PROPERTY_OA_DISABLED, true,
>+ DRM_XE_OA_PROPERTY_OA_ENGINE_INSTANCE, hwe->engine_instance,
>+ };
>+ struct intel_xe_oa_open_prop param = {
>+ .num_properties = ARRAY_SIZE(properties) / 2,
>+ .properties_ptr = to_user_pointer(properties),
>+ };
>+ int num_reports1, num_reports2, num_expected_reports;
>+ int64_t duration;
>+
>+ /* Capture reports for 5 seconds twice and then make sure you get around
>+ * the same number of reports. In the case of failure, the number of
>+ * reports will vary largely since the beginning of the OA buffer
>+ * will have invalid entries.
>+ */
>+ duration = 5LL * NSEC_PER_SEC;
>+ num_reports1 = num_valid_reports_captured(&param, &duration, test_set->perf_oa_format);
>+ num_expected_reports = duration / oa_exponent_to_ns(oa_exponent);
>+ igt_debug("expected num reports = %d\n", num_expected_reports);
>+ igt_debug("actual num reports = %d\n", num_reports1);
>+ igt_assert(num_reports1 > 0.95 * num_expected_reports);
>+
>+ duration = 5LL * NSEC_PER_SEC;
>+ num_reports2 = num_valid_reports_captured(&param, &duration, test_set->perf_oa_format);
>+ num_expected_reports = duration / oa_exponent_to_ns(oa_exponent);
>+ igt_debug("expected num reports = %d\n", num_expected_reports);
>+ igt_debug("actual num reports = %d\n", num_reports2);
>+ igt_assert(num_reports2 > 0.95 * num_expected_reports);
>+}
>+
> /**
> * SUBTEST: buffer-fill
> * Description: Test filling, wraparound and overflow of OA buffer
>@@ -2750,6 +2969,468 @@ test_disabled_read_error(void)
> __perf_close(stream_fd);
> }
>
>+/**
>+ * SUBTEST: mi-rpc
>+ * Description: Test OAR/OAC using MI_REPORT_PERF_COUNT
>+ */
>+static void
>+test_mi_rpc(struct drm_xe_engine_class_instance *hwe)
>+
>+{
>+ uint64_t fmt = ((IS_DG2(devid) || IS_METEORLAKE(devid)) &&
>+ hwe->engine_class == DRM_XE_ENGINE_CLASS_COMPUTE) ?
>+ XE_OAC_FORMAT_A24u64_B8_C8 : oar_unit_default_format();
>+ struct intel_xe_perf_metric_set *test_set = metric_set(hwe);
>+ uint64_t properties[] = {
>+ DRM_XE_OA_PROPERTY_OA_UNIT_ID, 0,
>+
>+ /* On Gen12, MI RPC uses OAR. OAR is configured only for the
>+ * render context that wants to measure the performance. Hence a
>+ * context must be specified in the gen12 MI RPC when compared
>+ * to previous gens.
>+ *
>+ * Have a random value here for the context id, but initialize
>+ * it once you figure out the context ID for the work to be
>+ * measured
>+ */
>+ DRM_XE_OA_PROPERTY_EXEC_QUEUE_ID, UINT64_MAX,
>+
>+ /* OA unit configuration:
>+ * DRM_XE_OA_PROPERTY_SAMPLE_OA is no longer required for Gen12
>+ * because the OAR unit increments counters only for the
>+ * relevant context. No other parameters are needed since we do
>+ * not rely on the OA buffer anymore to normalize the counter
>+ * values.
>+ */
>+ DRM_XE_OA_PROPERTY_OA_METRIC_SET, test_set->perf_oa_metrics_set,
>+ DRM_XE_OA_PROPERTY_OA_FORMAT, __ff(fmt),
>+ DRM_XE_OA_PROPERTY_OA_ENGINE_INSTANCE, hwe->engine_instance,
>+ };
>+ struct intel_xe_oa_open_prop param = {
>+ .num_properties = ARRAY_SIZE(properties) / 2,
>+ .properties_ptr = to_user_pointer(properties),
>+ };
>+ struct buf_ops *bops;
>+ struct intel_bb *ibb;
>+ struct intel_buf *buf;
>+#define INVALID_CTX_ID 0xffffffff
>+ uint32_t ctx_id = INVALID_CTX_ID;
>+ uint32_t vm = 0;
>+ uint32_t *report32;
>+ size_t format_size_32;
>+ struct oa_format format = get_oa_format(fmt);
>+
>+ /* Ensure perf_stream_paranoid is set to 1 by default */
>+ write_u64_file("/proc/sys/dev/xe/perf_stream_paranoid", 1);
>+
>+ bops = buf_ops_create(drm_fd);
>+ vm = xe_vm_create(drm_fd, 0, 0);
>+ ctx_id = xe_exec_queue_create(drm_fd, vm, hwe, 0);
>+ igt_assert_neq(ctx_id, INVALID_CTX_ID);
>+ properties[3] = ctx_id;
>+
>+ ibb = intel_bb_create_with_context(drm_fd, ctx_id, vm, NULL, BATCH_SZ);
>+ buf = intel_buf_create(bops, 4096, 1, 8, 64,
>+ I915_TILING_NONE, I915_COMPRESSION_NONE);
>+
>+ buf_map(drm_fd, buf, true);
>+ memset(buf->ptr, 0x80, 4096);
>+ intel_buf_unmap(buf);
>+
>+ stream_fd = __perf_open(drm_fd, &param, false);
>+ set_fd_flags(stream_fd, O_CLOEXEC);
>+
>+#define REPORT_ID 0xdeadbeef
>+#define REPORT_OFFSET 0
>+ emit_report_perf_count(ibb,
>+ buf,
>+ REPORT_OFFSET,
>+ REPORT_ID);
>+ intel_bb_flush_render(ibb);
>+ intel_bb_sync(ibb);
>+
>+ buf_map(drm_fd, buf, false);
>+ report32 = buf->ptr;
>+ format_size_32 = format.size >> 2;
>+ dump_report(report32, format_size_32, "mi-rpc");
>+
>+ /* Sanity check reports
>+ * reportX_32[0]: report id passed with mi-rpc
>+ * reportX_32[1]: timestamp. NOTE: wraps around in ~6 minutes.
>+ *
>+ * reportX_32[format.b_off]: check if the entire report was filled.
>+ * B0 counter falls in the last 64 bytes of this report format.
>+ * Since reports are filled in 64 byte blocks, we should be able to
>+ * assure that the report was filled by checking the B0 counter. B0
>+ * counter is defined to be zero, so we can easily validate it.
>+ *
>+ * reportX_32[format_size_32]: outside report, make sure only the report
>+ * size amount of data was written.
>+ */
>+ igt_assert_eq(report32[0], REPORT_ID);
>+ igt_assert(oa_timestamp(report32, test_set->perf_oa_format));
>+ igt_assert_neq(report32[format.b_off >> 2], 0x80808080);
>+ igt_assert_eq(report32[format_size_32], 0x80808080);
>+
>+ intel_buf_unmap(buf);
>+ intel_buf_destroy(buf);
>+ intel_bb_destroy(ibb);
>+ xe_exec_queue_destroy(drm_fd, ctx_id);
>+ xe_vm_destroy(drm_fd, vm);
>+ buf_ops_destroy(bops);
>+ __perf_close(stream_fd);
>+}
>+
>+static void
>+emit_stall_timestamp_and_rpc(struct intel_bb *ibb,
>+ struct intel_buf *dst,
>+ int timestamp_offset,
>+ int report_dst_offset,
>+ uint32_t report_id)
>+{
>+ uint32_t pipe_ctl_flags = (PIPE_CONTROL_CS_STALL |
>+ PIPE_CONTROL_RENDER_TARGET_FLUSH |
>+ PIPE_CONTROL_WRITE_TIMESTAMP);
>+
>+ intel_bb_add_intel_buf(ibb, dst, true);
>+ intel_bb_out(ibb, GFX_OP_PIPE_CONTROL(6));
>+ intel_bb_out(ibb, pipe_ctl_flags);
>+ intel_bb_emit_reloc(ibb, dst->handle,
>+ I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
>+ timestamp_offset, dst->addr.offset);
>+ intel_bb_out(ibb, 0); /* imm lower */
>+ intel_bb_out(ibb, 0); /* imm upper */
>+
>+ emit_report_perf_count(ibb, dst, report_dst_offset, report_id);
>+}
>+
>+static void single_ctx_helper(struct drm_xe_engine_class_instance *hwe)
>+{
>+ struct intel_xe_perf_metric_set *test_set = metric_set(hwe);
>+ uint64_t fmt = oar_unit_default_format();
>+ uint64_t properties[] = {
>+ DRM_XE_OA_PROPERTY_OA_UNIT_ID, 0,
>+
>+ /* Have a random value here for the context id, but initialize
>+ * it once you figure out the context ID for the work to be
>+ * measured
>+ */
>+ DRM_XE_OA_PROPERTY_EXEC_QUEUE_ID, UINT64_MAX,
>+
>+ /* OA unit configuration:
>+ * DRM_XE_OA_PROPERTY_SAMPLE_OA is no longer required for Gen12
>+ * because the OAR unit increments counters only for the
>+ * relevant context. No other parameters are needed since we do
>+ * not rely on the OA buffer anymore to normalize the counter
>+ * values.
>+ */
>+ DRM_XE_OA_PROPERTY_OA_METRIC_SET, test_set->perf_oa_metrics_set,
>+ DRM_XE_OA_PROPERTY_OA_FORMAT, __ff(fmt),
>+ DRM_XE_OA_PROPERTY_OA_ENGINE_INSTANCE, hwe->engine_instance,
>+ };
>+ struct intel_xe_oa_open_prop param = {
>+ .num_properties = ARRAY_SIZE(properties) / 2,
>+ .properties_ptr = to_user_pointer(properties),
>+ };
>+ struct buf_ops *bops;
>+ struct intel_bb *ibb0, *ibb1;
>+ struct intel_buf src[3], dst[3], *dst_buf;
>+ uint32_t context0_id, context1_id, vm = 0;
>+ uint32_t *report0_32, *report1_32, *report2_32, *report3_32;
>+ uint64_t timestamp0_64, timestamp1_64;
>+ uint64_t delta_ts64, delta_oa32;
>+ uint64_t delta_ts64_ns, delta_oa32_ns;
>+ uint64_t delta_delta;
>+ int width = 800;
>+ int height = 600;
>+#define INVALID_CTX_ID 0xffffffff
>+ uint32_t ctx0_id = INVALID_CTX_ID;
>+ uint32_t ctx1_id = INVALID_CTX_ID;
>+ int ret;
>+ struct accumulator accumulator = {
>+ .format = fmt
>+ };
>+
>+ bops = buf_ops_create(drm_fd);
>+
>+ for (int i = 0; i < ARRAY_SIZE(src); i++) {
>+ scratch_buf_init(bops, &src[i], width, height, 0xff0000ff);
>+ scratch_buf_init(bops, &dst[i], width, height, 0x00ff00ff);
>+ }
>+
>+ vm = xe_vm_create(drm_fd, 0, 0);
>+ context0_id = xe_exec_queue_create(drm_fd, vm, hwe, 0);
>+ context1_id = xe_exec_queue_create(drm_fd, vm, hwe, 0);
>+ ibb0 = intel_bb_create_with_context(drm_fd, context0_id, vm, NULL, BATCH_SZ);
>+ ibb1 = intel_bb_create_with_context(drm_fd, context1_id, vm, NULL, BATCH_SZ);
>+
>+ igt_debug("submitting warm up render_copy\n");
>+
>+ /* Submit some early, unmeasured, work to the context we want */
>+ render_copy(ibb0,
>+ &src[0], 0, 0, width, height,
>+ &dst[0], 0, 0);
>+
>+ /* Initialize the context parameter to the perf open ioctl here */
>+ properties[3] = context0_id;
>+
>+ igt_debug("opening xe oa stream\n");
>+ stream_fd = __perf_open(drm_fd, &param, false);
>+ set_fd_flags(stream_fd, O_CLOEXEC);
>+
>+ dst_buf = intel_buf_create(bops, 4096, 1, 8, 64,
>+ I915_TILING_NONE,
>+ I915_COMPRESSION_NONE);
>+
>+ /* Set write domain to cpu briefly to fill the buffer with 80s */
>+ buf_map(drm_fd, dst_buf, true /* write enable */);
>+ memset(dst_buf->ptr, 0x80, 2048);
>+ memset((uint8_t *) dst_buf->ptr + 2048, 0, 2048);
>+ intel_buf_unmap(dst_buf);
>+
>+ /* Submit an mi-rpc to context0 before measurable work */
>+#define BO_TIMESTAMP_OFFSET0 1024
>+#define BO_REPORT_OFFSET0 0
>+#define BO_REPORT_ID0 0xdeadbeef
>+ emit_stall_timestamp_and_rpc(ibb0,
>+ dst_buf,
>+ BO_TIMESTAMP_OFFSET0,
>+ BO_REPORT_OFFSET0,
>+ BO_REPORT_ID0);
>+ intel_bb_flush_render(ibb0);
>+
>+ /* Remove intel_buf from ibb0 added implicitly in rendercopy */
>+ intel_bb_remove_intel_buf(ibb0, dst_buf);
>+
>+ /* This is the work/context that is measured for counter increments */
>+ render_copy(ibb0,
>+ &src[0], 0, 0, width, height,
>+ &dst[0], 0, 0);
>+ intel_bb_flush_render(ibb0);
>+
>+ /* Submit an mi-rpc to context1 before work
>+ *
>+ * On gen12, this measurement should just yield counters that are
>+ * all zeroes, since the counters will only increment for the
>+ * context passed to perf open ioctl
>+ */
>+#define BO_TIMESTAMP_OFFSET2 1040
>+#define BO_REPORT_OFFSET2 512
>+#define BO_REPORT_ID2 0x00c0ffee
>+ emit_stall_timestamp_and_rpc(ibb1,
>+ dst_buf,
>+ BO_TIMESTAMP_OFFSET2,
>+ BO_REPORT_OFFSET2,
>+ BO_REPORT_ID2);
>+ intel_bb_flush_render(ibb1);
>+
>+ /* Submit two copies on the other context to avoid a false
>+ * positive in case the driver somehow ended up filtering for
>+ * context1
>+ */
>+ render_copy(ibb1,
>+ &src[1], 0, 0, width, height,
>+ &dst[1], 0, 0);
>+
>+ render_copy(ibb1,
>+ &src[2], 0, 0, width, height,
>+ &dst[2], 0, 0);
>+ intel_bb_flush_render(ibb1);
>+
>+ /* Submit an mi-rpc to context1 after all work */
>+#define BO_TIMESTAMP_OFFSET3 1048
>+#define BO_REPORT_OFFSET3 768
>+#define BO_REPORT_ID3 0x01c0ffee
>+ emit_stall_timestamp_and_rpc(ibb1,
>+ dst_buf,
>+ BO_TIMESTAMP_OFFSET3,
>+ BO_REPORT_OFFSET3,
>+ BO_REPORT_ID3);
>+ intel_bb_flush_render(ibb1);
>+
>+ /* Remove intel_buf from ibb1 added implicitly in rendercopy */
>+ intel_bb_remove_intel_buf(ibb1, dst_buf);
>+
>+ /* Submit an mi-rpc to context0 after all measurable work */
>+#define BO_TIMESTAMP_OFFSET1 1032
>+#define BO_REPORT_OFFSET1 256
>+#define BO_REPORT_ID1 0xbeefbeef
>+ emit_stall_timestamp_and_rpc(ibb0,
>+ dst_buf,
>+ BO_TIMESTAMP_OFFSET1,
>+ BO_REPORT_OFFSET1,
>+ BO_REPORT_ID1);
>+ intel_bb_flush_render(ibb0);
>+ intel_bb_sync(ibb0);
>+ intel_bb_sync(ibb1);
>+
>+ buf_map(drm_fd, dst_buf, false);
>+
>+ /* Sanity check reports
>+ * reportX_32[0]: report id passed with mi-rpc
>+ * reportX_32[1]: timestamp
>+ * reportX_32[2]: context id
>+ *
>+ * report0_32: start of measurable work
>+ * report1_32: end of measurable work
>+ * report2_32: start of other work
>+ * report3_32: end of other work
>+ */
>+ report0_32 = dst_buf->ptr;
>+ igt_assert_eq(report0_32[0], 0xdeadbeef);
>+ igt_assert(oa_timestamp(report0_32, fmt));
>+ ctx0_id = report0_32[2];
>+ igt_debug("MI_RPC(start) CTX ID: %u\n", ctx0_id);
>+ dump_report(report0_32, 64, "report0_32");
>+
>+ report1_32 = report0_32 + 64;
>+ igt_assert_eq(report1_32[0], 0xbeefbeef);
>+ igt_assert(oa_timestamp(report1_32, fmt));
>+ ctx1_id = report1_32[2];
>+ igt_debug("CTX ID1: %u\n", ctx1_id);
>+ dump_report(report1_32, 64, "report1_32");
>+
>+ /* Verify that counters in context1 are all zeroes */
>+ report2_32 = report0_32 + 128;
>+ igt_assert_eq(report2_32[0], 0x00c0ffee);
>+ igt_assert(oa_timestamp(report2_32, fmt));
>+ dump_report(report2_32, 64, "report2_32");
>+ igt_assert_eq(0, memcmp(&report2_32[4],
>+ (uint8_t *) dst_buf->ptr + 2048,
>+ 240));
>+
>+ report3_32 = report0_32 + 192;
>+ igt_assert_eq(report3_32[0], 0x01c0ffee);
>+ igt_assert(oa_timestamp(report3_32, fmt));
>+ dump_report(report3_32, 64, "report3_32");
>+ igt_assert_eq(0, memcmp(&report3_32[4],
>+ (uint8_t *) dst_buf->ptr + 2048,
>+ 240));
>+
>+ /* Accumulate deltas for counters - A0, A21 and A26 */
>+ memset(accumulator.deltas, 0, sizeof(accumulator.deltas));
>+ accumulate_reports(&accumulator, report0_32, report1_32);
>+ igt_debug("total: A0 = %"PRIu64", A21 = %"PRIu64", A26 = %"PRIu64"\n",
>+ accumulator.deltas[2 + 0],
>+ accumulator.deltas[2 + 21],
>+ accumulator.deltas[2 + 26]);
>+
>+ igt_debug("oa_timestamp32 0 = %"PRIu64"\n", oa_timestamp(report0_32, fmt));
>+ igt_debug("oa_timestamp32 1 = %"PRIu64"\n", oa_timestamp(report1_32, fmt));
>+ igt_debug("ctx_id 0 = %u\n", report0_32[2]);
>+ igt_debug("ctx_id 1 = %u\n", report1_32[2]);
>+
>+ /* The delta as calculated via the PIPE_CONTROL timestamp or
>+ * the OA report timestamps should be almost identical but
>+ * allow a 500 nanoseconds margin.
>+ */
>+ timestamp0_64 = *(uint64_t *)(((uint8_t *)dst_buf->ptr) + BO_TIMESTAMP_OFFSET0);
>+ timestamp1_64 = *(uint64_t *)(((uint8_t *)dst_buf->ptr) + BO_TIMESTAMP_OFFSET1);
>+
>+ igt_debug("ts_timestamp64 0 = %"PRIu64"\n", timestamp0_64);
>+ igt_debug("ts_timestamp64 1 = %"PRIu64"\n", timestamp1_64);
>+
>+ delta_ts64 = timestamp1_64 - timestamp0_64;
>+ delta_oa32 = oa_timestamp_delta(report1_32, report0_32, fmt);
>+
>+ /* Sanity check that we can pass the delta to timebase_scale */
>+ delta_oa32_ns = timebase_scale(delta_oa32);
>+ delta_ts64_ns = cs_timebase_scale(delta_ts64);
>+
>+ igt_debug("oa32 delta = %"PRIu64", = %"PRIu64"ns\n",
>+ delta_oa32, delta_oa32_ns);
>+ igt_debug("ts64 delta = %"PRIu64", = %"PRIu64"ns\n",
>+ delta_ts64, delta_ts64_ns);
>+
>+ delta_delta = delta_ts64_ns > delta_oa32_ns ?
>+ (delta_ts64_ns - delta_oa32_ns) :
>+ (delta_oa32_ns - delta_ts64_ns);
>+ if (delta_delta > 500) {
>+ igt_debug("delta_delta = %"PRIu64". exceeds margin, skipping..\n",
>+ delta_delta);
>+ exit(EAGAIN);
>+ }
>+
>+ igt_debug("n samples written = %"PRIu64"/%"PRIu64" (%ix%i)\n",
>+ accumulator.deltas[2 + 21],
>+ accumulator.deltas[2 + 26],
>+ width, height);
>+ accumulator_print(&accumulator, "filtered");
>+
>+ /* Verify that the work actually happened by comparing the src
>+ * and dst buffers
>+ */
>+ buf_map(drm_fd, &src[0], false);
>+ buf_map(drm_fd, &dst[0], false);
>+
>+ ret = memcmp(src[0].ptr, dst[0].ptr, 4 * width * height);
>+ intel_buf_unmap(&src[0]);
>+ intel_buf_unmap(&dst[0]);
>+
>+ if (ret != 0) {
>+ accumulator_print(&accumulator, "total");
>+ exit(EAGAIN);
>+ }
>+
>+ /* FIXME: can we deduce the presence of A26 from get_oa_format(fmt)? */
>+ if (intel_graphics_ver(devid) >= IP_VER(20, 0))
>+ goto skip_check;
>+
>+ /* Check that this test passed. The test measures the number of 2x2
>+ * samples written to the render target using the counter A26. For
>+ * OAR, this counter will only have increments relevant to this specific
>+ * context. The value equals the width * height of the rendered work.
>+ */
>+ igt_assert_eq(accumulator.deltas[2 + 26], width * height);
>+
>+ skip_check:
>+ /* Clean up */
>+ for (int i = 0; i < ARRAY_SIZE(src); i++) {
>+ intel_buf_close(bops, &src[i]);
>+ intel_buf_close(bops, &dst[i]);
>+ }
>+
>+ intel_buf_unmap(dst_buf);
>+ intel_buf_destroy(dst_buf);
>+ intel_bb_destroy(ibb0);
>+ intel_bb_destroy(ibb1);
>+ xe_exec_queue_destroy(drm_fd, context0_id);
>+ xe_exec_queue_destroy(drm_fd, context1_id);
>+ xe_vm_destroy(drm_fd, vm);
>+ buf_ops_destroy(bops);
>+ __perf_close(stream_fd);
>+}
>+
>+/**
>+ * SUBTEST: unprivileged-single-ctx-counters
>+ * Description: A harder test for OAR/OAC using MI_REPORT_PERF_COUNT
>+ */
>+static void
>+test_single_ctx_render_target_writes_a_counter(struct drm_xe_engine_class_instance *hwe)
>+{
>+ int child_ret;
>+ struct igt_helper_process child = {};
>+
>+ /* Ensure perf_stream_paranoid is set to 1 by default */
>+ write_u64_file("/proc/sys/dev/xe/perf_stream_paranoid", 1);
>+
>+ do {
>+ igt_fork_helper(&child) {
>+ /* A local device for local resources. */
>+ drm_fd = drm_reopen_driver(drm_fd);
>+
>+ igt_drop_root();
>+
>+ single_ctx_helper(hwe);
>+
>+ drm_close_driver(drm_fd);
>+ }
>+ child_ret = igt_wait_helper(&child);
>+ igt_assert(WEXITSTATUS(child_ret) == EAGAIN ||
>+ WEXITSTATUS(child_ret) == 0);
>+ } while (WEXITSTATUS(child_ret) == EAGAIN);
>+}
>+
> static unsigned read_xe_module_ref(void)
> {
> FILE *fp = fopen("/proc/modules", "r");
>@@ -3017,6 +3698,23 @@ igt_main
> igt_subtest("short-reads")
> test_short_reads();
>
>+ igt_subtest_group {
>+ igt_subtest_with_dynamic("mi-rpc")
>+ __for_one_hwe_in_oag(hwe)
>+ test_mi_rpc(hwe);
>+
>+ igt_subtest_with_dynamic("oa-tlb-invalidate")
>+ __for_one_hwe_in_oag(hwe)
>+ test_oa_tlb_invalidate(hwe);
>+
>+ igt_subtest_with_dynamic("unprivileged-single-ctx-counters") {
>+ igt_require_f(render_copy, "no render-copy function\n");
>+ igt_require(intel_graphics_ver(devid) < IP_VER(20, 0));
>+ __for_one_render_engine(hwe)
>+ test_single_ctx_render_target_writes_a_counter(hwe);
>+ }
>+ }
>+
> igt_fixture {
> /* leave sysctl options in their default state... */
> write_u64_file("/proc/sys/dev/xe/perf_stream_paranoid", 1);
>--
>2.41.0
>
More information about the igt-dev
mailing list