[Intel-gfx] [PATCH igt] igt/perf: Busywait for MI_REPORT_PERF_COUNT results

Chris Wilson chris at chris-wilson.co.uk
Fri Dec 8 14:31:51 UTC 2017


On Haswell, at least, MI_REPORT_PERF_COUNT is not flushed by the
PIPECONTROL surrounding the batch. (In theory, before the breadcrumb is
updated, the CPU's view of memory is coherent with the GPU's, i.e. all
writes have landed and are visible to userspace. This does not appear to
be the case for MI_REPORT_PERF_COUNT.)

As MI_RPC does not appear to be synchronized with the batch, busy-spin for
its completion.

(This has far deeper implications, since it means the GPU can still be
writing to memory after the request has been released.)
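
For reference, the core of the workaround in the diff below is just a
bounded poll on the report buffer, along the lines of this sketch (it uses
the same IGT helpers, gem_read() and igt_seconds_elapsed(), that appear in
the patch; the dst_handle name here is illustrative only):

	/* Poll the timestamp dword of the MI_RPC report until the write
	 * lands, or give up after roughly a second. The destination buffer
	 * was pre-filled with 0x80, so an untouched report still reads back
	 * as 0x80808080. */
	struct timespec tv = {};
	uint32_t timestamp;

	do {
		gem_read(drm_fd, dst_handle, sizeof(uint32_t),
			 &timestamp, sizeof(timestamp));
	} while (timestamp == 0x80808080 && !igt_seconds_elapsed(&tv));

	igt_assert_neq(timestamp, 0x80808080);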

Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
Cc: Lionel Landwerlin <lionel.g.landwerlin at intel.com>
Cc: Matthew Auld <matthew.auld at intel.com>
---
 tests/perf.c | 93 +++++++++++++++++++++++++++++++++---------------------------
 1 file changed, 52 insertions(+), 41 deletions(-)

diff --git a/tests/perf.c b/tests/perf.c
index a161c45d7..8c20fbe09 100644
--- a/tests/perf.c
+++ b/tests/perf.c
@@ -706,47 +706,59 @@ emit_report_perf_count(struct intel_batchbuffer *batch,
 }
 
 static uint32_t
-i915_get_one_gpu_timestamp(uint32_t *context_id)
+i915_get_one_gpu_timestamp(void)
 {
-	drm_intel_bufmgr *bufmgr = drm_intel_bufmgr_gem_init(drm_fd, 4096);
-	drm_intel_context *mi_rpc_ctx = drm_intel_gem_context_create(bufmgr);
-	drm_intel_bo *mi_rpc_bo = drm_intel_bo_alloc(bufmgr, "mi_rpc dest bo", 4096, 64);
-	struct intel_batchbuffer *mi_rpc_batch = intel_batchbuffer_alloc(bufmgr, devid);
-	int ret;
-	uint32_t timestamp;
-
-	drm_intel_bufmgr_gem_enable_reuse(bufmgr);
-
-	if (context_id) {
-		ret = drm_intel_gem_context_get_id(mi_rpc_ctx, context_id);
-		igt_assert_eq(ret, 0);
-	}
-
-	igt_assert(mi_rpc_ctx);
-	igt_assert(mi_rpc_bo);
-	igt_assert(mi_rpc_batch);
-
-	ret = drm_intel_bo_map(mi_rpc_bo, true);
-	igt_assert_eq(ret, 0);
-	memset(mi_rpc_bo->virtual, 0x80, 4096);
-	drm_intel_bo_unmap(mi_rpc_bo);
-
-	emit_report_perf_count(mi_rpc_batch,
-			       mi_rpc_bo, /* dst */
-			       0, /* dst offset in bytes */
-			       0xdeadbeef); /* report ID */
+	struct drm_i915_gem_execbuffer2 execbuf;
+	struct drm_i915_gem_exec_object2 obj[2];
+	struct drm_i915_gem_relocation_entry reloc;
+	uint32_t *ptr, timestamp;
+	struct timespec tv = {};
+	int i;
 
-	intel_batchbuffer_flush_with_context(mi_rpc_batch, mi_rpc_ctx);
+	memset(obj, 0, sizeof(obj));
+	obj[0].handle = gem_create(drm_fd, 4096);
+	ptr = gem_mmap__cpu(drm_fd, obj[0].handle, 0, 4096, PROT_WRITE);
+	memset(ptr, 0x80, 4096);
+	munmap(ptr, 4096);
+
+	obj[1].handle = gem_create(drm_fd, 4096);
+	obj[1].relocs_ptr = to_user_pointer(&reloc);
+	obj[1].relocation_count = 1;
+	ptr = gem_mmap__cpu(drm_fd, obj[1].handle, 0, 4096, PROT_WRITE);
+
+	memset(&reloc, 0, sizeof(reloc));
+	reloc.target_handle = obj[0].handle;
+	reloc.offset = sizeof(uint32_t);
+	reloc.read_domains = I915_GEM_DOMAIN_RENDER;
+	reloc.write_domain = I915_GEM_DOMAIN_RENDER;
+
+	i = 2;
+	ptr[0] = GEN6_MI_REPORT_PERF_COUNT;
+	if (intel_gen(devid) >= 8)
+		ptr[0]++, i++; /* 64b reloc */
+	ptr[i++] = 0xdeadbeef;
+	ptr[i] = MI_BATCH_BUFFER_END;
+	munmap(ptr, 4096);
+
+	memset(&execbuf, 0, sizeof(execbuf));
+	execbuf.buffers_ptr = to_user_pointer(obj);
+	execbuf.buffer_count = 2;
+	execbuf.batch_len = 4096;
+	gem_execbuf(drm_fd, &execbuf);
+	gem_close(drm_fd, obj[1].handle);
 
-	ret = drm_intel_bo_map(mi_rpc_bo, false /* write enable */);
-	igt_assert_eq(ret, 0);
-	timestamp = ((uint32_t *)mi_rpc_bo->virtual)[1];
-	drm_intel_bo_unmap(mi_rpc_bo);
+	/*
+	 * MI_REPORT_PERF_COUNT is unserialised, i.e. not flushed by
+	 * the PIPECONTROLs surrounding batch execution. Ergo, we must
+	 * manually wait.
+	 */
+	do {
+		gem_read(drm_fd, obj[0].handle, sizeof(uint32_t),
+			 &timestamp, sizeof(timestamp));
+	} while (timestamp == 0x80808080 && !igt_seconds_elapsed(&tv));
+	gem_close(drm_fd, obj[0].handle);
 
-	drm_intel_bo_unreference(mi_rpc_bo);
-	intel_batchbuffer_free(mi_rpc_batch);
-	drm_intel_gem_context_destroy(mi_rpc_ctx);
-	drm_intel_bufmgr_destroy(bufmgr);
+	igt_assert_neq(timestamp, 0x80808080);
 
 	return timestamp;
 }
@@ -1915,7 +1927,6 @@ test_oa_exponents(void)
 			uint32_t n_reports = 0;
 			uint32_t n_idle_reports = 0;
 			uint32_t n_reads = 0;
-			uint32_t context_id;
 			uint64_t first_timestamp = 0;
 			bool check_first_timestamp = true;
 			struct drm_i915_perf_record_header *header;
@@ -1944,7 +1955,7 @@ test_oa_exponents(void)
 			 * first timestamp as way to filter previously
 			 * scheduled work that would have configured
 			 * the OA unit at a different period. */
-			first_timestamp = i915_get_one_gpu_timestamp(&context_id);
+			first_timestamp = i915_get_one_gpu_timestamp();
 
 			while (n_reads < ARRAY_SIZE(reads) &&
 			       n_reports < ARRAY_SIZE(reports)) {
@@ -2070,8 +2081,8 @@ test_oa_exponents(void)
 				uint32_t *rpt = NULL, *last = NULL, *last_periodic = NULL;
 
 				igt_debug(" > More than 5%% error: avg_ts_delta = %"PRIu64", delta_delta = %"PRIu64", "
-					  "expected_delta = %"PRIu64", first_timestamp = %"PRIu64" ctx_id=%"PRIu32"\n",
-					  average_timestamp_delta, delta_delta, expected_timestamp_delta, first_timestamp, context_id);
+					  "expected_delta = %"PRIu64", first_timestamp = %"PRIu64"\n",
+					  average_timestamp_delta, delta_delta, expected_timestamp_delta, first_timestamp);
 				for (int i = 0; i < (n_reports - 1); i++) {
 					/* XXX: calculating with u32 arithmetic to account for overflow */
 					uint32_t u32_delta =
-- 
2.15.1