[Intel-gfx] [PATCH igt] igt/perf: Busywait for MI_REPORT_PERF_COUNT results
Lionel Landwerlin
lionel.g.landwerlin at intel.com
Fri Dec 8 15:08:49 UTC 2017
Hmm, that sucks...
I'll bring this up with the hardware people.
Maybe replacing this with an MI_STORE_REGISTER_MEM of the RCS timestamp
register (least significant 32 bits) would be a better approach.
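Something along these lines, roughly (just a sketch from memory; the
0x2358 offset for the RCS timestamp register and the MI_STORE_REGISTER_MEM
dword counts want double checking against the docs before relying on it):

#define RCS_TIMESTAMP (0x2000 + 0x358) /* low 32 bits of the RCS timestamp */
#define MI_STORE_REG_MEM (0x24 << 23)  /* MI_STORE_REGISTER_MEM header */

/* Emit an MI_STORE_REGISTER_MEM of RCS_TIMESTAMP into the batch at cs[i],
 * leaving the destination address zeroed for a relocation entry aimed at
 * dword i+2, and return the index of the next free dword. */
static int emit_srm_timestamp(uint32_t *cs, int i, uint32_t devid)
{
	if (intel_gen(devid) >= 8) {
		cs[i++] = MI_STORE_REG_MEM | (4 - 2); /* 64b address */
		cs[i++] = RCS_TIMESTAMP;
		cs[i++] = 0; /* address low dword, filled in by the reloc */
		cs[i++] = 0; /* address high dword */
	} else {
		cs[i++] = MI_STORE_REG_MEM | (3 - 2); /* 32b address */
		cs[i++] = RCS_TIMESTAMP;
		cs[i++] = 0; /* address, filled in by the reloc */
	}
	return i;
}

That should be serialised with the batch, unlike MI_RPC.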
On 08/12/17 14:31, Chris Wilson wrote:
> On Haswell, at least, MI_REPORT_PERF_COUNT is not flushed by the
> PIPECONTROL surrounding the batch. (In theory, before the breadcrumb is
> updated the CPU's view of memory is coherent with the GPU, i.e. all
> writes have landed and are visible to userspace. This does not appear to
> be the case for MI_REPORT_PERF_COUNT.)
>
> As MI_RPC does not appear to be synchronized with the batch, busyspin for
> its completion.
>
> (This has far deeper implications, since it means the GPU can still be
> writing to memory after release.)
>
> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
> Cc: Lionel Landwerlin <lionel.g.landwerlin at intel.com>
> Cc: Matthew Auld <matthew.auld at intel.com>
> ---
> tests/perf.c | 93 +++++++++++++++++++++++++++++++++---------------------------
> 1 file changed, 52 insertions(+), 41 deletions(-)
>
> diff --git a/tests/perf.c b/tests/perf.c
> index a161c45d7..8c20fbe09 100644
> --- a/tests/perf.c
> +++ b/tests/perf.c
> @@ -706,47 +706,59 @@ emit_report_perf_count(struct intel_batchbuffer *batch,
> }
>
> static uint32_t
> -i915_get_one_gpu_timestamp(uint32_t *context_id)
> +i915_get_one_gpu_timestamp(void)
> {
> - drm_intel_bufmgr *bufmgr = drm_intel_bufmgr_gem_init(drm_fd, 4096);
> - drm_intel_context *mi_rpc_ctx = drm_intel_gem_context_create(bufmgr);
> - drm_intel_bo *mi_rpc_bo = drm_intel_bo_alloc(bufmgr, "mi_rpc dest bo", 4096, 64);
> - struct intel_batchbuffer *mi_rpc_batch = intel_batchbuffer_alloc(bufmgr, devid);
> - int ret;
> - uint32_t timestamp;
> -
> - drm_intel_bufmgr_gem_enable_reuse(bufmgr);
> -
> - if (context_id) {
> - ret = drm_intel_gem_context_get_id(mi_rpc_ctx, context_id);
> - igt_assert_eq(ret, 0);
> - }
> -
> - igt_assert(mi_rpc_ctx);
> - igt_assert(mi_rpc_bo);
> - igt_assert(mi_rpc_batch);
> -
> - ret = drm_intel_bo_map(mi_rpc_bo, true);
> - igt_assert_eq(ret, 0);
> - memset(mi_rpc_bo->virtual, 0x80, 4096);
> - drm_intel_bo_unmap(mi_rpc_bo);
> -
> - emit_report_perf_count(mi_rpc_batch,
> - mi_rpc_bo, /* dst */
> - 0, /* dst offset in bytes */
> - 0xdeadbeef); /* report ID */
> + struct drm_i915_gem_execbuffer2 execbuf;
> + struct drm_i915_gem_exec_object2 obj[2];
> + struct drm_i915_gem_relocation_entry reloc;
> + uint32_t *ptr, timestamp;
> + struct timespec tv = {};
> + int i;
>
> - intel_batchbuffer_flush_with_context(mi_rpc_batch, mi_rpc_ctx);
> + memset(obj, 0, sizeof(obj));
> + obj[0].handle = gem_create(drm_fd, 4096);
> + ptr = gem_mmap__cpu(drm_fd, obj[0].handle, 0, 4096, PROT_WRITE);
> + memset(ptr, 0x80, 4096);
> + munmap(ptr, 4096);
> +
> + obj[1].handle = gem_create(drm_fd, 4096);
> + obj[1].relocs_ptr = to_user_pointer(&reloc);
> + obj[1].relocation_count = 1;
> + ptr = gem_mmap__cpu(drm_fd, obj[1].handle, 0, 4096, PROT_WRITE);
> +
> + memset(&reloc, 0, sizeof(reloc));
> + reloc.target_handle = obj[0].handle;
> + reloc.offset = sizeof(uint32_t);
> + reloc.read_domains = I915_GEM_DOMAIN_RENDER;
> + reloc.write_domain = I915_GEM_DOMAIN_RENDER;
> +
> + i = 2;
> + ptr[0] = GEN6_MI_REPORT_PERF_COUNT;
> + if (intel_gen(devid) >= 8)
> + ptr[0]++, i++; /* 64b reloc */
> + ptr[i++] = 0xdeadbeef;
> + ptr[i] = MI_BATCH_BUFFER_END;
> + munmap(ptr, 4096);
> +
> + memset(&execbuf, 0, sizeof(execbuf));
> + execbuf.buffers_ptr = to_user_pointer(obj);
> + execbuf.buffer_count = 2;
> + execbuf.batch_len = 4096;
> + gem_execbuf(drm_fd, &execbuf);
> + gem_close(drm_fd, obj[1].handle);
>
> - ret = drm_intel_bo_map(mi_rpc_bo, false /* write enable */);
> - igt_assert_eq(ret, 0);
> - timestamp = ((uint32_t *)mi_rpc_bo->virtual)[1];
> - drm_intel_bo_unmap(mi_rpc_bo);
> + /*
> + * MI_REPORT_PERF_COUNT is unserialised, i.e. not flushed by
> + * the PIPECONTROLs surrounding batch execution. Ergo, we must
> + * manually wait.
> + */
> + do {
> + gem_read(drm_fd, obj[0].handle, sizeof(uint32_t),
> + &timestamp, sizeof(timestamp));
> + } while (timestamp == 0x80808080 && !igt_seconds_elapsed(&tv));
> + gem_close(drm_fd, obj[0].handle);
>
> - drm_intel_bo_unreference(mi_rpc_bo);
> - intel_batchbuffer_free(mi_rpc_batch);
> - drm_intel_gem_context_destroy(mi_rpc_ctx);
> - drm_intel_bufmgr_destroy(bufmgr);
> + igt_assert_neq(timestamp, 0x80808080);
>
> return timestamp;
> }
> @@ -1915,7 +1927,6 @@ test_oa_exponents(void)
> uint32_t n_reports = 0;
> uint32_t n_idle_reports = 0;
> uint32_t n_reads = 0;
> - uint32_t context_id;
> uint64_t first_timestamp = 0;
> bool check_first_timestamp = true;
> struct drm_i915_perf_record_header *header;
> @@ -1944,7 +1955,7 @@ test_oa_exponents(void)
> * first timestamp as way to filter previously
> * scheduled work that would have configured
> * the OA unit at a different period. */
> - first_timestamp = i915_get_one_gpu_timestamp(&context_id);
> + first_timestamp = i915_get_one_gpu_timestamp();
>
> while (n_reads < ARRAY_SIZE(reads) &&
> n_reports < ARRAY_SIZE(reports)) {
> @@ -2070,8 +2081,8 @@ test_oa_exponents(void)
> uint32_t *rpt = NULL, *last = NULL, *last_periodic = NULL;
>
> igt_debug(" > More than 5%% error: avg_ts_delta = %"PRIu64", delta_delta = %"PRIu64", "
> - "expected_delta = %"PRIu64", first_timestamp = %"PRIu64" ctx_id=%"PRIu32"\n",
> - average_timestamp_delta, delta_delta, expected_timestamp_delta, first_timestamp, context_id);
> + "expected_delta = %"PRIu64", first_timestamp = %"PRIu64"\n",
> + average_timestamp_delta, delta_delta, expected_timestamp_delta, first_timestamp);
> for (int i = 0; i < (n_reports - 1); i++) {
> /* XXX: calculating with u32 arithmetic to account for overflow */
> uint32_t u32_delta =