[Intel-gfx] [PATCH igt] igt/perf: Busywait for MI_REPORT_PERF_COUNT results
Lionel Landwerlin
lionel.g.landwerlin at intel.com
Fri Dec 8 15:08:49 UTC 2017
Hmm, that sucks...
I'll bring this up with the hardware people.
Maybe replacing this with an MI_STORE_REGISTER_MEM of the RCS timestamp
register (least significant 32 bits) would be a better approach.
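Something along these lines, roughly (just a sketch from memory; the
0x2358 offset for the RCS timestamp register and the MI_STORE_REGISTER_MEM
dword counts want double checking against the docs before relying on it):

#define RCS_TIMESTAMP (0x2000 + 0x358) /* low 32 bits of the RCS timestamp */
#define MI_STORE_REG_MEM (0x24 << 23)  /* MI_STORE_REGISTER_MEM header */

/* Emit an MI_STORE_REGISTER_MEM of RCS_TIMESTAMP into the batch at cs[i],
 * leaving the destination address zeroed for a relocation entry aimed at
 * dword i+2, and return the index of the next free dword. */
static int emit_srm_timestamp(uint32_t *cs, int i, uint32_t devid)
{
	if (intel_gen(devid) >= 8) {
		cs[i++] = MI_STORE_REG_MEM | (4 - 2); /* 64b address */
		cs[i++] = RCS_TIMESTAMP;
		cs[i++] = 0; /* address low dword, filled in by the reloc */
		cs[i++] = 0; /* address high dword */
	} else {
		cs[i++] = MI_STORE_REG_MEM | (3 - 2); /* 32b address */
		cs[i++] = RCS_TIMESTAMP;
		cs[i++] = 0; /* address, filled in by the reloc */
	}
	return i;
}

That should be serialised with the batch, unlike MI_RPC.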
On 08/12/17 14:31, Chris Wilson wrote:
> On Haswell, at least, MI_REPORT_PERF_COUNT is not flushed by the
> PIPECONTROL surrounding the batch. (In theory, before the breadcrumb is
> updated the CPU's view of memory is coherent with the GPU, i.e. all
> writes have landed and are visible to userspace. This does not appear to
> be the case for MI_REPORT_PERF_COUNT.)
>
> As MI_RPC does not appear to be synchronized with the batch, busyspin for
> its completion.
>
> (This has far deeper implications, since it means the GPU can still be
> writing to memory after release.)
>
> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
> Cc: Lionel Landwerlin <lionel.g.landwerlin at intel.com>
> Cc: Matthew Auld <matthew.auld at intel.com>
> ---
> tests/perf.c | 93 +++++++++++++++++++++++++++++++++---------------------------
> 1 file changed, 52 insertions(+), 41 deletions(-)
>
> diff --git a/tests/perf.c b/tests/perf.c
> index a161c45d7..8c20fbe09 100644
> --- a/tests/perf.c
> +++ b/tests/perf.c
> @@ -706,47 +706,59 @@ emit_report_perf_count(struct intel_batchbuffer *batch,
> }
>
> static uint32_t
> -i915_get_one_gpu_timestamp(uint32_t *context_id)
> +i915_get_one_gpu_timestamp(void)
> {
> - drm_intel_bufmgr *bufmgr = drm_intel_bufmgr_gem_init(drm_fd, 4096);
> - drm_intel_context *mi_rpc_ctx = drm_intel_gem_context_create(bufmgr);
> - drm_intel_bo *mi_rpc_bo = drm_intel_bo_alloc(bufmgr, "mi_rpc dest bo", 4096, 64);
> - struct intel_batchbuffer *mi_rpc_batch = intel_batchbuffer_alloc(bufmgr, devid);
> - int ret;
> - uint32_t timestamp;
> -
> - drm_intel_bufmgr_gem_enable_reuse(bufmgr);
> -
> - if (context_id) {
> - ret = drm_intel_gem_context_get_id(mi_rpc_ctx, context_id);
> - igt_assert_eq(ret, 0);
> - }
> -
> - igt_assert(mi_rpc_ctx);
> - igt_assert(mi_rpc_bo);
> - igt_assert(mi_rpc_batch);
> -
> - ret = drm_intel_bo_map(mi_rpc_bo, true);
> - igt_assert_eq(ret, 0);
> - memset(mi_rpc_bo->virtual, 0x80, 4096);
> - drm_intel_bo_unmap(mi_rpc_bo);
> -
> - emit_report_perf_count(mi_rpc_batch,
> - mi_rpc_bo, /* dst */
> - 0, /* dst offset in bytes */
> - 0xdeadbeef); /* report ID */
> + struct drm_i915_gem_execbuffer2 execbuf;
> + struct drm_i915_gem_exec_object2 obj[2];
> + struct drm_i915_gem_relocation_entry reloc;
> + uint32_t *ptr, timestamp;
> + struct timespec tv = {};
> + int i;
>
> - intel_batchbuffer_flush_with_context(mi_rpc_batch, mi_rpc_ctx);
> + memset(obj, 0, sizeof(obj));
> + obj[0].handle = gem_create(drm_fd, 4096);
> + ptr = gem_mmap__cpu(drm_fd, obj[0].handle, 0, 4096, PROT_WRITE);
> + memset(ptr, 0x80, 4096);
> + munmap(ptr, 4096);
> +
> + obj[1].handle = gem_create(drm_fd, 4096);
> + obj[1].relocs_ptr = to_user_pointer(&reloc);
> + obj[1].relocation_count = 1;
> + ptr = gem_mmap__cpu(drm_fd, obj[1].handle, 0, 4096, PROT_WRITE);
> +
> + memset(&reloc, 0, sizeof(reloc));
> + reloc.target_handle = obj[0].handle;
> + reloc.offset = sizeof(uint32_t);
> + reloc.read_domains = I915_GEM_DOMAIN_RENDER;
> + reloc.write_domain = I915_GEM_DOMAIN_RENDER;
> +
> + i = 2;
> + ptr[0] = GEN6_MI_REPORT_PERF_COUNT;
> + if (intel_gen(devid) >= 8)
> + ptr[0]++, i++; /* 64b reloc */
> + ptr[i++] = 0xdeadbeef;
> + ptr[i] = MI_BATCH_BUFFER_END;
> + munmap(ptr, 4096);
> +
> + memset(&execbuf, 0, sizeof(execbuf));
> + execbuf.buffers_ptr = to_user_pointer(obj);
> + execbuf.buffer_count = 2;
> + execbuf.batch_len = 4096;
> + gem_execbuf(drm_fd, &execbuf);
> + gem_close(drm_fd, obj[1].handle);
>
> - ret = drm_intel_bo_map(mi_rpc_bo, false /* write enable */);
> - igt_assert_eq(ret, 0);
> - timestamp = ((uint32_t *)mi_rpc_bo->virtual)[1];
> - drm_intel_bo_unmap(mi_rpc_bo);
> + /*
> + * MI_REPORT_PERF_COUNT is unserialised, i.e. not flushed by
> + * the PIPECONTROLs surrounding batch execution. Ergo, we must
> + * manually wait.
> + */
> + do {
> + gem_read(drm_fd, obj[0].handle, sizeof(uint32_t),
> + &timestamp, sizeof(timestamp));
> + } while (timestamp == 0x80808080 && !igt_seconds_elapsed(&tv));
> + gem_close(drm_fd, obj[0].handle);
>
> - drm_intel_bo_unreference(mi_rpc_bo);
> - intel_batchbuffer_free(mi_rpc_batch);
> - drm_intel_gem_context_destroy(mi_rpc_ctx);
> - drm_intel_bufmgr_destroy(bufmgr);
> + igt_assert_neq(timestamp, 0x80808080);
>
> return timestamp;
> }
> @@ -1915,7 +1927,6 @@ test_oa_exponents(void)
> uint32_t n_reports = 0;
> uint32_t n_idle_reports = 0;
> uint32_t n_reads = 0;
> - uint32_t context_id;
> uint64_t first_timestamp = 0;
> bool check_first_timestamp = true;
> struct drm_i915_perf_record_header *header;
> @@ -1944,7 +1955,7 @@ test_oa_exponents(void)
> * first timestamp as way to filter previously
> * scheduled work that would have configured
> * the OA unit at a different period. */
> - first_timestamp = i915_get_one_gpu_timestamp(&context_id);
> + first_timestamp = i915_get_one_gpu_timestamp();
>
> while (n_reads < ARRAY_SIZE(reads) &&
> n_reports < ARRAY_SIZE(reports)) {
> @@ -2070,8 +2081,8 @@ test_oa_exponents(void)
> uint32_t *rpt = NULL, *last = NULL, *last_periodic = NULL;
>
> igt_debug(" > More than 5%% error: avg_ts_delta = %"PRIu64", delta_delta = %"PRIu64", "
> - "expected_delta = %"PRIu64", first_timestamp = %"PRIu64" ctx_id=%"PRIu32"\n",
> - average_timestamp_delta, delta_delta, expected_timestamp_delta, first_timestamp, context_id);
> + "expected_delta = %"PRIu64", first_timestamp = %"PRIu64"\n",
> + average_timestamp_delta, delta_delta, expected_timestamp_delta, first_timestamp);
> for (int i = 0; i < (n_reports - 1); i++) {
> /* XXX: calculating with u32 arithmetic to account for overflow */
> uint32_t u32_delta =