[Intel-gfx] [PATCH igt 01/10] igt/perf_pmu: Tighten semaphore-wait measurement

Mon Dec 4 17:42:29 UTC 2017

On 04/12/2017 09:27, Chris Wilson wrote:
> Record the before/after semaphore-wait values around the sleep to try to
> reduce the inaccuracy from scheduler delays. Previously, the samples
> were taken before submitting the batch and then after synchronising its
> completion. The measurement will then be the total that the semaphore
> was being sampled, but with the extra syscalls intervening may have
> drifted from the sleep duration. To further reduce the disparity, wait
> for the batch to start executing before taking our samples.
> 
> References: https://bugs.freedesktop.org/show_bug.cgi?id=104013
> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
> Cc: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
> ---
>   tests/perf_pmu.c | 54 ++++++++++++++++++++++++++++++++----------------------
>   1 file changed, 32 insertions(+), 22 deletions(-)
> 
> diff --git a/tests/perf_pmu.c b/tests/perf_pmu.c
> index cb59bf5d..e872f4e5 100644
> --- a/tests/perf_pmu.c
> +++ b/tests/perf_pmu.c
> @@ -355,13 +355,13 @@ no_sema(int gem_fd, const struct intel_execution_engine2 *e, bool busy)
>   static void
>   sema_wait(int gem_fd, const struct intel_execution_engine2 *e)
>   {
> -	struct drm_i915_gem_relocation_entry reloc = { };
> -	struct drm_i915_gem_execbuffer2 eb = { };
> -	struct drm_i915_gem_exec_object2 obj[2];
> +	struct drm_i915_gem_relocation_entry reloc[2] = {};
> +	struct drm_i915_gem_exec_object2 obj[2] = {};
> +	struct drm_i915_gem_execbuffer2 eb = {};
>   	uint32_t bb_handle, obj_handle;
>   	unsigned long slept;
>   	uint32_t *obj_ptr;
> -	uint32_t batch[6];
> +	uint32_t batch[16];
>   	uint64_t val[2];
>   	int fd;
>   
> @@ -378,28 +378,35 @@ sema_wait(int gem_fd, const struct intel_execution_engine2 *e)
>   
>   	obj_ptr = gem_mmap__wc(gem_fd, obj_handle, 0, 4096, PROT_WRITE);
>   
> -	batch[0] = MI_SEMAPHORE_WAIT |
> +	batch[0] = MI_STORE_DWORD_IMM;
> +	batch[1] = sizeof(*obj_ptr);
> +	batch[2] = 0;
> +	batch[3] = 1;
> +	batch[4] = MI_SEMAPHORE_WAIT |
>   		   MI_SEMAPHORE_POLL |
>   		   MI_SEMAPHORE_SAD_GTE_SDD;
> -	batch[1] = 1;
> -	batch[2] = 0x0;
> -	batch[3] = 0x0;
> -	batch[4] = MI_NOOP;
> -	batch[5] = MI_BATCH_BUFFER_END;
> +	batch[5] = 1;
> +	batch[6] = 0x0;
> +	batch[7] = 0x0;
> +	batch[8] = MI_BATCH_BUFFER_END;
>   
>   	gem_write(gem_fd, bb_handle, 0, batch, sizeof(batch));
>   
> -	reloc.target_handle = obj_handle;
> -	reloc.offset = 2 * sizeof(uint32_t);
> -	reloc.read_domains = I915_GEM_DOMAIN_RENDER;
> +	reloc[0].target_handle = obj_handle;
> +	reloc[0].offset = 1 * sizeof(uint32_t);
> +	reloc[0].read_domains = I915_GEM_DOMAIN_RENDER;
> +	reloc[0].write_domain = I915_GEM_DOMAIN_RENDER;
> +	reloc[0].delta = sizeof(*obj_ptr);
>   
> -	memset(obj, 0, sizeof(obj));
> +	reloc[1].target_handle = obj_handle;
> +	reloc[1].offset = 6 * sizeof(uint32_t);
> +	reloc[1].read_domains = I915_GEM_DOMAIN_RENDER;
>   
>   	obj[0].handle = obj_handle;
>   
>   	obj[1].handle = bb_handle;
> -	obj[1].relocation_count = 1;
> -	obj[1].relocs_ptr = to_user_pointer(&reloc);
> +	obj[1].relocation_count = 2;
> +	obj[1].relocs_ptr = to_user_pointer(reloc);
>   
>   	eb.buffer_count = 2;
>   	eb.buffers_ptr = to_user_pointer(obj);
> @@ -413,18 +420,21 @@ sema_wait(int gem_fd, const struct intel_execution_engine2 *e)
>   
>   	fd = open_pmu(I915_PMU_ENGINE_SEMA(e->class, e->instance));
>   
> -	val[0] = pmu_read_single(fd);
> -
>   	gem_execbuf(gem_fd, &eb);
> +	do { /* wait for the batch to start executing */
> +		usleep(5e3);
> +	} while (!obj_ptr[1]);
> +	usleep(5e3); /* wait for the register sampling */
>   
> +	val[0] = pmu_read_single(fd);
>   	slept = measured_usleep(batch_duration_ns / 1000);
> +	val[1] = pmu_read_single(fd);
> +	igt_debug("slept %.3fms, sampled %.3fms\n",
> +		  slept*1e-6, (val[1] - val[0])*1e-6);
>   
> -	*obj_ptr = 1;
> -
> +	obj_ptr[0] = 1;
>   	gem_sync(gem_fd, bb_handle);
>   
> -	val[1] = pmu_read_single(fd);
> -
>   	munmap(obj_ptr, 4096);
>   	gem_close(gem_fd, obj_handle);
>   	gem_close(gem_fd, bb_handle);
> 

My first try would be to just sample around usleep, but not complaints 
on the more involved solution either.

Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin at intel.com>

Regards,

Tvrtko