[Intel-gfx] [PATCH i-g-t 05/11] tests/perf: rework oa-exponent test
Matthew Auld
matthew.william.auld at gmail.com
Thu Aug 10 13:15:15 UTC 2017
On 4 August 2017 at 12:20, Lionel Landwerlin
<lionel.g.landwerlin at intel.com> wrote:
> New issues were discovered while making the tests work on Gen8+:
>
> - we need to measure timings between periodic reports and discard all
> other kinds of reports
>
> - the periodicity of the reports can be affected by more than just RC6
> (e.g. a frequency change); we can detect this by looking at the ratio
> of clock cycles to timestamp delta between reports
I think this would be easier to review if we split it into two patches...
Also, somewhat worryingly, I've yet to see the oa-exponents test pass
on my BDW machine. See here:
https://paste.fedoraproject.org/paste/SmOw7eHEGOTjrLsvPVcZOw/raw
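Just to check my understanding of the clock-change detection: with a
stable GPU clock the ratio of GPU ticks to timestamp ticks between two
periodic reports should stay constant, so a jump in that ratio between
consecutive pairs means the frequency changed and the whole series gets
thrown away. Untested sketch of what I think the check amounts to
(report[1] = timestamp, report[3] = GPU ticks, as in the gen8 formats;
frequency_changed() is just my name for it, not something in the patch):

static bool
frequency_changed(const uint32_t *r0, const uint32_t *r1,
		  const uint32_t *r2, double percent_margin)
{
	/* GPU ticks per timestamp unit over two consecutive periods. */
	double ratio_a = (double) (r1[3] - r0[3]) / (r1[1] - r0[1]);
	double ratio_b = (double) (r2[3] - r1[3]) / (r2[1] - r1[1]);
	double diff = ratio_b > ratio_a ? ratio_b - ratio_a :
					  ratio_a - ratio_b;

	/* A jump larger than the margin suggests a clock change. */
	return diff > ratio_a * percent_margin / 100.0;
}

If that's the intent, a comment in the test spelling it out would help.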
>
> Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin at intel.com>
> ---
> tests/perf.c | 786 ++++++++++++++++++++++++++++++++++++++++++++---------------
> 1 file changed, 594 insertions(+), 192 deletions(-)
>
> diff --git a/tests/perf.c b/tests/perf.c
> index 7f9bc66d..5c7a2a34 100644
> --- a/tests/perf.c
> +++ b/tests/perf.c
> @@ -28,6 +28,7 @@
> #include <fcntl.h>
> #include <inttypes.h>
> #include <errno.h>
> +#include <signal.h>
> #include <sys/stat.h>
> #include <sys/time.h>
> #include <sys/times.h>
> @@ -300,6 +301,25 @@ static uint32_t (*read_report_ticks)(uint32_t *report,
> static void (*sanity_check_reports)(uint32_t *oa_report0, uint32_t *oa_report1,
> enum drm_i915_oa_format format);
>
> +static bool
> +timestamp_delta_within(uint32_t delta,
> + uint32_t expected_delta,
> + uint32_t margin)
> +{
> + return delta >= (expected_delta - margin) &&
> + delta <= (expected_delta + margin);
> +}
> +
> +static bool
> +double_value_within(double value,
> + double expected,
> + double percent_margin)
> +{
> + return value >= (expected - expected * percent_margin / 100.0) &&
> + value <= (expected + expected * percent_margin / 100.0);
> +
> +}
> +
> static void
> __perf_close(int fd)
> {
> @@ -476,6 +496,20 @@ gen8_read_report_ticks(uint32_t *report, enum drm_i915_oa_format format)
> return report[3];
> }
>
> +static void
> +gen8_read_report_clock_ratios(uint32_t *report,
> + uint32_t *slice_freq_mhz,
> + uint32_t *unslice_freq_mhz)
> +{
> + uint32_t unslice_freq = report[0] & 0x1ff;
> + uint32_t slice_freq_low = (report[0] >> 25) & 0x7f;
> + uint32_t slice_freq_high = (report[0] >> 9) & 0x3;
> + uint32_t slice_freq = slice_freq_low | (slice_freq_high << 7);
> +
> + *slice_freq_mhz = (slice_freq * 16666) / 1000;
> + *unslice_freq_mhz = (unslice_freq * 16666) / 1000;
> +}
> +
> static const char *
> gen8_read_report_reason(const uint32_t *report)
> {
> @@ -498,29 +532,6 @@ gen8_read_report_reason(const uint32_t *report)
> return "unknown";
> }
>
> -static bool
> -oa_report_is_periodic(uint32_t oa_exponent, const uint32_t *report)
> -{
> - if (IS_HASWELL(devid)) {
> - /* For Haswell we don't have a documented report reason field
> - * (though empirically report[0] bit 10 does seem to correlate
> - * with a timer trigger reason) so we instead infer which
> - * reports are timer triggered by checking if the least
> - * significant bits are zero and the exponent bit is set.
> - */
> - uint32_t oa_exponent_mask = (1 << (oa_exponent + 1)) - 1;
> -
> - if ((report[1] & oa_exponent_mask) != (1 << oa_exponent))
> - return true;
> - } else {
> - if ((report[0] >> OAREPORT_REASON_SHIFT) &
> - OAREPORT_REASON_TIMER)
> - return true;
> - }
> -
> - return false;
> -}
> -
> static uint64_t
> timebase_scale(uint32_t u32_delta)
> {
> @@ -568,6 +579,29 @@ oa_exponent_to_ns(int exponent)
> }
>
> static bool
> +oa_report_is_periodic(uint32_t oa_exponent, const uint32_t *report)
> +{
> + if (IS_HASWELL(devid)) {
> + /* For Haswell we don't have a documented report reason field
> + * (though empirically report[0] bit 10 does seem to correlate
> + * with a timer trigger reason) so we instead infer which
> + * reports are timer triggered by checking if the least
> + * significant bits are zero and the exponent bit is set.
> + */
> + uint32_t oa_exponent_mask = (1 << (oa_exponent + 1)) - 1;
> +
> + if ((report[1] & oa_exponent_mask) == (1 << oa_exponent))
> + return true;
> + } else {
> + if ((report[0] >> OAREPORT_REASON_SHIFT) &
> + OAREPORT_REASON_TIMER)
> + return true;
> + }
> +
> + return false;
> +}
> +
> +static bool
> oa_report_ctx_is_valid(uint32_t *report)
> {
> if (IS_HASWELL(devid)) {
> @@ -582,6 +616,128 @@ oa_report_ctx_is_valid(uint32_t *report)
> igt_assert(!"reached");
> }
>
> +static uint32_t
> +oa_report_get_ctx_id(uint32_t *report)
> +{
> + if (!oa_report_ctx_is_valid(report))
> + return 0xffffffff;
> + return report[2];
> +}
> +
> +static double
> +oa_reports_tick_per_period(uint32_t *report0, uint32_t *report1)
> +{
> + if (intel_gen(devid) < 8)
> + return 0.0;
> +
> + /* Measure the number GPU tick delta to timestamp delta. */
> + return (double) (report1[3] - report0[3]) /
> + (double) (report1[1] - report0[1]);
> +}
> +
> +static void
> +scratch_buf_memset(drm_intel_bo *bo, int width, int height, uint32_t color)
> +{
> + int ret;
> +
> + ret = drm_intel_bo_map(bo, true /* writable */);
> + igt_assert_eq(ret, 0);
> +
> + for (int i = 0; i < width * height; i++)
> + ((uint32_t *)bo->virtual)[i] = color;
> +
> + drm_intel_bo_unmap(bo);
> +}
> +
> +static void
> +scratch_buf_init(drm_intel_bufmgr *bufmgr,
> + struct igt_buf *buf,
> + int width, int height,
> + uint32_t color)
> +{
> + size_t stride = width * 4;
> + size_t size = stride * height;
> + drm_intel_bo *bo = drm_intel_bo_alloc(bufmgr, "", size, 4096);
> +
> + scratch_buf_memset(bo, width, height, color);
> +
> + buf->bo = bo;
> + buf->stride = stride;
> + buf->tiling = I915_TILING_NONE;
> + buf->size = size;
> +}
> +
> +static void
> +emit_report_perf_count(struct intel_batchbuffer *batch,
> + drm_intel_bo *dst_bo,
> + int dst_offset,
> + uint32_t report_id)
> +{
> + if (IS_HASWELL(devid)) {
> + BEGIN_BATCH(3, 1);
> + OUT_BATCH(GEN6_MI_REPORT_PERF_COUNT);
> + OUT_RELOC(dst_bo, I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
> + dst_offset);
> + OUT_BATCH(report_id);
> + ADVANCE_BATCH();
> + } else {
> + /* XXX: NB: n dwords arg is actually magic since it internally
> + * automatically accounts for larger addresses on gen >= 8...
> + */
> + BEGIN_BATCH(3, 1);
> + OUT_BATCH(GEN8_MI_REPORT_PERF_COUNT);
> + OUT_RELOC(dst_bo, I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
> + dst_offset);
> + OUT_BATCH(report_id);
> + ADVANCE_BATCH();
> + }
> +}
> +
> +static uint32_t
> +i915_get_one_gpu_timestamp(uint32_t *context_id)
> +{
> + drm_intel_bufmgr *bufmgr = drm_intel_bufmgr_gem_init(drm_fd, 4096);
> + drm_intel_context *mi_rpc_ctx = drm_intel_gem_context_create(bufmgr);
> + drm_intel_bo *mi_rpc_bo = drm_intel_bo_alloc(bufmgr, "mi_rpc dest bo", 4096, 64);
> + struct intel_batchbuffer *mi_rpc_batch = intel_batchbuffer_alloc(bufmgr, devid);
> + int ret;
> + uint32_t timestamp;
> +
> + drm_intel_bufmgr_gem_enable_reuse(bufmgr);
> +
> + if (context_id) {
> + ret = drm_intel_gem_context_get_id(mi_rpc_ctx, context_id);
> + igt_assert_eq(ret, 0);
> + }
> +
> + igt_assert(mi_rpc_ctx);
> + igt_assert(mi_rpc_bo);
> + igt_assert(mi_rpc_batch);
> +
> + ret = drm_intel_bo_map(mi_rpc_bo, true);
> + igt_assert_eq(ret, 0);
> + memset(mi_rpc_bo->virtual, 0x80, 4096);
> + drm_intel_bo_unmap(mi_rpc_bo);
> +
> + emit_report_perf_count(mi_rpc_batch,
> + mi_rpc_bo, /* dst */
> + 0, /* dst offset in bytes */
> + 0xdeadbeef); /* report ID */
> +
> + intel_batchbuffer_flush_with_context(mi_rpc_batch, mi_rpc_ctx);
> +
> + ret = drm_intel_bo_map(mi_rpc_bo, false /* write enable */);
> + igt_assert_eq(ret, 0);
> + timestamp = ((uint32_t *)mi_rpc_bo->virtual)[1];
> + drm_intel_bo_unmap(mi_rpc_bo);
> +
> + drm_intel_bo_unreference(mi_rpc_bo);
> + intel_batchbuffer_free(mi_rpc_batch);
> + drm_intel_gem_context_destroy(mi_rpc_ctx);
> + drm_intel_bufmgr_destroy(bufmgr);
> +
> + return timestamp;
> +}
>
> static void
> hsw_sanity_check_render_basic_reports(uint32_t *oa_report0, uint32_t *oa_report1,
> @@ -991,6 +1147,16 @@ gt_frequency_range_save(void)
> gt_max_freq_mhz = gt_max_freq_mhz_saved;
> }
>
> +static void wait_freq_settle(void)
> +{
> + struct timespec ts;
> +
> + /* FIXME: Lazy sleep without check. */
> + ts.tv_sec = 0;
> + ts.tv_nsec = 20000;
> + clock_nanosleep(CLOCK_MONOTONIC, 0, &ts, NULL);
> +}
The oa-exponents test is no longer a user of gt_frequency_pin, so why
are we doing this here? Also we just end up removing this later...
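If a settle helper does end up being needed somewhere, polling the
actual frequency seems preferable to a blind 20us sleep. Untested
sketch, reusing the existing sysfs_read() helper and assuming
gt_act_freq_mhz reports the requested value once things settle:

static void wait_freq_settle(int gt_freq_mhz)
{
	/* Poll for up to ~100ms, checking every 1ms. */
	for (int i = 0; i < 100; i++) {
		if (sysfs_read("gt_act_freq_mhz") == gt_freq_mhz)
			return;
		usleep(1000);
	}

	igt_debug("GT frequency didn't settle to %dmhz\n", gt_freq_mhz);
}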
> +
> static void
> gt_frequency_pin(int gt_freq_mhz)
> {
> @@ -1005,6 +1171,8 @@ gt_frequency_pin(int gt_freq_mhz)
> }
> gt_min_freq_mhz = gt_freq_mhz;
> gt_max_freq_mhz = gt_freq_mhz;
> +
> + wait_freq_settle();
> }
>
> static void
> @@ -1084,7 +1252,6 @@ i915_read_reports_until_timestamp(enum drm_i915_oa_format oa_format,
> return total_len;
> }
>
> -
> /* CAP_SYS_ADMIN is required to open system wide metrics, unless the system
> * control parameter dev.i915.perf_stream_paranoid == 0 */
> static void
> @@ -1400,20 +1567,6 @@ open_and_read_2_oa_reports(int format_id,
> }
>
> static void
> -gen8_read_report_clock_ratios(uint32_t *report,
> - uint32_t *slice_freq_mhz,
> - uint32_t *unslice_freq_mhz)
> -{
> - uint32_t unslice_freq = report[0] & 0x1ff;
> - uint32_t slice_freq_low = (report[0] >> 25) & 0x7f;
> - uint32_t slice_freq_high = (report[0] >> 9) & 0x3;
> - uint32_t slice_freq = slice_freq_low | (slice_freq_high << 7);
> -
> - *slice_freq_mhz = (slice_freq * 16666) / 1000;
> - *unslice_freq_mhz = (unslice_freq * 16666) / 1000;
> -}
> -
> -static void
> print_reports(uint32_t *oa_report0, uint32_t *oa_report1, int fmt)
> {
> igt_debug("TIMESTAMP: 1st = %"PRIu32", 2nd = %"PRIu32", delta = %"PRIu32"\n",
> @@ -1600,128 +1753,445 @@ test_oa_formats(void)
> }
> }
>
> -static void
> -test_oa_exponents(int gt_freq_mhz)
> +
> +enum load {
> + LOW,
> + HIGH
> +};
> +
> +static struct load_helper {
> + int devid;
> + int has_ppgtt;
> + drm_intel_bufmgr *bufmgr;
> + drm_intel_context *context;
> + uint32_t context_id;
> + struct intel_batchbuffer *batch;
> + drm_intel_bo *target_buffer;
> + enum load load;
> + bool exit;
> + struct igt_helper_process igt_proc;
> + struct igt_buf src, dst;
> +} lh = { 0, };
> +
> +static void load_helper_signal_handler(int sig)
> {
> - uint32_t freq_margin;
> + if (sig == SIGUSR2)
> + lh.load = lh.load == LOW ? HIGH : LOW;
> + else
> + lh.exit = true;
> +}
>
> - /* This test tries to use the sysfs interface for pinning the GT
> - * frequency so we have another point of reference for comparing with
> - * the clock frequency as derived from OA reports.
> - *
> - * This test has been finicky to stabilise while the
> - * gt_min/max_freq_mhz files in sysfs don't seem to be a reliable
> - * mechanism for fixing the gpu frequency.
> - *
> - * Since these unit tests are focused on the OA unit not the ability to
> - * pin the frequency via sysfs we make the test account for pinning not
> - * being reliable and read back the current frequency for each
> - * iteration of this test to take this into account.
> - */
> - gt_frequency_pin(gt_freq_mhz);
> +#define LOAD_HELPER_PAUSE_USEC 500
> +#define LOAD_HELPER_BO_SIZE (16*1024*1024)
What do we need this for?
> +static void load_helper_set_load(enum load load)
> +{
> + igt_assert(lh.igt_proc.running);
>
> - igt_debug("Testing OA timer exponents with requested GT frequency = %dmhz\n",
> - gt_freq_mhz);
> + if (lh.load == load)
> + return;
>
> - /* allow a +- 10% error margin when checking that the frequency
> - * calculated from the OA reports matches the frequency according to
> - * sysfs.
> + lh.load = load;
> + kill(lh.igt_proc.pid, SIGUSR2);
> +}
> +
> +static void load_helper_run(enum load load)
> +{
> + /*
> + * FIXME fork helpers won't get cleaned up when started from within a
> + * subtest, so handle the case where it sticks around a bit too long.
> */
> - freq_margin = gt_freq_mhz * 0.1;
> + if (lh.igt_proc.running) {
> + load_helper_set_load(load);
> + return;
> + }
> +
> + lh.load = load;
> +
> + igt_fork_helper(&lh.igt_proc) {
> + signal(SIGUSR1, load_helper_signal_handler);
> + signal(SIGUSR2, load_helper_signal_handler);
> +
> + while (!lh.exit) {
> + int ret;
> +
> + render_copy(lh.batch,
> + lh.context,
> + &lh.src, 0, 0, 1920, 1080,
> + &lh.dst, 0, 0);
> +
> + intel_batchbuffer_flush_with_context(lh.batch,
> + lh.context);
> +
> + ret = drm_intel_gem_context_get_id(lh.context,
> + &lh.context_id);
> + igt_assert_eq(ret, 0);
> +
> + drm_intel_bo_wait_rendering(lh.dst.bo);
> +
> + /* Lower the load by pausing after every submitted
> + * write. */
> + if (lh.load == LOW)
> + usleep(LOAD_HELPER_PAUSE_USEC);
> + }
> + }
> +}
> +
> +static void load_helper_stop(void)
> +{
> + kill(lh.igt_proc.pid, SIGUSR1);
> + igt_assert(igt_wait_helper(&lh.igt_proc) == 0);
> +}
> +
> +static void load_helper_init(void)
> +{
> + int ret;
> +
> + lh.devid = intel_get_drm_devid(drm_fd);
> + lh.has_ppgtt = gem_uses_ppgtt(drm_fd);
> +
> + /* MI_STORE_DATA can only use GTT address on gen4+/g33 and needs
> + * snoopable mem on pre-gen6. Hence load-helper only works on gen6+, but
> + * that's also all we care about for the rps testcase*/
> + igt_assert(intel_gen(lh.devid) >= 6);
> + lh.bufmgr = drm_intel_bufmgr_gem_init(drm_fd, 4096);
> + igt_assert(lh.bufmgr);
> +
> + drm_intel_bufmgr_gem_enable_reuse(lh.bufmgr);
> +
> + lh.context = drm_intel_gem_context_create(lh.bufmgr);
> + igt_assert(lh.context);
> +
> + lh.context_id = 0xffffffff;
> + ret = drm_intel_gem_context_get_id(lh.context, &lh.context_id);
> + igt_assert_eq(ret, 0);
> + igt_assert_neq(lh.context_id, 0xffffffff);
> +
> + lh.batch = intel_batchbuffer_alloc(lh.bufmgr, lh.devid);
> + igt_assert(lh.batch);
> +
> + scratch_buf_init(lh.bufmgr, &lh.dst, 1920, 1080, 0);
> + scratch_buf_init(lh.bufmgr, &lh.src, 1920, 1080, 0);
> +}
> +
> +static void load_helper_deinit(void)
Nit: load_helper_fini(), to match the usual igt naming?
> +{
> + if (lh.igt_proc.running)
> + load_helper_stop();
> +
> + if (lh.src.bo)
> + drm_intel_bo_unreference(lh.src.bo);
> + if (lh.dst.bo)
> + drm_intel_bo_unreference(lh.dst.bo);
> +
> + if (lh.batch)
> + intel_batchbuffer_free(lh.batch);
> +
> + if (lh.context)
> + drm_intel_gem_context_destroy(lh.context);
> +
> + if (lh.bufmgr)
> + drm_intel_bufmgr_destroy(lh.bufmgr);
> +}
> +
> +static void
> +test_oa_exponents(void)
> +{
> + load_helper_init();
> + load_helper_run(HIGH);
>
> /* It's asking a lot to sample with a 160 nanosecond period and the
> * test can fail due to buffer overflows if it wasn't possible to
> * keep up, so we don't start from an exponent of zero...
> */
> - for (int i = 5; i < 20; i++) {
> - uint32_t expected_timestamp_delta;
> - uint32_t timestamp_delta;
> - uint32_t oa_report0[64];
> - uint32_t oa_report1[64];
> + for (int exponent = 5; exponent < 18; exponent++) {
> + uint64_t expected_timestamp_delta;
> uint32_t time_delta;
> - uint32_t clock_delta;
> - uint32_t freq;
> int n_tested = 0;
> - int n_freq_matches = 0;
> int n_time_delta_matches = 0;
>
> -#warning "XXX: it seems pretty odd that the time delta assertion failures centre around these exponents"
> - if (i == 6 || i == 7 || i == 8)
> - continue;
> -
> /* The exponent is effectively selecting a bit in the timestamp
> * to trigger reports on and so in practice we expect the raw
> * timestamp deltas for periodic reports to exactly match the
> * value of next bit.
> */
> - expected_timestamp_delta = 2 << i;
> + expected_timestamp_delta = 2UL << exponent;
>
> for (int j = 0; n_tested < 10 && j < 100; j++) {
> - int gt_freq_mhz_0, gt_freq_mhz_1;
> - uint32_t ticks0, ticks1;
> + uint64_t properties[] = {
> + /* Include OA reports in samples */
> + DRM_I915_PERF_PROP_SAMPLE_OA, true,
> +
> + /* OA unit configuration */
> + DRM_I915_PERF_PROP_OA_METRICS_SET, test_metric_set_id,
> + DRM_I915_PERF_PROP_OA_FORMAT, test_oa_format,
> + DRM_I915_PERF_PROP_OA_EXPONENT, exponent,
> + };
> + struct drm_i915_perf_open_param param = {
> + .flags = I915_PERF_FLAG_FD_CLOEXEC,
> + .num_properties = ARRAY_SIZE(properties) / 2,
> + .properties_ptr = to_user_pointer(properties),
> + };
> + int ret;
> + uint64_t average_timestamp_delta;
> + uint32_t n_reports = 0;
> + uint32_t n_report_lost = 0;
> + uint32_t n_idle_reports = 0;
> + uint32_t n_reads = 0;
> + uint32_t context_id;
> + uint64_t first_timestamp = 0;
> + bool check_first_timestamp = true;
> + struct drm_i915_perf_record_header *header;
> + uint64_t delta_delta;
> + struct {
> + uint32_t report[64];
> + } reports[30];
> + struct {
> + uint8_t *buf;
> + size_t len;
> + } reads[1000];
> + double error;
> + double tick_per_period;
> +
> + igt_debug("ITER %d: testing OA exponent %d,"
> + " expected ts delta = %"PRIu64" (%"PRIu64"ns/%.2fus/%.2fms)\n",
> + j, exponent,
> + expected_timestamp_delta,
> + oa_exponent_to_ns(exponent),
> + oa_exponent_to_ns(exponent) / 1000.0,
> + oa_exponent_to_ns(exponent) / (1000.0 * 1000.0));
>
> - gt_freq_mhz_0 = sysfs_read("gt_act_freq_mhz");
> + stream_fd = __perf_open(drm_fd, ¶m);
>
> - igt_debug("ITER %d: testing OA exponent %d (period = %"PRIu64"ns) with sysfs GT freq = %dmhz +- %u\n",
> - j, i,
> - oa_exponent_to_ns(i),
> - gt_freq_mhz_0, freq_margin);
> + /* Right after opening the OA stream, read a
> + * first timestamp as way to filter previously
> + * scheduled work that would have configured
> + * the OA unit at a different period. */
> + first_timestamp = i915_get_one_gpu_timestamp(&context_id);
>
> - open_and_read_2_oa_reports(test_oa_format,
> - i, /* exponent */
> - oa_report0,
> - oa_report1,
> - true); /* timer triggered
> - reports only */
> + while (n_reads < ARRAY_SIZE(reads) &&
> + n_reports < ARRAY_SIZE(reports)) {
> + const size_t buf_size = 1024 * 1024;
> + uint8_t *buf = reads[n_reads++].buf = calloc(1, buf_size);
>
> - gt_freq_mhz_1 = sysfs_read("gt_act_freq_mhz");
> + while ((ret = read(stream_fd, buf, buf_size)) < 0 &&
> + errno == EINTR)
> + ;
>
> - /* If it looks like the frequency has changed according
> - * to sysfs then skip looking at this pair of reports
> - */
> - if (gt_freq_mhz_0 != gt_freq_mhz_1) {
> - igt_debug("skipping OA reports pair due to GT frequency change according to sysfs\n");
> - continue;
> - }
> + /* We should never have no data. */
> + igt_assert(ret > 0);
> + reads[n_reads - 1].len = ret;
>
> - timestamp_delta = oa_report1[1] - oa_report0[1];
> - igt_assert_neq(timestamp_delta, 0);
> + igt_debug(" > read %i bytes\n", ret);
>
> - if (timestamp_delta == expected_timestamp_delta)
> - n_time_delta_matches++;
> - else {
> - igt_debug("timestamp delta mismatch: %"PRIu64"ns != expected %"PRIu64"ns, ts0 = %u/0x%x, ts1 = %u/0x%x\n",
> - timebase_scale(timestamp_delta),
> - timebase_scale(expected_timestamp_delta),
> - oa_report0[1], oa_report0[1],
> - oa_report1[1], oa_report1[1]);
> - print_reports(oa_report0, oa_report1, test_oa_format);
> - igt_assert(timestamp_delta <
> - (expected_timestamp_delta * 2));
> + for (int offset = 0;
> + offset < ret && n_reports < ARRAY_SIZE(reports);
> + offset += header->size) {
> + uint32_t *report;
> + double previous_tick_per_period;
> +
> + header = (void *)(buf + offset);
> +
> + if (header->type == DRM_I915_PERF_RECORD_OA_BUFFER_LOST) {
> + igt_assert(!"reached");
> + break;
> + }
> +
> + if (header->type == DRM_I915_PERF_RECORD_OA_REPORT_LOST) {
> + n_report_lost++;
> + n_reports = 0;
> + n_report_lost = 0;
> + n_idle_reports = 0;
> + for (int r = 0; r < n_reads; r++)
> + free(reads[r].buf);
> + n_reads = 0;
> + break;
> + }
> +
> + if (header->type != DRM_I915_PERF_RECORD_SAMPLE)
> + continue;
> +
> + report = (void *)(header + 1);
> +
> + /* Skip anything before the first
> + * timestamp, it might not be at the
> + * right periodic exponent. */
> + if (check_first_timestamp &&
> + report[1] < first_timestamp) {
> + igt_debug(" > Dropping ts=%u (prior %"PRIu64")\n",
> + report[1], first_timestamp);
> + continue;
> + }
> +
> + /* Once we've passed the first
> + * timestamp, no need to check. */
> + check_first_timestamp = false;
> +
> + if (!oa_report_ctx_is_valid(report))
> + n_idle_reports++;
> +
> + /* We only measure timestamps between
> + * periodic reports. */
> + if (!oa_report_is_periodic(exponent, report))
> + continue;
> +
> + igt_debug(" > write %i timestamp=%u\n", n_reports, report[1]);
> + memcpy(reports[n_reports].report, report,
> + sizeof(reports[n_reports].report));
> + n_reports++;
> +
> + previous_tick_per_period = tick_per_period;
> +
> + if (n_reports > 1) {
> + tick_per_period =
> + oa_reports_tick_per_period(reports[n_reports - 2].report,
> + reports[n_reports - 1].report);
> +
> + /* Dismiss the series of report
> + * if we notice clock frequency
> + * changes. */
> + if (!double_value_within(tick_per_period,
> + previous_tick_per_period, 5)) {
> + igt_debug("Noticed clock frequency change at ts=%u (%f / %f), "
> + "dropping reports and trying again\n",
> + report[1], previous_tick_per_period, tick_per_period);
> + n_reports = 0;
> + n_report_lost = 0;
> + n_idle_reports = 0;
> + for (int r = 0; r < n_reads; r++)
> + free(reads[r].buf);
> + n_reads = 0;
> + break;
> + }
> + }
> + }
> }
>
> - ticks0 = read_report_ticks(oa_report0, test_oa_format);
> - ticks1 = read_report_ticks(oa_report1, test_oa_format);
> - clock_delta = ticks1 - ticks0;
> + __perf_close(stream_fd);
> + igt_debug("closed stream\n");
>
> - time_delta = timebase_scale(timestamp_delta);
> + igt_assert_eq(n_reports, ARRAY_SIZE(reports));
>
> - freq = ((uint64_t)clock_delta * 1000) / time_delta;
> - igt_debug("ITER %d: time delta = %"PRIu32"(ns) clock delta = %"PRIu32" freq = %"PRIu32"(mhz)\n",
> - j, time_delta, clock_delta, freq);
> + average_timestamp_delta = 0;
> + for (int i = 0; i < (n_reports - 1); i++) {
> + /* XXX: calculating with u32 arithmetic to account for overflow */
> + uint32_t u32_delta = reports[i + 1].report[1] - reports[i].report[1];
>
> - if (freq < (gt_freq_mhz_1 + freq_margin) &&
> - freq > (gt_freq_mhz_1 - freq_margin))
> - n_freq_matches++;
> + average_timestamp_delta += u32_delta;
> + }
> + average_timestamp_delta /= (n_reports - 1);
> +
> + if (average_timestamp_delta > expected_timestamp_delta)
> + delta_delta = average_timestamp_delta - expected_timestamp_delta;
> + else
> + delta_delta = expected_timestamp_delta - average_timestamp_delta;
> + error = (delta_delta / (double)expected_timestamp_delta) * 100.0;
> +
> + time_delta = timebase_scale(average_timestamp_delta);
> +
> + igt_debug(" > Avg. time delta = %"PRIu32"(ns),"
> + " lost reports = %u, n idle reports = %u,"
> + " n reads = %u, error=%f\n",
> + time_delta, n_report_lost, n_idle_reports, n_reads, error);
> + if (error > 5) {
> + uint32_t *rpt = NULL, *last = NULL, *last_periodic = NULL;
> +
> + igt_debug(" > More than 5%% error: avg_ts_delta = %"PRIu64", delta_delta = %"PRIu64", "
> + "expected_delta = %"PRIu64", first_timestamp = %"PRIu64" ctx_id=%"PRIu32"\n",
> + average_timestamp_delta, delta_delta, expected_timestamp_delta, first_timestamp, context_id);
> + for (int i = 0; i < (n_reports - 1); i++) {
> + /* XXX: calculating with u32 arithmetic to account for overflow */
> + uint32_t u32_delta =
> + reports[i + 1].report[1] - reports[i].report[1];
> +
> + if (u32_delta > expected_timestamp_delta)
> + delta_delta = u32_delta - expected_timestamp_delta;
> + else
> + delta_delta = expected_timestamp_delta - u32_delta;
> + error = (delta_delta / (double)expected_timestamp_delta) * 100.0;
> +
> + igt_debug(" > ts=%u-%u timestamp delta from %2d to %2d = %-8u (error = %u%%, ctx_id = %x)\n",
> + reports[i + 1].report[1], reports[i].report[1],
> + i, i + 1, u32_delta, (unsigned)error,
> + oa_report_get_ctx_id(reports[i + 1].report));
> + }
> + for (int r = 0; r < n_reads; r++) {
> + igt_debug(" > read\n");
> + for (int offset = 0;
> + offset < reads[r].len;
> + offset += header->size) {
> + int counter_print = 1;
> + uint64_t a0 = 0, aN = 0;
> + double local_period = 0;
> +
> + header = (void *) &reads[r].buf[offset];
> +
> + if (header->type != DRM_I915_PERF_RECORD_SAMPLE) {
> + igt_debug(" > loss\n");
> + continue;
> + }
> +
> + rpt = (void *)(header + 1);
> +
> + if (last) {
> + a0 = gen8_read_40bit_a_counter(rpt, test_oa_format, 0) -
> + gen8_read_40bit_a_counter(last, test_oa_format, 0);
> + aN = gen8_read_40bit_a_counter(rpt, test_oa_format, 13) -
> + gen8_read_40bit_a_counter(last, test_oa_format, 13);
> + }
> +
> + if (last_periodic &&
> + oa_report_is_periodic(exponent, rpt)) {
> + local_period =
> + ((uint64_t) rpt[3] - last_periodic[3]) /
> + ((uint64_t) rpt[1] - last_periodic[1]);
> + }
> +
> + igt_debug(" > report ts=%u"
> + " ts_delta_last=%8u ts_delta_last_periodic=%8u is_timer=%i ctx_id=%8x gpu_ticks=%u period=%.2f A0=%lu A%i=%lu\n",
> + rpt[1],
> + (last != NULL) ? (rpt[1] - last[1]) : 0,
> + (last_periodic != NULL) ? (rpt[1] - last_periodic[1]) : 0,
> + oa_report_is_periodic(exponent, rpt),
> + oa_report_get_ctx_id(rpt),
> + (last != NULL) ? (rpt[3] - last[3]) : 0,
> + local_period,
> + a0, counter_print, aN);
counter_print is always 1 here, while aN is actually read from counter 13,
so the A%i label in the debug output is misleading...
> +
> + last = rpt;
> + if (oa_report_is_periodic(exponent, rpt))
> + last_periodic = rpt;
> + }
> + }
> +
> + igt_assert(!"reached");
> + }
> +
> + if (timestamp_delta_within(average_timestamp_delta,
> + expected_timestamp_delta,
> + expected_timestamp_delta * 0.05)) {
> + igt_debug(" > timestamp delta matching %"PRIu64"ns ~= expected %"PRIu64"! :)\n",
> + timebase_scale(average_timestamp_delta),
> + timebase_scale(expected_timestamp_delta));
> + n_time_delta_matches++;
> + } else {
> + igt_debug(" > timestamp delta mismatch: %"PRIu64"ns != expected %"PRIu64"ns\n",
> + timebase_scale(average_timestamp_delta),
> + timebase_scale(expected_timestamp_delta));
> + igt_assert(average_timestamp_delta <
> + (expected_timestamp_delta * 2));
> + }
>
> n_tested++;
> +
> + for (int r = 0; r < n_reads; r++)
> + free(reads[r].buf);
> }
>
> if (n_tested < 10)
> - igt_debug("sysfs frequency pinning too unstable for cross-referencing with OA derived frequency");
> + igt_debug("Too many test iterations had to be skipped\n");
> igt_assert_eq(n_tested, 10);
>
> - igt_debug("number of iterations with expected timestamp delta = %d\n",
> + igt_debug("Number of iterations with expected timestamp delta = %d\n",
> n_time_delta_matches);
>
> /* The HW doesn't give us any strict guarantee that the
> @@ -1730,21 +2200,12 @@ test_oa_exponents(int gt_freq_mhz)
> * so it a useful sanity check to assert quite strictly...
> */
> igt_assert(n_time_delta_matches >= 9);
> -
> - igt_debug("number of iterations with expected clock frequency = %d\n",
> - n_freq_matches);
> -
> - /* Don't assert the calculated frequency for extremely short
> - * durations.
> - *
> - * Allow some mismatches since can't be can't be sure about
> - * frequency changes between sysfs reads.
> - */
> - if (i > 3)
> - igt_assert(n_freq_matches >= 7);
> }
>
> gt_frequency_range_restore();
Don't we need to get rid of this also?
> +
> + load_helper_stop();
> + load_helper_deinit();
> }
>
> /* The OA exponent selects a timestamp counter bit to trigger reports on.
> @@ -2611,32 +3072,6 @@ test_disabled_read_error(void)
> }
>
> static void
> -emit_report_perf_count(struct intel_batchbuffer *batch,
> - drm_intel_bo *dst_bo,
> - int dst_offset,
> - uint32_t report_id)
> -{
> - if (IS_HASWELL(devid)) {
> - BEGIN_BATCH(3, 1);
> - OUT_BATCH(GEN6_MI_REPORT_PERF_COUNT);
> - OUT_RELOC(dst_bo, I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
> - dst_offset);
> - OUT_BATCH(report_id);
> - ADVANCE_BATCH();
> - } else {
> - /* XXX: NB: n dwords arg is actually magic since it internally
> - * automatically accounts for larger addresses on gen >= 8...
> - */
> - BEGIN_BATCH(3, 1);
> - OUT_BATCH(GEN8_MI_REPORT_PERF_COUNT);
> - OUT_RELOC(dst_bo, I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
> - dst_offset);
> - OUT_BATCH(report_id);
> - ADVANCE_BATCH();
> - }
> -}
> -
> -static void
> test_mi_rpc(void)
> {
> uint64_t properties[] = {
> @@ -2706,38 +3141,6 @@ test_mi_rpc(void)
> }
>
> static void
> -scratch_buf_memset(drm_intel_bo *bo, int width, int height, uint32_t color)
> -{
> - int ret;
> -
> - ret = drm_intel_bo_map(bo, true /* writable */);
> - igt_assert_eq(ret, 0);
> -
> - for (int i = 0; i < width * height; i++)
> - ((uint32_t *)bo->virtual)[i] = color;
> -
> - drm_intel_bo_unmap(bo);
> -}
> -
> -static void
> -scratch_buf_init(drm_intel_bufmgr *bufmgr,
> - struct igt_buf *buf,
> - int width, int height,
> - uint32_t color)
> -{
> - size_t stride = width * 4;
> - size_t size = stride * height;
> - drm_intel_bo *bo = drm_intel_bo_alloc(bufmgr, "", size, 4096);
> -
> - scratch_buf_memset(bo, width, height, color);
> -
> - buf->bo = bo;
> - buf->stride = stride;
> - buf->tiling = I915_TILING_NONE;
> - buf->size = size;
> -}
> -
> -static void
> emit_stall_timestamp_and_rpc(struct intel_batchbuffer *batch,
> drm_intel_bo *dst,
> int timestamp_offset,
> @@ -3996,8 +4399,7 @@ igt_main
> igt_subtest("low-oa-exponent-permissions")
> test_low_oa_exponent_permissions();
> igt_subtest("oa-exponents") {
> - test_oa_exponents(450);
> - test_oa_exponents(550);
> + test_oa_exponents();
> }
>
> igt_subtest("per-context-mode-unprivileged") {
> --
> 2.13.3
>