[Intel-gfx] [PATCH i-g-t 02/11] tests/perf: add per context filtering test for gen8+

Tue Aug 22 11:45:47 UTC 2017

On 08/08/17 15:21, Matthew Auld wrote:
> On 4 August 2017 at 12:20, Lionel Landwerlin
> <lionel.g.landwerlin at intel.com> wrote:
>> From: Robert Bragg <robert at sixbynine.org>
>>
>> Signed-off-by: Robert Bragg <robert at sixbynine.org>
>> Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin at intel.com>
>> ---
>>   tests/perf.c | 806 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++---
>>   1 file changed, 768 insertions(+), 38 deletions(-)
>>
>> diff --git a/tests/perf.c b/tests/perf.c
>> index 26581ce4..279ff0c6 100644
>> --- a/tests/perf.c
>> +++ b/tests/perf.c
>> @@ -48,7 +48,9 @@ IGT_TEST_DESCRIPTION("Test the i915 perf metrics streaming interface");
>>   #define OAREPORT_REASON_MASK           0x3f
>>   #define OAREPORT_REASON_SHIFT          19
>>   #define OAREPORT_REASON_TIMER          (1<<0)
>> +#define OAREPORT_REASON_INTERNAL       (3<<1)
>>   #define OAREPORT_REASON_CTX_SWITCH     (1<<3)
>> +#define OAREPORT_REASON_GO             (1<<4)
>>   #define OAREPORT_REASON_CLK_RATIO      (1<<5)
>>
>>   #define GFX_OP_PIPE_CONTROL     ((3 << 29) | (3 << 27) | (2 << 24))
>> @@ -565,6 +567,22 @@ oa_exponent_to_ns(int exponent)
>>          return 1000000000ULL * (2ULL << exponent) / timestamp_frequency;
>>   }
>>
>> +static bool
>> +oa_report_ctx_is_valid(uint32_t *report)
>> +{
>> +       if (IS_HASWELL(devid)) {
>> +               return false; /* TODO */
>> +       } else if (IS_GEN8(devid)) {
>> +               return report[0] & (1ul << 25);
>> +       } else if (IS_GEN9(devid)) {
>> +               return report[0] & (1ul << 16);
>> +       }
>> +
>> +       /* Need to update this function for newer Gen. */
>> +       igt_assert(!"reached");
>> +}
>> +
>> +
>>   static void
>>   hsw_sanity_check_render_basic_reports(uint32_t *oa_report0, uint32_t *oa_report1,
>>                                        enum drm_i915_oa_format fmt)
>> @@ -669,6 +687,100 @@ gen8_40bit_a_delta(uint64_t value0, uint64_t value1)
>>                  return value1 - value0;
>>   }
>>
>> +static void
>> +accumulate_uint32(size_t offset,
>> +                 uint32_t *report0,
>> +                  uint32_t *report1,
>> +                  uint64_t *delta)
>> +{
>> +       uint32_t value0 = *(uint32_t *)(((uint8_t *)report0) + offset);
>> +       uint32_t value1 = *(uint32_t *)(((uint8_t *)report1) + offset);
>> +
>> +       *delta += (uint32_t)(value1 - value0);
>> +}
>> +
>> +static void
>> +accumulate_uint40(int a_index,
>> +                  uint32_t *report0,
>> +                  uint32_t *report1,
>> +                 enum drm_i915_oa_format format,
>> +                  uint64_t *delta)
>> +{
>> +       uint64_t value0 = gen8_read_40bit_a_counter(report0, format, a_index),
>> +                value1 = gen8_read_40bit_a_counter(report1, format, a_index);
>> +
>> +       *delta += gen8_40bit_a_delta(value0, value1);
>> +}
>> +
>> +static void
>> +accumulate_reports(struct accumulator *accumulator,
>> +                  uint32_t *start,
>> +                  uint32_t *end)
>> +{
>> +       enum drm_i915_oa_format format = accumulator->format;
>> +       uint64_t *deltas = accumulator->deltas;
>> +       int idx = 0;
>> +
>> +       if (intel_gen(devid) >= 8) {
>> +               /* timestamp */
>> +               accumulate_uint32(4, start, end, deltas + idx++);
>> +
>> +               /* clock cycles */
>> +               accumulate_uint32(12, start, end, deltas + idx++);
>> +       } else {
>> +               /* timestamp */
>> +               accumulate_uint32(4, start, end, deltas + idx++);
>> +       }
>> +
>> +       for (int i = 0; i < oa_formats[format].n_a40; i++)
>> +               accumulate_uint40(i, start, end, format, deltas + idx++);
>> +
>> +       for (int i = 0; i < oa_formats[format].n_a; i++) {
>> +               accumulate_uint32(oa_formats[format].a_off + 4 * i,
>> +                                 start, end, deltas + idx++);
>> +       }
>> +
>> +       for (int i = 0; i < oa_formats[format].n_b; i++) {
>> +               accumulate_uint32(oa_formats[format].b_off + 4 * i,
>> +                                 start, end, deltas + idx++);
>> +       }
>> +
>> +       for (int i = 0; i < oa_formats[format].n_c; i++) {
>> +               accumulate_uint32(oa_formats[format].c_off + 4 * i,
>> +                                 start, end, deltas + idx++);
>> +       }
>> +}
>> +
>> +static void
>> +accumulator_print(struct accumulator *accumulator, const char *title)
>> +{
>> +       enum drm_i915_oa_format format = accumulator->format;
>> +       uint64_t *deltas = accumulator->deltas;
>> +       int idx = 0;
>> +
>> +       igt_debug("%s:\n", title);
>> +       if (intel_gen(devid) >= 8) {
>> +               igt_debug("\ttime delta = %lu\n", deltas[idx++]);
>> +               igt_debug("\tclock cycle delta = %lu\n", deltas[idx++]);
>> +
>> +               for (int i = 0; i < oa_formats[format].n_a40; i++)
>> +                       igt_debug("\tA%u = %lu\n", i, deltas[idx++]);
>> +       } else {
>> +               igt_debug("\ttime delta = %lu\n", deltas[idx++]);
>> +       }
>> +
>> +       for (int i = 0; i < oa_formats[format].n_a; i++) {
>> +               int a_id = oa_formats[format].first_a + i;
>> +               igt_debug("\tA%u = %lu\n", a_id, deltas[idx++]);
>> +       }
>> +
>> +       for (int i = 0; i < oa_formats[format].n_a; i++)
>> +               igt_debug("\tB%u = %lu\n", i, deltas[idx++]);
>> +
>> +       for (int i = 0; i < oa_formats[format].n_c; i++)
>> +               igt_debug("\tC%u = %lu\n", i, deltas[idx++]);
>> +}
>> +
>>   /* The TestOa metric set is designed so */
>>   static void
>>   gen8_sanity_check_test_oa_reports(uint32_t *oa_report0, uint32_t *oa_report1,
>> @@ -917,6 +1029,62 @@ gt_frequency_range_restore(void)
>>          gt_max_freq_mhz = gt_max_freq_mhz_saved;
>>   }
>>
>> +static int
>> +i915_read_reports_until_timestamp(enum drm_i915_oa_format oa_format,
>> +                                 uint8_t *buf,
>> +                                 uint32_t max_size,
>> +                                 uint32_t start_timestamp,
>> +                                 uint32_t end_timestamp)
>> +{
>> +       size_t format_size = oa_formats[oa_format].size;
>> +       uint32_t last_seen_timestamp = start_timestamp;
>> +       int total_len = 0;
>> +
>> +       while (last_seen_timestamp < end_timestamp) {
>> +               int offset, len;
>> +
>> +               /* Running out of space. */
>> +               if ((max_size - total_len) < format_size) {
>> +                       igt_warn("run out of space before reaching "
>> +                                "end timestamp (%u/%u)\n",
>> +                                last_seen_timestamp, end_timestamp);
>> +                       return -1;
>> +               }
>> +
>> +               while ((len = read(stream_fd, &buf[total_len],
>> +                                  max_size - total_len)) < 0 &&
>> +                      errno == EINTR)
>> +                       ;
>> +
>> +               /* Intentionally return an error. */
>> +               if (len <= 0) {
>> +                       if (errno == EAGAIN)
>> +                               return total_len;
>> +                       else {
>> +                               igt_warn("error read OA stream : %i\n", errno);
>> +                               return -1;
>> +                       }
>> +               }
>> +
>> +               offset = total_len;
>> +               total_len += len;
>> +
>> +               while (offset < total_len) {
>> +                 const struct drm_i915_perf_record_header *header =
>> +                   (const struct drm_i915_perf_record_header *) &buf[offset];
>> +                 uint32_t *report = (uint32_t *) (header + 1);
>> +
>> +                 if (header->type == DRM_I915_PERF_RECORD_SAMPLE)
>> +                   last_seen_timestamp = report[1];
>> +
>> +                 offset += header->size;
>> +               }
>> +       }
>> +
>> +       return total_len;
>> +}
>> +
>> +
>>   /* CAP_SYS_ADMIN is required to open system wide metrics, unless the system
>>    * control parameter dev.i915.perf_stream_paranoid == 0 */
>>   static void
>> @@ -1336,6 +1504,66 @@ print_reports(uint32_t *oa_report0, uint32_t *oa_report1, int fmt)
>>   }
>>
>>   static void
>> +print_report(uint32_t *report, int fmt)
>> +{
> I get an unused warning for this...

Useful for really precise debugging. Putting under ifdef

>
>
>> +       igt_debug("TIMESTAMP: %"PRIu32"\n", report[1]);
>> +
>> +       if (IS_HASWELL(devid) && oa_formats[fmt].n_c == 0) {
>> +               igt_debug("CLOCK = N/A\n");
>> +       } else {
>> +               uint32_t clock = read_report_ticks(report, fmt);
>> +
>> +               igt_debug("CLOCK: %"PRIu32"\n", clock);
>> +       }
>> +
>> +       if (intel_gen(devid) >= 8) {
>> +               uint32_t slice_freq, unslice_freq;
>> +               const char *reason = gen8_read_report_reason(report);
>> +
>> +               gen8_read_report_clock_ratios(report, &slice_freq, &unslice_freq);
>> +
>> +               igt_debug("SLICE CLK: %umhz\n", slice_freq);
>> +               igt_debug("UNSLICE CLK: %umhz\n", unslice_freq);
>> +               igt_debug("REASON: \"%s\"\n", reason);
>> +               igt_debug("CTX ID: %"PRIu32"/%"PRIx32"\n", report[2], report[2]);
>> +       }
>> +
>> +       /* Gen8+ has some 40bit A counters... */
>> +       for (int j = 0; j < oa_formats[fmt].n_a40; j++) {
>> +               uint64_t value = gen8_read_40bit_a_counter(report, fmt, j);
>> +
>> +               if (undefined_a_counters[j])
>> +                       continue;
>> +
>> +               igt_debug("A%d: %"PRIu64"\n", j, value);
>> +       }
>> +
>> +       for (int j = 0; j < oa_formats[fmt].n_a; j++) {
>> +               uint32_t *a = (uint32_t *)(((uint8_t *)report) +
>> +                                          oa_formats[fmt].a_off);
>> +               int a_id = oa_formats[fmt].first_a + j;
>> +
>> +               if (undefined_a_counters[a_id])
>> +                       continue;
>> +
>> +               igt_debug("A%d: %"PRIu32"\n", a_id, a[j]);
>> +       }
>> +
>> +       for (int j = 0; j < oa_formats[fmt].n_b; j++) {
>> +               uint32_t *b = (uint32_t *)(((uint8_t *)report) +
>> +                                          oa_formats[fmt].b_off);
>> +
>> +               igt_debug("B%d: %"PRIu32"\n", j, b[j]);
>> +       }
>> +
>> +       for (int j = 0; j < oa_formats[fmt].n_c; j++) {
>> +               uint32_t *c = (uint32_t *)(((uint8_t *)report) +
>> +                                          oa_formats[fmt].c_off);
>> +
>> +               igt_debug("C%d: %"PRIu32"\n", j, c[j]);
>> +       }
>> +}
>> +static void
>>   test_oa_formats(void)
>>   {
>>          for (int i = 0; i < ARRAY_SIZE(oa_formats); i++) {
>> @@ -1415,6 +1643,11 @@ test_oa_exponents(int gt_freq_mhz)
>>                  uint32_t freq;
>>                  int n_tested = 0;
>>                  int n_freq_matches = 0;
>> +               int n_time_delta_matches = 0;
>> +
>> +#warning "XXX: it seems pretty odd that the time delta assertion failures centre around these exponents"
>> +               if (i == 6 || i == 7 || i == 8)
>> +                       continue;
> Why do we need this? Also does this belong in this patch?

Thanks, removing this.

>
>>                  /* The exponent is effectively selecting a bit in the timestamp
>>                   * to trigger reports on and so in practice we expect the raw
>> @@ -1454,15 +1687,19 @@ test_oa_exponents(int gt_freq_mhz)
>>                          timestamp_delta = oa_report1[1] - oa_report0[1];
>>                          igt_assert_neq(timestamp_delta, 0);
>>
>> -                       if (timestamp_delta != expected_timestamp_delta) {
>> -                               igt_debug("timestamp0 = %u/0x%x\n",
>> -                                         oa_report0[1], oa_report0[1]);
>> -                               igt_debug("timestamp1 = %u/0x%x\n",
>> +                       if (timestamp_delta == expected_timestamp_delta)
>> +                               n_time_delta_matches++;
>> +                       else {
>> +                               igt_debug("timestamp delta mismatch: %"PRIu64"ns != expected %"PRIu64"ns, ts0 = %u/0x%x, ts1 = %u/0x%x\n",
>> +                                         timebase_scale(timestamp_delta),
>> +                                         timebase_scale(expected_timestamp_delta),
>> +                                         oa_report0[1], oa_report0[1],
>>                                            oa_report1[1], oa_report1[1]);
>> +                               print_reports(oa_report0, oa_report1, test_oa_format);
>> +                               igt_assert(timestamp_delta <
>> +                                          (expected_timestamp_delta * 2));
>>                          }
>>
>> -                       igt_assert_eq(timestamp_delta, expected_timestamp_delta);
>> -
>>                          ticks0 = read_report_ticks(oa_report0, test_oa_format);
>>                          ticks1 = read_report_ticks(oa_report1, test_oa_format);
>>                          clock_delta = ticks1 - ticks0;
>> @@ -1484,6 +1721,16 @@ test_oa_exponents(int gt_freq_mhz)
>>                          igt_debug("sysfs frequency pinning too unstable for cross-referencing with OA derived frequency");
>>                  igt_assert_eq(n_tested, 10);
>>
>> +               igt_debug("number of iterations with expected timestamp delta = %d\n",
>> +                         n_time_delta_matches);
>> +
>> +               /* The HW doesn't give us any strict guarantee that the
>> +                * timestamps are exactly aligned with the exponent mask but
>> +                * in practice it seems very rare for that not to be the case
>> +                * so it a useful sanity check to assert quite strictly...
>> +                */
>> +               igt_assert(n_time_delta_matches >= 9);
>> +
>>                  igt_debug("number of iterations with expected clock frequency = %d\n",
>>                            n_freq_matches);
>>
>> @@ -2459,14 +2706,8 @@ test_mi_rpc(void)
>>   }
>>
>>   static void
>> -scratch_buf_init(drm_intel_bufmgr *bufmgr,
>> -                struct igt_buf *buf,
>> -                int width, int height,
>> -                uint32_t color)
>> +scratch_buf_memset(drm_intel_bo *bo, int width, int height, uint32_t color)
>>   {
>> -       size_t stride = width * 4;
>> -       size_t size = stride * height;
>> -       drm_intel_bo *bo = drm_intel_bo_alloc(bufmgr, "", size, 4096);
>>          int ret;
>>
>>          ret = drm_intel_bo_map(bo, true /* writable */);
>> @@ -2476,6 +2717,19 @@ scratch_buf_init(drm_intel_bufmgr *bufmgr,
>>                  ((uint32_t *)bo->virtual)[i] = color;
>>
>>          drm_intel_bo_unmap(bo);
>> +}
>> +
>> +static void
>> +scratch_buf_init(drm_intel_bufmgr *bufmgr,
>> +                struct igt_buf *buf,
>> +                int width, int height,
>> +                uint32_t color)
>> +{
>> +       size_t stride = width * 4;
>> +       size_t size = stride * height;
>> +       drm_intel_bo *bo = drm_intel_bo_alloc(bufmgr, "", size, 4096);
>> +
>> +       scratch_buf_memset(bo, width, height, color);
>>
>>          buf->bo = bo;
>>          buf->stride = stride;
>> @@ -2494,14 +2748,25 @@ emit_stall_timestamp_and_rpc(struct intel_batchbuffer *batch,
>>                                     PIPE_CONTROL_RENDER_TARGET_FLUSH |
>>                                     PIPE_CONTROL_WRITE_TIMESTAMP);
>>
>> -       BEGIN_BATCH(5, 1);
>> -       OUT_BATCH(GFX_OP_PIPE_CONTROL | (5 - 2));
>> -       OUT_BATCH(pipe_ctl_flags);
>> -       OUT_RELOC(dst, I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
>> -                 timestamp_offset);
>> -       OUT_BATCH(0); /* imm lower */
>> -       OUT_BATCH(0); /* imm upper */
>> -       ADVANCE_BATCH();
>> +       if (intel_gen(devid) >= 8) {
>> +               BEGIN_BATCH(5, 1);
>> +               OUT_BATCH(GFX_OP_PIPE_CONTROL | (6 - 2));
>> +               OUT_BATCH(pipe_ctl_flags);
>> +               OUT_RELOC(dst, I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
>> +                         timestamp_offset);
>> +               OUT_BATCH(0); /* imm lower */
>> +               OUT_BATCH(0); /* imm upper */
>> +               ADVANCE_BATCH();
>> +       } else {
>> +               BEGIN_BATCH(5, 1);
>> +               OUT_BATCH(GFX_OP_PIPE_CONTROL | (5 - 2));
>> +               OUT_BATCH(pipe_ctl_flags);
>> +               OUT_RELOC(dst, I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
>> +                         timestamp_offset);
>> +               OUT_BATCH(0); /* imm lower */
>> +               OUT_BATCH(0); /* imm upper */
>> +               ADVANCE_BATCH();
>> +       }
>>
>>          emit_report_perf_count(batch, dst, report_dst_offset, report_id);
>>   }
>> @@ -2547,7 +2812,7 @@ hsw_test_single_ctx_counters(void)
>>                  drm_intel_bufmgr *bufmgr;
>>                  drm_intel_context *context0, *context1;
>>                  struct intel_batchbuffer *batch;
>> -               struct igt_buf src, dst;
>> +               struct igt_buf src[3], dst[3];
>>                  drm_intel_bo *bo;
>>                  uint32_t *report0_32, *report1_32;
>>                  uint64_t timestamp0_64, timestamp1_64;
>> @@ -2565,8 +2830,10 @@ hsw_test_single_ctx_counters(void)
>>                  bufmgr = drm_intel_bufmgr_gem_init(drm_fd, 4096);
>>                  drm_intel_bufmgr_gem_enable_reuse(bufmgr);
>>
>> -               scratch_buf_init(bufmgr, &src, width, height, 0xff0000ff);
>> -               scratch_buf_init(bufmgr, &dst, width, height, 0x00ff00ff);
>> +               for (int i = 0; i < ARRAY_SIZE(src); i++) {
>> +                       scratch_buf_init(bufmgr, &src[i], width, height, 0xff0000ff);
>> +                       scratch_buf_init(bufmgr, &dst[i], width, height, 0x00ff00ff);
>> +               }
>>
>>                  batch = intel_batchbuffer_alloc(bufmgr, devid);
>>
>> @@ -2600,14 +2867,19 @@ hsw_test_single_ctx_counters(void)
>>                   */
>>                  render_copy(batch,
>>                              context0,
>> -                           &src, 0, 0, width, height,
>> -                           &dst, 0, 0);
>> +                           &src[0], 0, 0, width, height,
>> +                           &dst[0], 0, 0);
>>
>>                  ret = drm_intel_gem_context_get_id(context0, &ctx_id);
>>                  igt_assert_eq(ret, 0);
>>                  igt_assert_neq(ctx_id, 0xffffffff);
>>                  properties[1] = ctx_id;
>>
>> +               intel_batchbuffer_flush_with_context(batch, context0);
>> +
>> +               scratch_buf_memset(src[0].bo, width, height, 0xff0000ff);
>> +               scratch_buf_memset(dst[0].bo, width, height, 0x00ff00ff);
>> +
>>                  igt_debug("opening i915-perf stream\n");
>>                  stream_fd = __perf_open(drm_fd, &param);
>>
>> @@ -2634,8 +2906,8 @@ hsw_test_single_ctx_counters(void)
>>
>>                  render_copy(batch,
>>                              context0,
>> -                           &src, 0, 0, width, height,
>> -                           &dst, 0, 0);
>> +                           &src[0], 0, 0, width, height,
>> +                           &dst[0], 0, 0);
>>
>>                  /* Another redundant flush to clarify batch bo is free to reuse */
>>                  intel_batchbuffer_flush_with_context(batch, context0);
>> @@ -2646,13 +2918,13 @@ hsw_test_single_ctx_counters(void)
>>                   */
>>                  render_copy(batch,
>>                              context1,
>> -                           &src, 0, 0, width, height,
>> -                           &dst, 0, 0);
>> +                           &src[1], 0, 0, width, height,
>> +                           &dst[1], 0, 0);
>>
>>                  render_copy(batch,
>>                              context1,
>> -                           &src, 0, 0, width, height,
>> -                           &dst, 0, 0);
>> +                           &src[2], 0, 0, width, height,
>> +                           &dst[2], 0, 0);
>>
>>                  /* And another */
>>                  intel_batchbuffer_flush_with_context(batch, context1);
>> @@ -2681,6 +2953,7 @@ hsw_test_single_ctx_counters(void)
>>
>>                  /* A40 == N samples written to all render targets */
>>                  n_samples_written = report1_32[43] - report0_32[43];
>> +
>>                  igt_debug("n samples written = %d\n", n_samples_written);
>>                  igt_assert_eq(n_samples_written, width * height);
>>
>> @@ -2715,8 +2988,10 @@ hsw_test_single_ctx_counters(void)
>>                          (delta_oa32_ns - delta_ts64_ns);
>>                  igt_assert(delta_delta <= 320);
>>
>> -               drm_intel_bo_unreference(src.bo);
>> -               drm_intel_bo_unreference(dst.bo);
>> +               for (int i = 0; i < ARRAY_SIZE(src); i++) {
>> +                       drm_intel_bo_unreference(src[i].bo);
>> +                       drm_intel_bo_unreference(dst[i].bo);
>> +               }
>>
>>                  drm_intel_bo_unmap(bo);
>>                  drm_intel_bo_unreference(bo);
>> @@ -2730,6 +3005,454 @@ hsw_test_single_ctx_counters(void)
>>          igt_waitchildren();
>>   }
>>
>> +/* Tests the INTEL_performance_query use case where an unprivileged process
>> + * should be able to configure the OA unit for per-context metrics (for a
> Smells like gen7...

You can do per context filtering on gen8 without root permissions. Just 
needs :

sudo sysctl dev.i915.perf_stream_paranoid=0

>
>> + * context associated with that process' drm file descriptor) and the counters
>> + * should only relate to that specific context.
>> + *
>> + * For Gen8+ although reports read via i915 perf can be filtered for a single
>> + * context the counters themselves always progress as global/system-wide
>> + * counters affected by all contexts. To support the INTEL_performance_query
>> + * use case on Gen8+ it's necessary to combine OABUFFER and
>> + * MI_REPORT_PERF_COUNT reports so that counter normalisation can take into
>> + * account context-switch reports and factor out any counter progression not
>> + * associated with the current context.
>> + */
>> +static void
>> +gen8_test_single_ctx_render_target_writes_a_counter(void)
>> +{
>> +       int oa_exponent = max_oa_exponent_for_period_lte(1000000);
>> +       uint64_t properties[] = {
>> +               DRM_I915_PERF_PROP_CTX_HANDLE, UINT64_MAX, /* updated below */
>> +
>> +               /* Note: we have to specify at least one sample property even
>> +                * though we aren't interested in samples in this case
>> +                */
>> +               DRM_I915_PERF_PROP_SAMPLE_OA, true,
>> +
>> +               /* OA unit configuration */
>> +               DRM_I915_PERF_PROP_OA_METRICS_SET, test_metric_set_id,
>> +               DRM_I915_PERF_PROP_OA_FORMAT, test_oa_format,
>> +               DRM_I915_PERF_PROP_OA_EXPONENT, oa_exponent,
>> +
>> +               /* Note: no OA exponent specified in this case */
>> +       };
>> +       struct drm_i915_perf_open_param param = {
>> +               .flags = I915_PERF_FLAG_FD_CLOEXEC,
>> +               .num_properties = ARRAY_SIZE(properties) / 2,
>> +               .properties_ptr = to_user_pointer(properties),
>> +       };
>> +       size_t format_size = oa_formats[test_oa_format].size;
>> +       size_t sample_size = (sizeof(struct drm_i915_perf_record_header) +
>> +                             format_size);
>> +       int max_reports = (16 * 1024 * 1024) / format_size;
>> +       int buf_size = sample_size * max_reports * 1.5;
>> +       int child_ret;
>> +       uint8_t *buf = malloc(buf_size);
>> +       ssize_t len;
>> +       struct igt_helper_process child = {};
>> +
>> +       /* should be default, but just to be sure... */
>> +       write_u64_file("/proc/sys/dev/i915/perf_stream_paranoid", 1);
>> +
>> +       do {
>> +
>> +               igt_fork_helper(&child) {
> For this test why do we need concern ourselves with spawning a child
> process, it's not like we need to drop root?

Funny bit :

Rendercopy appears to fail sporadically (look for exit(EAGAIN))...

I didn't get to the bottom of this (a guess is that we're not 
configuring a unit properly, depth related stuff maybe?), so this test 
retries until it figures that work actually happened.
Hence the fork that is helpful for cleanup.

>
>> +                       struct drm_i915_perf_record_header *header;
>> +                       drm_intel_bufmgr *bufmgr;
>> +                       drm_intel_context *context0, *context1;
>> +                       struct intel_batchbuffer *batch;
>> +                       struct igt_buf src[3], dst[3];
>> +                       drm_intel_bo *bo;
>> +                       uint32_t *report0_32, *report1_32;
>> +                       uint32_t *prev, *lprev = NULL;
>> +                       uint64_t timestamp0_64, timestamp1_64;
>> +                       uint32_t delta_ts64, delta_oa32;
>> +                       uint64_t delta_ts64_ns, delta_oa32_ns;
>> +                       uint32_t delta_delta;
>> +                       int width = 800;
>> +                       int height = 600;
>> +                       uint32_t ctx_id = 0xffffffff; /* invalid handle */
>> +                       uint32_t ctx1_id = 0xffffffff;  /* invalid handle */
>> +                       uint32_t current_ctx_id = 0xffffffff;
>> +                       uint32_t n_invalid_ctx = 0;
>> +                       int ret;
>> +                       struct accumulator accumulator = {
>> +                               .format = test_oa_format
>> +                       };
>> +
>> +                       //igt_drop_root();
> Commented out code, elsewhere also...

This should go. It's already tested elsewhere.

>
>> +
>> +                       bufmgr = drm_intel_bufmgr_gem_init(drm_fd, 4096);
>> +                       drm_intel_bufmgr_gem_enable_reuse(bufmgr);
>> +
>> +                       for (int i = 0; i < ARRAY_SIZE(src); i++) {
>> +                               scratch_buf_init(bufmgr, &src[i], width, height, 0xff0000ff);
>> +                               scratch_buf_init(bufmgr, &dst[i], width, height, 0x00ff00ff);
>> +                       }
>> +
>> +                       batch = intel_batchbuffer_alloc(bufmgr, devid);
>> +
>> +                       context0 = drm_intel_gem_context_create(bufmgr);
>> +                       igt_assert(context0);
>> +
>> +                       context1 = drm_intel_gem_context_create(bufmgr);
>> +                       igt_assert(context1);
>> +
>> +                       igt_debug("submitting warm up render_copy\n");
>> +
>> +                       /* Submit some early, unmeasured, work to the context we want
>> +                        * to measure to try and catch issues with i915-perf
>> +                        * initializing the HW context ID for filtering.
>> +                        *
>> +                        * We do this because i915-perf single context filtering had
>> +                        * previously only relied on a hook into context pinning to
>> +                        * initialize the HW context ID, instead of also trying to
>> +                        * determine the HW ID while opening the stream, in case it
>> +                        * has already been pinned.
>> +                        *
>> +                        * This wasn't noticed by the previous unit test because we
>> +                        * were opening the stream while the context hadn't been
>> +                        * touched or pinned yet and so it worked out correctly to wait
>> +                        * for the pinning hook.
>> +                        *
>> +                        * Now a buggy version of i915-perf will fail to measure
>> +                        * anything for context0 once this initial render_copy() ends
>> +                        * up pinning the context since there won't ever be a pinning
>> +                        * hook callback.
>> +                        */
>> +                       render_copy(batch,
>> +                                   context0,
>> +                                   &src[0], 0, 0, width, height,
>> +                                   &dst[0], 0, 0);
>> +
>> +                       ret = drm_intel_gem_context_get_id(context0, &ctx_id);
>> +                       igt_assert_eq(ret, 0);
>> +                       igt_assert_neq(ctx_id, 0xffffffff);
>> +                       properties[1] = ctx_id;
>> +
>> +                       scratch_buf_memset(src[0].bo, width, height, 0xff0000ff);
>> +                       scratch_buf_memset(dst[0].bo, width, height, 0x00ff00ff);
>> +
>> +                       igt_debug("opening i915-perf stream\n");
>> +                       stream_fd = __perf_open(drm_fd, &param);
>> +
>> +                       bo = drm_intel_bo_alloc(bufmgr, "mi_rpc dest bo", 4096, 64);
>> +
>> +                       ret = drm_intel_bo_map(bo, true /* write enable */);
>> +                       igt_assert_eq(ret, 0);
>> +
>> +                       memset(bo->virtual, 0x80, 4096);
>> +                       drm_intel_bo_unmap(bo);
>> +
>> +                       emit_stall_timestamp_and_rpc(batch,
>> +                                                    bo,
>> +                                                    512 /* timestamp offset */,
>> +                                                    0, /* report dst offset */
>> +                                                    0xdeadbeef); /* report id */
>> +
>> +                       /* Explicitly flush here (even though the render_copy() call
>> +                        * will itself flush before/after the copy) to clarify that
>> +                        * that the PIPE_CONTROL + MI_RPC commands will be in a
>> +                        * separate batch from the copy.
>> +                        */
>> +                       intel_batchbuffer_flush_with_context(batch, context0);
>> +
>> +                       render_copy(batch,
>> +                                   context0,
>> +                                   &src[0], 0, 0, width, height,
>> +                                   &dst[0], 0, 0);
>> +
>> +                       /* Another redundant flush to clarify batch bo is free to reuse */
>> +                       intel_batchbuffer_flush_with_context(batch, context0);
>> +
>> +                       /* submit two copies on the other context to avoid a false
>> +                        * positive in case the driver somehow ended up filtering for
>> +                        * context1
>> +                        */
>> +                       render_copy(batch,
>> +                                   context1,
>> +                                   &src[1], 0, 0, width, height,
>> +                                   &dst[1], 0, 0);
>> +
>> +                       ret = drm_intel_gem_context_get_id(context1, &ctx1_id);
>> +                       igt_assert_eq(ret, 0);
>> +                       igt_assert_neq(ctx1_id, 0xffffffff);
>> +
>> +                       render_copy(batch,
>> +                                   context1,
>> +                                   &src[2], 0, 0, width, height,
>> +                                   &dst[2], 0, 0);
>> +
>> +                       /* And another */
>> +                       intel_batchbuffer_flush_with_context(batch, context1);
>> +
>> +                       emit_stall_timestamp_and_rpc(batch,
>> +                                                    bo,
>> +                                                    520 /* timestamp offset */,
>> +                                                    256, /* report dst offset */
>> +                                                    0xbeefbeef); /* report id */
>> +
>> +                       intel_batchbuffer_flush_with_context(batch, context1);
>> +
>> +                       ret = drm_intel_bo_map(bo, false /* write enable */);
>> +                       igt_assert_eq(ret, 0);
>> +
>> +                       report0_32 = bo->virtual;
>> +                       igt_assert_eq(report0_32[0], 0xdeadbeef); /* report ID */
>> +                       igt_assert_neq(report0_32[1], 0); /* timestamp */
>> +                       //report0_32[2] = 0xffffffff;
>> +                       prev = report0_32;
>> +                       ctx_id = prev[2];
>> +                       igt_debug("MI_RPC(start) CTX ID: %u\n", ctx_id);
>> +
>> +                       report1_32 = report0_32 + 64; /* 64 uint32_t = 256bytes offset */
>> +                       igt_assert_eq(report1_32[0], 0xbeefbeef); /* report ID */
>> +                       igt_assert_neq(report1_32[1], 0); /* timestamp */
>> +                       //report1_32[2] = 0xffffffff;
>> +                       ctx1_id = report1_32[2];
>> +
>> +                       memset(accumulator.deltas, 0, sizeof(accumulator.deltas));
>> +                       accumulate_reports(&accumulator, report0_32, report1_32);
>> +                       igt_debug("total: A0 = %lu, A21 = %lu, A26 = %lu\n",
>> +                                 accumulator.deltas[2 + 0], /* skip timestamp + clock cycles */
>> +                                 accumulator.deltas[2 + 21],
>> +                                 accumulator.deltas[2 + 26]);
>> +
>> +                       igt_debug("oa_timestamp32 0 = %u\n", report0_32[1]);
>> +                       igt_debug("oa_timestamp32 1 = %u\n", report1_32[1]);
>> +                       igt_debug("ctx_id 0 = %u\n", report0_32[2]);
>> +                       igt_debug("ctx_id 1 = %u\n", report1_32[2]);
>> +
>> +                       timestamp0_64 = *(uint64_t *)(((uint8_t *)bo->virtual) + 512);
>> +                       timestamp1_64 = *(uint64_t *)(((uint8_t *)bo->virtual) + 520);
>> +
>> +                       igt_debug("ts_timestamp64 0 = %"PRIu64"\n", timestamp0_64);
>> +                       igt_debug("ts_timestamp64 1 = %"PRIu64"\n", timestamp1_64);
>> +
>> +                       delta_ts64 = timestamp1_64 - timestamp0_64;
>> +                       delta_oa32 = report1_32[1] - report0_32[1];
>> +
>> +                       /* sanity check that we can pass the delta to timebase_scale */
>> +                       igt_assert(delta_ts64 < UINT32_MAX);
>> +                       delta_oa32_ns = timebase_scale(delta_oa32);
>> +                       delta_ts64_ns = timebase_scale(delta_ts64);
>> +
>> +                       igt_debug("oa32 delta = %u, = %uns\n",
>> +                                 delta_oa32, (unsigned)delta_oa32_ns);
>> +                       igt_debug("ts64 delta = %u, = %uns\n",
>> +                                 delta_ts64, (unsigned)delta_ts64_ns);
>> +
>> +                       /* The delta as calculated via the PIPE_CONTROL timestamp or
>> +                        * the OA report timestamps should be almost identical but
>> +                        * allow a 320 nanoseconds margin.
> Looks like 500ns.

Indeed!

>
>> +                        */
>> +                       delta_delta = delta_ts64_ns > delta_oa32_ns ?
>> +                               (delta_ts64_ns - delta_oa32_ns) :
>> +                               (delta_oa32_ns - delta_ts64_ns);
>> +                       if (delta_delta > 500) {
>> +                               igt_debug("skipping\n");
>> +                               exit(EAGAIN);
>> +                       }
>> +
>> +                       len = i915_read_reports_until_timestamp(test_oa_format,
>> +                                                               buf, buf_size,
>> +                                                               report0_32[1],
>> +                                                               report1_32[1]);
>> +
>> +                       igt_assert(len > 0);
>> +                       igt_debug("read %d bytes\n", (int)len);
>> +
>> +                       memset(accumulator.deltas, 0, sizeof(accumulator.deltas));
>> +
>> +                       for (size_t offset = 0; offset < len; offset += header->size) {
>> +                               uint32_t *report;
>> +                               uint32_t reason;
>> +                               const char *skip_reason = NULL, *report_reason = NULL;
>> +                               struct accumulator laccumulator = {
>> +                                       .format = test_oa_format
>> +                               };
>> +
>> +
>> +                               header = (void *)(buf + offset);
>> +
>> +                               igt_assert_eq(header->pad, 0); /* Reserved */
>> +
>> +                               /* Currently the only test that should ever expect to
>> +                                * see a _BUFFER_LOST error is the buffer_fill test,
>> +                                * otherwise something bad has probably happened...
>> +                                */
>> +                               igt_assert_neq(header->type, DRM_I915_PERF_RECORD_OA_BUFFER_LOST);
>> +
>> +                               /* At high sampling frequencies the OA HW might not be
>> +                                * able to cope with all write requests and will notify
>> +                                * us that a report was lost.
>> +                                *
>> +                                * XXX: we should maybe restart the test in this case?
>> +                                */
>> +                               if (header->type == DRM_I915_PERF_RECORD_OA_REPORT_LOST) {
>> +                                       igt_debug("OA trigger collision / report lost\n");
>> +                                       exit(EAGAIN);
>> +                               }
>> +
>> +                               /* Currently the only other record type expected is a
>> +                                * _SAMPLE. Notably this test will need updating if
>> +                                * i915-perf is extended in the future with additional
>> +                                * record types.
>> +                                */
>> +                               igt_assert_eq(header->type, DRM_I915_PERF_RECORD_SAMPLE);
>> +
>> +                               igt_assert_eq(header->size, sample_size);
>> +
>> +                               report = (void *)(header + 1);
>> +
>> +                               /* Don't expect zero for timestamps */
>> +                               igt_assert_neq(report[1], 0);
>> +
>> +                               igt_debug("report %p:\n", report);
>> +
>> +                               /* Discard reports not contained in between the
>> +                                * timestamps we're looking at. */
>> +                               {
>> +                                       uint32_t time_delta = report[1] - report0_32[1];
>> +
>> +                                       if (timebase_scale(time_delta) > 1000000000) {
>> +                                               skip_reason = "prior first mi-rpc";
>> +                                       }
>> +                               }
>> +
>> +                               {
>> +                                       uint32_t time_delta = report[1] - report1_32[1];
>> +
>> +                                       if (timebase_scale(time_delta) <= 1000000000) {
>> +                                               igt_debug("    comes after last MI_RPC (%u)\n",
>> +                                                         report1_32[1]);
>> +                                               report = report1_32;
>> +                                       }
>> +                               }
>> +
>> +                               /* Print out deltas for a few significant
>> +                                * counters for each report. */
>> +                               if (lprev) {
>> +                                       memset(laccumulator.deltas, 0, sizeof(laccumulator.deltas));
>> +                                       accumulate_reports(&laccumulator, lprev, report);
>> +                                       igt_debug("    deltas: A0=%lu A21=%lu, A26=%lu\n",
>> +                                                 laccumulator.deltas[2 + 0], /* skip timestamp + clock cycles */
>> +                                                 laccumulator.deltas[2 + 21],
>> +                                                 laccumulator.deltas[2 + 26]);
>> +                               }
>> +                               lprev = report;
>> +
>> +                               /* Print out reason for the report. */
>> +                               reason = ((report[0] >> OAREPORT_REASON_SHIFT) &
>> +                                         OAREPORT_REASON_MASK);
>> +
>> +                               if (reason & OAREPORT_REASON_CTX_SWITCH) {
>> +                                       report_reason = "ctx-load";
>> +                               } else if (reason & OAREPORT_REASON_TIMER) {
>> +                                       report_reason = "timer";
>> +                               } else if (reason & OAREPORT_REASON_INTERNAL ||
>> +                                          reason & OAREPORT_REASON_GO ||
>> +                                          reason & OAREPORT_REASON_CLK_RATIO) {
>> +                                       report_reason = "internal/go/clk-ratio";
>> +                               } else {
>> +                                       report_reason = "end-mi-rpc";
>> +                               }
>> +                               igt_debug("    ctx_id=%u/%x reason=%s oa_timestamp32=%u\n",
>> +                                         report[2], report[2], report_reason, report[1]);
>> +
>> +                               /* Should we skip this report?
>> +                                *
>> +                                *   Only if the current context id of
>> +                                *   the stream is not the one we want
>> +                                *   to measure.
>> +                                */
>> +                               if (current_ctx_id != ctx_id) {
>> +                                       skip_reason = "not our context";
>> +                               }
>> +
>> +                               if (n_invalid_ctx > 1) {
>> +                                       skip_reason = "too many invalid context events";
>> +                               }
>> +
>> +                               if (!skip_reason) {
>> +                                       accumulate_reports(&accumulator, prev, report);
>> +                                       igt_debug(" -> Accumulated deltas A0=%lu A21=%lu, A26=%lu\n",
>> +                                                 accumulator.deltas[2 + 0], /* skip timestamp + clock cycles */
>> +                                                 accumulator.deltas[2 + 21],
>> +                                                 accumulator.deltas[2 + 26]);
>> +                               } else {
>> +                                       igt_debug(" -> Skipping: %s\n", skip_reason);
>> +                               }
>> +
>> +
>> +                               /* Finally update current-ctx_id, only possible
>> +                                * with a valid contextOB id. */
> What's a contextOB?

typo :(

>
>> +                               if (oa_report_ctx_is_valid(report)) {
>> +                                       current_ctx_id = report[2];
>> +                                       n_invalid_ctx = 0;
>> +                               } else {
>> +                                       n_invalid_ctx++;
>> +                               }
>> +
>> +                               prev = report;
>> +
>> +                               if (report == report1_32) {
>> +                                       igt_debug("Breaking on end of report\n");
>> +                                       print_reports(report0_32, report1_32,
>> +                                                     lookup_format(test_oa_format));
>> +                                       break;
>> +                               }
>> +                       }
>> +
>> +                       igt_debug("n samples written = %ld/%lu (%ix%i)\n",
>> +                                 accumulator.deltas[2 + 21],/* skip timestamp + clock cycles */
>> +                                 accumulator.deltas[2 + 26],
>> +                                 width, height);
>> +                       accumulator_print(&accumulator, "filtered");
>> +
>> +                       ret = drm_intel_bo_map(src[0].bo, false /* write enable */);
>> +                       igt_assert_eq(ret, 0);
>> +                       ret = drm_intel_bo_map(dst[0].bo, false /* write enable */);
>> +                       igt_assert_eq(ret, 0);
>> +
>> +                       ret = memcmp(src[0].bo->virtual, dst[0].bo->virtual, 4 * width * height);
>> +                       if (ret != 0) {
>> +                               accumulator_print(&accumulator, "total");
>> +                               /* This needs to be investigated... From time
>> +                                * to time, the work we kick off doesn't seem
>> +                                * to happen. WTH?? */
>> +                               exit(EAGAIN);
>> +                       }
>> +                       //igt_assert_eq(ret, 0);
>> +
>> +                       drm_intel_bo_unmap(src[0].bo);
>> +                       drm_intel_bo_unmap(dst[0].bo);
>> +
>> +                       igt_assert_eq(accumulator.deltas[2 + 26], width * height);
>> +
>> +                       for (int i = 0; i < ARRAY_SIZE(src); i++) {
>> +                               drm_intel_bo_unreference(src[i].bo);
>> +                               drm_intel_bo_unreference(dst[i].bo);
>> +                       }
>> +
>> +                       drm_intel_bo_unmap(bo);
>> +                       drm_intel_bo_unreference(bo);
>> +                       intel_batchbuffer_free(batch);
>> +                       drm_intel_gem_context_destroy(context0);
>> +                       drm_intel_gem_context_destroy(context1);
>> +                       drm_intel_bufmgr_destroy(bufmgr);
>> +                       __perf_close(stream_fd);
>> +               }
>> +
>> +               child_ret = igt_wait_helper(&child);
>> +
>> +               igt_assert(WEXITSTATUS(child_ret) == EAGAIN ||
>> +                          WEXITSTATUS(child_ret) == 0);
>> +
>> +       } while (WEXITSTATUS(child_ret) == EAGAIN);
>> +}
>> +
>>   static void
>>   test_rc6_disable(void)
>>   {
>> @@ -3264,8 +3987,10 @@ igt_main
>>                  test_oa_exponents(550);
>>          }
>>
>> -       igt_subtest("per-context-mode-unprivileged")
>> +       igt_subtest("per-context-mode-unprivileged") {
>> +               igt_require(IS_HASWELL(devid));
>>                  test_per_context_mode_unprivileged();
>> +       }
>>
>>          igt_subtest("buffer-fill")
>>                  test_buffer_fill();
>> @@ -3290,15 +4015,20 @@ igt_main
>>          igt_subtest("mi-rpc")
>>                  test_mi_rpc();
>>
>> -       igt_subtest("unprivileged-singled-ctx-counters") {
>> +       igt_subtest("unprivileged-single-ctx-counters") {
>> +               igt_require(IS_HASWELL(devid));
>> +               hsw_test_single_ctx_counters();
>> +       }
>> +
>> +       igt_subtest("gen8-unprivileged-single-ctx-counters") {
> Did you mean -privileged-single-ctx-counters, we require root to
> configure the OA unit for gen8+, no?

You can avoid root by running :

sudo sysctl dev.i915.perf_stream_paranoid=0

>
>>                  /* For Gen8+ the OA unit can no longer be made to clock gate
>>                   * for a specific context. Additionally the partial-replacement
>>                   * functionality to HW filter timer reports for a specific
>>                   * context (SKL+) can't stop multiple applications viewing
>>                   * system-wide data via MI_REPORT_PERF_COUNT commands.
>>                   */
>> -               igt_require(IS_HASWELL(devid));
>> -               hsw_test_single_ctx_counters();
>> +               igt_require(intel_gen(devid) >= 8);
>> +               gen8_test_single_ctx_render_target_writes_a_counter();
>>          }
>>
>>          igt_subtest("rc6-disable")
>> --
>> 2.13.3
>>
>> _______________________________________________
>> Intel-gfx mailing list
>> Intel-gfx at lists.freedesktop.org
>> https://lists.freedesktop.org/mailman/listinfo/intel-gfx