[Mesa-dev] [RFC 6/6] i965: Adds further support for "3D" OA counters

Robert Bragg robert at sixbynine.org
Tue May 5 17:53:54 PDT 2015


This uses the i915_oa '3D' metric set to expose many more interesting OA
counters including information about depth, alpha and stencil testing,
sampler usage/bottleneck stats and cache throughputs.

Signed-off-by: Robert Bragg <robert at sixbynine.org>
---
 src/mesa/drivers/dri/i965/brw_performance_query.c | 402 +++++++++++++++++++++-
 1 file changed, 401 insertions(+), 1 deletion(-)

diff --git a/src/mesa/drivers/dri/i965/brw_performance_query.c b/src/mesa/drivers/dri/i965/brw_performance_query.c
index bfe39f9..db915cd 100644
--- a/src/mesa/drivers/dri/i965/brw_performance_query.c
+++ b/src/mesa/drivers/dri/i965/brw_performance_query.c
@@ -1313,7 +1313,7 @@ brw_delete_perf_query(struct gl_context *ctx,
 
 /******************************************************************************/
 
-/* Type safe wrapper for reading OA counter values */
+/* Type safe wrappers for reading OA counter values */
 
 static uint64_t
 read_uint64_oa_counter(struct brw_oa_counter *counter, uint64_t *accumulated)
@@ -1327,6 +1327,18 @@ read_uint64_oa_counter(struct brw_oa_counter *counter, uint64_t *accumulated)
    return value;
 }
 
+static float
+read_float_oa_counter(struct brw_oa_counter *counter, uint64_t *accumulated)
+{
+   float value;
+   /* Reads a counter whose declared data type is float via its read() hook. */
+   assert(counter->data_type == GL_PERFQUERY_COUNTER_DATA_FLOAT_INTEL);
+   /* read() stores through an untyped pointer, hence the type assert above. */
+   counter->read(counter, accumulated, &value);
+
+   return value;
+}
+
 /******************************************************************************/
 
 /*
@@ -1467,6 +1479,71 @@ add_oa_counter_normalised_by_gpu_duration(struct brw_query_builder *builder,
 }
 
 static void
+read_hsw_samplers_busy_duration_cb(struct brw_oa_counter *counter,
+                                   uint64_t *accumulated,
+                                   void *value) /* out: float percentage */
+{
+   uint64_t sampler0_busy = read_uint64_oa_counter(counter->reference0, accumulated);
+   uint64_t sampler1_busy = read_uint64_oa_counter(counter->reference1, accumulated);
+   uint64_t clk_delta = read_uint64_oa_counter(counter->reference2, accumulated);
+   float *ret = value;
+   /* A zero clock delta would divide by zero below, so report 0 instead. */
+   if (!clk_delta) {
+      *ret = 0;
+      return;
+   }
+   /* Summed busy counts as a percentage of twice the clock delta (two samplers). */
+   *ret = ((double)(sampler0_busy + sampler1_busy) * 100.0) / ((double)clk_delta * 2.0);
+}
+
+static struct brw_oa_counter *
+add_hsw_samplers_busy_duration_oa_counter(struct brw_query_builder *builder,
+                                          struct brw_oa_counter *sampler0_busy_raw,
+                                          struct brw_oa_counter *sampler1_busy_raw)
+{
+   struct brw_oa_counter *counter =
+      &builder->query->oa_counters[builder->query->n_oa_counters++];
+   /* Takes the next free oa_counters slot; inputs are read back by the callback. */
+   counter->reference0 = sampler0_busy_raw;
+   counter->reference1 = sampler1_busy_raw;
+   counter->reference2 = builder->gpu_core_clock; /* divisor in the callback */
+   counter->read = read_hsw_samplers_busy_duration_cb;
+   counter->data_type = GL_PERFQUERY_COUNTER_DATA_FLOAT_INTEL;
+
+   return counter;
+}
+
+static void
+read_hsw_slice_extrapolated_cb(struct brw_oa_counter *counter,
+                               uint64_t *accumulated,
+                               void *value) /* uint64 */
+{
+   uint64_t counter0 = read_uint64_oa_counter(counter->reference0, accumulated);
+   uint64_t counter1 = read_uint64_oa_counter(counter->reference1, accumulated);
+   int eu_count = counter->config;
+   uint64_t *ret = value;
+   /* Sum of both inputs multiplied by the EU count stashed in config. */
+   *ret = (counter0 + counter1) * eu_count;
+}
+
+static struct brw_oa_counter *
+add_hsw_slice_extrapolated_oa_counter(struct brw_query_builder *builder,
+                                      struct brw_oa_counter *counter0,
+                                      struct brw_oa_counter *counter1)
+{
+   struct brw_oa_counter *counter =
+      &builder->query->oa_counters[builder->query->n_oa_counters++];
+   /* Derived uint64 counter: (counter0 + counter1) multiplied by the EU count. */
+   counter->reference0 = counter0;
+   counter->reference1 = counter1;
+   counter->config = builder->brw->perfquery.eu_count; /* multiplier for the callback */
+   counter->read = read_hsw_slice_extrapolated_cb;
+   counter->data_type = GL_PERFQUERY_COUNTER_DATA_UINT64_INTEL;
+
+   return counter;
+}
+
+static void
 read_oa_counter_normalized_by_eu_duration_cb(struct brw_oa_counter *counter,
                                              uint64_t *accumulated,
                                              void *value) /* float */
@@ -1535,6 +1612,63 @@ add_average_thread_cycles_oa_counter(struct brw_query_builder *builder,
 }
 
 static void
+read_scaled_uint64_counter_cb(struct brw_oa_counter *counter,
+                              uint64_t *accumulated,
+                              void *value) /* uint64 */
+{
+   uint64_t delta = read_uint64_oa_counter(counter->reference0, accumulated);
+   uint64_t scale = counter->config;
+   uint64_t *ret = value;
+   /* Input counter multiplied by the constant factor stored in config. */
+   *ret = delta * scale;
+}
+
+static struct brw_oa_counter *
+add_scaled_uint64_oa_counter(struct brw_query_builder *builder,
+                             struct brw_oa_counter *input,
+                             int scale)
+{
+   struct brw_oa_counter *counter =
+      &builder->query->oa_counters[builder->query->n_oa_counters++];
+   /* Derived uint64 counter reporting 'input' multiplied by 'scale'. */
+   counter->reference0 = input;
+   counter->config = scale; /* read back as the multiplier by the callback */
+   counter->read = read_scaled_uint64_counter_cb;
+   counter->data_type = GL_PERFQUERY_COUNTER_DATA_UINT64_INTEL;
+
+   return counter;
+}
+
+static void
+read_max_of_float_counters_cb(struct brw_oa_counter *counter,
+                              uint64_t *accumulated,
+                              void *value) /* float */
+{
+   float counter0 = read_float_oa_counter(counter->reference0, accumulated);
+   float counter1 = read_float_oa_counter(counter->reference1, accumulated);
+   float *ret = value;
+   /* Report whichever of the two float inputs is larger. */
+   *ret = counter0 >= counter1 ? counter0 : counter1;
+}
+
+
+static struct brw_oa_counter *
+add_max_of_float_oa_counters(struct brw_query_builder *builder,
+                             struct brw_oa_counter *counter0,
+                             struct brw_oa_counter *counter1)
+{
+   struct brw_oa_counter *counter =
+      &builder->query->oa_counters[builder->query->n_oa_counters++];
+   /* Derived float counter; both inputs are read as floats by the callback. */
+   counter->reference0 = counter0;
+   counter->reference1 = counter1;
+   counter->read = read_max_of_float_counters_cb;
+   counter->data_type = GL_PERFQUERY_COUNTER_DATA_FLOAT_INTEL;
+
+   return counter;
+}
+
+static void
 report_uint64_oa_counter_as_raw_uint64(struct brw_query_builder *builder,
                                        const char *name,
                                        const char *desc,
@@ -1597,6 +1731,26 @@ report_float_oa_counter_as_percentage_duration(struct brw_query_builder *builder
 }
 
 static void
+report_uint64_oa_counter_as_throughput(struct brw_query_builder *builder,
+                                       const char *name,
+                                       const char *desc,
+                                       struct brw_oa_counter *oa_counter)
+{
+   struct brw_perf_query_counter *counter =
+      &builder->query->counters[builder->query->n_counters++];
+   /* Publishes oa_counter as a uint64 counter of type THROUGHPUT. */
+   counter->oa_counter = oa_counter;
+   counter->name = name;
+   counter->desc = desc;
+   counter->type = GL_PERFQUERY_COUNTER_THROUGHPUT_INTEL;
+   counter->data_type = GL_PERFQUERY_COUNTER_DATA_UINT64_INTEL;
+   counter->offset = pot_align(builder->offset, 8); /* presumably rounds up to 8 bytes */
+   counter->size = sizeof(uint64_t);
+   /* The next reported counter starts right after this one's data. */
+   builder->offset = counter->offset + counter->size;
+}
+
+static void
 report_uint64_oa_counter_as_duration(struct brw_query_builder *builder,
                                      const char *name,
                                      const char *desc,
@@ -1806,6 +1960,251 @@ hsw_add_basic_oa_counter_query(struct brw_context *brw)
    query->data_size = last->offset + last->size;
 }
 
+static void
+hsw_add_3d_oa_counter_query(struct brw_context *brw)
+{
+   struct brw_query_builder builder;
+   struct brw_perf_query *query =
+      &brw->perfquery.queries[brw->perfquery.n_queries++];
+   int a_offset;
+   int b_offset;
+   int c_offset;
+   struct brw_oa_counter *elapsed;
+   struct brw_oa_counter *raw;
+   struct brw_oa_counter *c;
+   struct brw_oa_counter *sampler0_busy_raw;
+   struct brw_oa_counter *sampler1_busy_raw;
+   struct brw_oa_counter *sampler0_bottleneck;
+   struct brw_oa_counter *sampler1_bottleneck;
+   struct brw_oa_counter *sampler0_texels;
+   struct brw_oa_counter *sampler1_texels;
+   struct brw_oa_counter *sampler0_l1_misses;
+   struct brw_oa_counter *sampler1_l1_misses;
+   struct brw_oa_counter *sampler_l1_misses;
+   struct brw_perf_query_counter *last;
+   /* Appends one more OA query to brw->perfquery, using the '3D' metric set. */
+   query->kind = OA_COUNTERS;
+   query->name = "Gen7 3D Observability Architecture Counters";
+   query->counters = rzalloc_array(brw, struct brw_perf_query_counter,
+                                   MAX_PERF_QUERY_COUNTERS);
+   query->n_counters = 0;
+   query->oa_counters = rzalloc_array(brw, struct brw_oa_counter,
+                                      MAX_OA_QUERY_COUNTERS);
+   query->n_oa_counters = 0;
+   query->oa_metrics_set = I915_OA_METRICS_SET_3D;
+   query->oa_format = I915_OA_FORMAT_A45_B8_C8_HSW;
+
+   builder.brw = brw;
+   builder.query = query;
+   builder.offset = 0;
+   builder.next_accumulator_index = 0;
+
+   /* A counters offset = 12  bytes / 0x0c (45 A counters)
+    * B counters offset = 192 bytes / 0xc0 (8  B counters)
+    * C counters offset = 224 bytes / 0xe0 (8  C counters)
+    *
+    * Note: we index into the snapshots/reports as arrays of uint32 values
+    * relative to the A/B/C offset. Report layouts differ in how many A/B/C
+    * counters they carry, so relative addressing should make it possible to
+    * re-use the code describing the available counters across different
+    * report layouts.
+    */
+
+   builder.a_offset = a_offset = 3; /* uint32 index, i.e. 12 bytes / 4 */
+   builder.b_offset = b_offset = a_offset + 45;
+   builder.c_offset = c_offset = b_offset + 8;
+
+   /* Can be referenced by other counters... */
+   builder.gpu_core_clock = add_raw_oa_counter(&builder, c_offset + 2);
+
+   elapsed = add_hsw_elapsed_oa_counter(&builder);
+   report_uint64_oa_counter_as_duration(&builder,
+                                        "GPU Time Elapsed",
+                                        "Time elapsed on the GPU during the measurement.",
+                                        elapsed);
+
+   c = add_avg_frequency_oa_counter(&builder, elapsed);
+   report_uint64_oa_counter_as_uint64_event(&builder,
+                                            "AVG GPU Core Frequency",
+                                            "Average GPU Core Frequency in the measurement.",
+                                            c);
+
+   add_aggregate_counters(&builder);
+
+   raw = add_raw_oa_counter(&builder, a_offset + 35);
+   report_uint64_oa_counter_as_uint64_event(&builder,
+                                            "Early Depth Test Fails",
+                                            "The total number of pixels dropped on early depth test.",
+                                            raw);
+   /* XXX: caveat: reads 2x the real number when the PS has 2 output colors */
+   raw = add_raw_oa_counter(&builder, a_offset + 36);
+   report_uint64_oa_counter_as_uint64_event(&builder,
+                                            "Samples Killed in PS",
+                                            "The total number of samples or pixels dropped in pixel shaders.",
+                                            raw);
+   raw = add_raw_oa_counter(&builder, a_offset + 37);
+   report_uint64_oa_counter_as_uint64_event(&builder,
+                                            "Alpha Test Fails",
+                                            "The total number of pixels dropped on post-PS alpha test.",
+                                            raw);
+   raw = add_raw_oa_counter(&builder, a_offset + 38);
+   report_uint64_oa_counter_as_uint64_event(&builder,
+                                            "Late Stencil Test Fails",
+                                            "The total number of pixels dropped on post-PS stencil test.",
+                                            raw);
+   raw = add_raw_oa_counter(&builder, a_offset + 39);
+   report_uint64_oa_counter_as_uint64_event(&builder,
+                                            "Late Depth Test Fails",
+                                            "The total number of pixels dropped on post-PS depth test.",
+                                            raw);
+   raw = add_raw_oa_counter(&builder, a_offset + 40);
+   report_uint64_oa_counter_as_uint64_event(&builder,
+                                            "Samples Written",
+                                            "The total number of samples or pixels written to all render targets.",
+                                            raw);
+
+   raw = add_raw_oa_counter(&builder, c_offset + 5);
+   /* I.e. assuming even work distribution across threads... */
+   c = add_scaled_uint64_oa_counter(&builder, raw, brw->perfquery.eu_count * 4);
+   report_uint64_oa_counter_as_uint64_event(&builder,
+                                            "Samples Blended",
+                                            "The total number of blended samples or pixels written to all render targets.",
+                                            c);
+
+   /* XXX: the XML implies an explicit sub-slice availability check, but
+    * it's unclear how to determine that at the moment, so maybe we could
+    * just check for GT2/3 which have an even number of slices? */
+   sampler0_busy_raw = add_raw_oa_counter(&builder, b_offset + 0);
+   c = add_oa_counter_normalised_by_gpu_duration(&builder, sampler0_busy_raw);
+   report_float_oa_counter_as_percentage_duration(&builder,
+                                                   "Sampler 0 Busy",
+                                                   "The percentage of time in which sampler 0 was busy.",
+                                                   c);
+   /* XXX: the XML implies an explicit sub-slice availability check, but
+    * it's unclear how to determine that at the moment, so maybe we could
+    * just check for GT2/3 which have an even number of slices? */
+   sampler1_busy_raw = add_raw_oa_counter(&builder, b_offset + 1);
+   c = add_oa_counter_normalised_by_gpu_duration(&builder, sampler1_busy_raw);
+   report_float_oa_counter_as_percentage_duration(&builder,
+                                                   "Sampler 1 Busy",
+                                                   "The percentage of time in which sampler 1 was busy.",
+                                                   c);
+
+   c = add_hsw_samplers_busy_duration_oa_counter(&builder,
+                                                 sampler0_busy_raw,
+                                                 sampler1_busy_raw);
+   report_float_oa_counter_as_percentage_duration(&builder,
+                                                   "Samplers Busy",
+                                                   "The percentage of time in which samplers were busy.",
+                                                   c);
+
+   raw = add_raw_oa_counter(&builder, b_offset + 2);
+   sampler0_bottleneck = add_oa_counter_normalised_by_gpu_duration(&builder, raw);
+   report_float_oa_counter_as_percentage_duration(&builder,
+                                                   "Sampler 0 Bottleneck",
+                                                   "The percentage of time in which sampler 0 was a bottleneck.",
+                                                   sampler0_bottleneck);
+   raw = add_raw_oa_counter(&builder, b_offset + 3);
+   sampler1_bottleneck = add_oa_counter_normalised_by_gpu_duration(&builder, raw);
+   report_float_oa_counter_as_percentage_duration(&builder,
+                                                   "Sampler 1 Bottleneck",
+                                                   "The percentage of time in which sampler 1 was a bottleneck.",
+                                                   sampler1_bottleneck);
+
+   c = add_max_of_float_oa_counters(&builder, sampler0_bottleneck, sampler1_bottleneck);
+   report_float_oa_counter_as_percentage_duration(&builder,
+                                                   "Sampler Bottleneck",
+                                                   "The percentage of time in which samplers were bottlenecks.",
+                                                   c);
+   raw = add_raw_oa_counter(&builder, b_offset + 4);
+   sampler0_texels = add_scaled_uint64_oa_counter(&builder, raw, 4);
+   report_uint64_oa_counter_as_uint64_event(&builder,
+                                            "Sampler 0 Texels LOD0",
+                                            "The total number of texels lookups in LOD0 in sampler 0 unit.",
+                                            sampler0_texels);
+   raw = add_raw_oa_counter(&builder, b_offset + 5);
+   sampler1_texels = add_scaled_uint64_oa_counter(&builder, raw, 4);
+   report_uint64_oa_counter_as_uint64_event(&builder,
+                                            "Sampler 1 Texels LOD0",
+                                            "The total number of texels lookups in LOD0 in sampler 1 unit.",
+                                            sampler1_texels);
+
+   c = add_hsw_slice_extrapolated_oa_counter(&builder, sampler0_texels, sampler1_texels);
+   report_uint64_oa_counter_as_uint64_event(&builder,
+                                            "Sampler Texels LOD0",
+                                            "The total number of texels lookups in LOD0 in all sampler units.",
+                                            c);
+
+   raw = add_raw_oa_counter(&builder, b_offset + 6);
+   sampler0_l1_misses = add_scaled_uint64_oa_counter(&builder, raw, 2);
+   report_uint64_oa_counter_as_uint64_event(&builder,
+                                            "Sampler 0 Cache Misses",
+                                            "The total number of misses in L1 sampler caches.",
+                                            sampler0_l1_misses);
+   raw = add_raw_oa_counter(&builder, b_offset + 7);
+   sampler1_l1_misses = add_scaled_uint64_oa_counter(&builder, raw, 2);
+   report_uint64_oa_counter_as_uint64_event(&builder,
+                                            "Sampler 1 Cache Misses",
+                                            "The total number of misses in L1 sampler caches.",
+                                            sampler1_l1_misses);
+   sampler_l1_misses = add_hsw_slice_extrapolated_oa_counter(&builder, sampler0_l1_misses, sampler1_l1_misses);
+   report_uint64_oa_counter_as_uint64_event(&builder,
+                                            "Sampler Cache Misses",
+                                            "The total number of misses in L1 sampler caches.",
+                                            sampler_l1_misses);
+
+   c = add_scaled_uint64_oa_counter(&builder, sampler_l1_misses, 64);
+   report_uint64_oa_counter_as_throughput(&builder,
+                                          "L3 Sampler Throughput",
+                                          "The total number of GPU memory bytes transferred between samplers and L3 caches.",
+                                          c);
+
+   raw = add_raw_oa_counter(&builder, c_offset + 1);
+   c = add_scaled_uint64_oa_counter(&builder, raw, 64);
+   report_uint64_oa_counter_as_throughput(&builder,
+                                          "GTI Fixed Pipe Throughput",
+                                          "The total number of GPU memory bytes transferred between Fixed Pipeline (Command Dispatch, Input Assembly and Stream Output) and GTI.",
+                                          c);
+
+   raw = add_raw_oa_counter(&builder, c_offset + 0);
+   c = add_scaled_uint64_oa_counter(&builder, raw, 64);
+   report_uint64_oa_counter_as_throughput(&builder,
+                                          "GTI Depth Throughput",
+                                          "The total number of GPU memory bytes transferred between depth caches and GTI.",
+                                          c);
+   raw = add_raw_oa_counter(&builder, c_offset + 3);
+   c = add_scaled_uint64_oa_counter(&builder, raw, 64);
+   report_uint64_oa_counter_as_throughput(&builder,
+                                          "GTI RCC Throughput",
+                                          "The total number of GPU memory bytes transferred between render color caches and GTI.",
+                                          c);
+   raw = add_raw_oa_counter(&builder, c_offset + 4);
+   c = add_scaled_uint64_oa_counter(&builder, raw, 64);
+   report_uint64_oa_counter_as_throughput(&builder,
+                                          "GTI L3 Throughput",
+                                          "The total number of GPU memory bytes transferred between L3 caches and GTI.",
+                                          c);
+   raw = add_raw_oa_counter(&builder, c_offset + 6);
+   c = add_scaled_uint64_oa_counter(&builder, raw, 128);
+   report_uint64_oa_counter_as_throughput(&builder,
+                                          "GTI Read Throughput",
+                                          "The total number of GPU memory bytes read from GTI.",
+                                          c);
+   raw = add_raw_oa_counter(&builder, c_offset + 7);
+   c = add_scaled_uint64_oa_counter(&builder, raw, 64);
+   report_uint64_oa_counter_as_throughput(&builder,
+                                          "GTI Write Throughput",
+                                          "The total number of GPU memory bytes written to GTI.",
+                                          c);
+   /* NOTE(review): checked only after the arrays were filled, so an overflow is detected, not prevented. */
+   assert(query->n_counters < MAX_PERF_QUERY_COUNTERS);
+   assert(query->n_oa_counters < MAX_OA_QUERY_COUNTERS);
+   /* Total result size is the end of the last reported counter. */
+   last = &query->counters[query->n_counters - 1];
+   query->data_size = last->offset + last->size;
+}
+
+
 #define SCALED_NAMED_STAT(REG, NUM, DEN, NAME, DESC)        \
    {                                                        \
       .name = NAME,                                         \
@@ -1937,6 +2336,7 @@ brw_init_performance_queries(struct brw_context *brw)
       if (brw->is_haswell) {
          brw->perfquery.read_oa_report_timestamp = hsw_read_report_timestamp;
          hsw_add_basic_oa_counter_query(brw);
+         hsw_add_3d_oa_counter_query(brw);
       }
    }
 
-- 
2.3.2



More information about the mesa-dev mailing list