[Mesa-dev] [RFC 5/6] i965: Expose OA counters via INTEL_performance_query

Robert Bragg robert at sixbynine.org
Tue May 5 17:53:53 PDT 2015


This adds support for exposing basic ('aggregate') Observation
Architecture performance counters on Haswell.

This support is based on the i915_oa perf kernel interface which is used
to configure the OA unit, allowing Mesa to emit MI_REPORT_PERF_COUNT
commands around queries to collect counter snapshots.
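
For context, a rough sketch (not part of this patch) of how an
application drives these queries through GL_INTEL_performance_query;
entry point names follow the extension spec, and error handling plus
the query/counter introspection calls are omitted:

   GLuint query_id, handle;
   GLuint bytes_written;
   char data[1024]; /* should really be sized via glGetPerfQueryInfoINTEL() */

   glGetFirstPerfQueryIdINTEL(&query_id);
   glCreatePerfQueryINTEL(query_id, &handle);

   glBeginPerfQueryINTEL(handle);
   /* ... issue the GL workload to be measured ... */
   glEndPerfQueryINTEL(handle);

   /* GL_PERFQUERY_WAIT_INTEL blocks until the results are ready */
   glGetPerfQueryDataINTEL(handle, GL_PERFQUERY_WAIT_INTEL,
                           sizeof(data), data, &bytes_written);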

To account for the small chance that some of the 32bit counters could
wrap around during a long query (after ~50 milliseconds for a GT3
Haswell @ 1.1GHz), the implementation also collects periodic counter
snapshots via perf.
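
As a minimal illustration (again, not part of the diff below) of why
periodic snapshots are enough to handle that: a 32bit counter delta
computed in 32bit arithmetic stays correct across a single wrap, so as
long as snapshots arrive more often than the wrap period the 64bit
accumulation can't miss an overflow:

   #include <stdint.h>

   static void
   accumulate_u32_delta(uint64_t *accumulator, uint32_t start, uint32_t end)
   {
      /* Unsigned subtraction wraps modulo 2^32, so the delta is still
       * correct provided the counter advanced by less than 2^32
       * between the two snapshots. */
      *accumulator += (uint32_t)(end - start);
   }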

It seems worth noting that the structures chosen to describe the
counters were arrived at after a few iterations of writing other small
intel_gpu_tools programs that similarly needed to describe, normalise
and display these counters. Many of the utility functions should be
reusable as more sets of counters are exposed via the OA unit, as well
as for future hardware generations.
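
As an example of the kind of normalisation those utilities perform (a
sketch mirroring the callbacks added below, not additional patch code):
a counter aggregated across all EUs is divided by the EU count and then
expressed as a percentage of the elapsed GPU clock cycles:

   #include <stdint.h>

   static float
   normalise_by_eu_duration(uint64_t raw_delta, uint64_t clk_delta,
                            int eu_count)
   {
      if (!clk_delta)
         return 0.0f;

      raw_delta /= eu_count; /* per-EU cycles */
      return (double)raw_delta * 100.0 / (double)clk_delta;
   }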

It also seems worth noting that in the future we may want to factor
much of the code for describing and normalising OA counters out into a
common library such as libdrm, since we are already having difficulty
copying very similar code between multiple tools dealing with these
same counters, such as tools that look at the OA counters for
system-wide analysis. Until we have a bit more experience dealing with
the counters, though, it seems best to defer this separation and avoid
having to design a suitable API for these different use cases.

Signed-off-by: Robert Bragg <robert at sixbynine.org>
---
 src/mesa/drivers/dri/i965/brw_context.h           |   66 +
 src/mesa/drivers/dri/i965/brw_performance_query.c | 1383 ++++++++++++++++++++-
 2 files changed, 1430 insertions(+), 19 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h
index 2cd963d..5f5a59b 100644
--- a/src/mesa/drivers/dri/i965/brw_context.h
+++ b/src/mesa/drivers/dri/i965/brw_context.h
@@ -954,6 +954,7 @@ struct brw_stage_state
 };
 
 enum brw_query_kind {
+   OA_COUNTERS,
    PIPELINE_STATS
 };
 
@@ -964,10 +965,18 @@ struct brw_perf_query
    struct brw_perf_query_counter *counters;
    int n_counters;
    size_t data_size;
+
+   /* OA specific */
+   int oa_metrics_set;
+   int oa_format;
+   struct brw_oa_counter *oa_counters;
+   int n_oa_counters;
 };
 
 #define MAX_PERF_QUERIES 3
 #define MAX_PERF_QUERY_COUNTERS 150
+#define MAX_OA_QUERY_COUNTERS 100
+#define MAX_RAW_OA_COUNTERS 62
 
 /**
  * brw_context is derived from gl_context.
@@ -1399,7 +1408,64 @@ struct brw_context
       struct brw_perf_query queries[MAX_PERF_QUERIES];
       int n_queries;
 
+      /* A common OA counter that we want to read directly in several places */
+      uint64_t (*read_oa_report_timestamp)(uint32_t *report);
+
+      /* Needed to normalize counters aggregated across all EUs */
+      int eu_count;
+
+      /* The i915_oa perf event we open to setup + enable the OA counters */
+      int perf_oa_event_fd;
+
+      /* An i915_oa perf event fd gives exclusive access to the OA unit that
+       * will report counter snapshots for a specific counter set/profile in a
+       * specific layout/format, so we can only start OA queries that are
+       * compatible with the currently open fd... */
+      int perf_oa_metrics_set;
+      int perf_oa_format;
+
+      /* The mmaped circular buffer for collecting samples from perf */
+      uint8_t *perf_oa_mmap_base;
+      size_t perf_oa_buffer_size;
+      struct perf_event_mmap_page *perf_oa_mmap_page;
+
+      /* The system's page size */
+      unsigned int page_size;
+
+      int n_active_oa_queries;
       int n_active_pipeline_stats_queries;
+
+      /* The number of queries depending on running OA counters, which
+       * extends beyond brw_end_perf_query() since we need to wait until
+       * the last MI_RPC command has been written.
+       *
+       * Accurate accounting is important here as emitting an
+       * MI_REPORT_PERF_COUNT command while the OA unit is disabled will
+       * effectively hang the gpu.
+       */
+      int n_oa_users;
+
+      /* To help catch a spurious problem with the hardware or perf
+       * forwarding samples, we emit each MI_REPORT_PERF_COUNT command
+       * with a unique ID that we can explicitly check for... */
+      int next_query_start_report_id;
+
+      /**
+       * An array of queries whose results haven't yet been assembled based on
+       * the data in buffer objects.
+       *
+       * These may be active, or have already ended.  However, the results
+       * have not been requested.
+       */
+      struct brw_perf_query_object **unresolved;
+      int unresolved_elements;
+      int unresolved_array_size;
+
+      /* The total number of query objects so we can relinquish
+       * our exclusive access to perf if the application deletes
+       * all of its objects. (NB: We only disable perf while
+       * there are no active queries) */
+      int n_query_instances;
    } perfquery;
 
    int num_atoms[BRW_NUM_PIPELINES];
diff --git a/src/mesa/drivers/dri/i965/brw_performance_query.c b/src/mesa/drivers/dri/i965/brw_performance_query.c
index 38447e8..bfe39f9 100644
--- a/src/mesa/drivers/dri/i965/brw_performance_query.c
+++ b/src/mesa/drivers/dri/i965/brw_performance_query.c
@@ -26,9 +26,17 @@
  *
  * Implementation of the GL_INTEL_performance_query extension.
  *
- * Currently this driver only exposes the 64bit Pipeline Statistics Registers
- * available with Gen6 and Gen7.5, with support for Observability Counters
- * to be added later for Gen7.5+
+ * Currently there are two possible counter sources exposed here:
+ *
+ * On Gen6+ hardware we have numerous 64bit Pipeline Statistics Registers
+ * that we can snapshot at the beginning and end of a query.
+ *
+ * On Gen7.5+ we have Observability Architecture counters, which are
+ * covered in a separate document from the rest of the PRMs.  It is available at:
+ * https://01.org/linuxgraphics/documentation/driver-documentation-prms
+ * => 2013 Intel Core Processor Family => Observability Performance Counters
+ * (This one volume covers Sandybridge, Ivybridge, Baytrail, and Haswell,
+ * though notably we currently only support OA counters for Haswell+)
  */
 
 #include <linux/perf_event.h>
@@ -56,6 +64,41 @@
 
 #define FILE_DEBUG_FLAG DEBUG_PERFMON
 
+/* Describes how to read one OA counter which might be a raw counter read
+ * directly from a counter snapshot or could be a higher level counter derived
+ * from one or more raw counters.
+ *
+ * Raw counters will have ->report_offset set to the snapshot offset and have
+ * an accumulator that can consider counter overflow according to the width of
+ * that counter.
+ *
+ * Higher level counters can currently reference up to 3 other counters + use
+ * ->config for anything. They don't need an accumulator.
+ *
+ * The data type that will be written to *value_out by the read function can
+ * be determined by ->data_type
+ */
+struct brw_oa_counter
+{
+   struct brw_oa_counter *reference0;
+   struct brw_oa_counter *reference1;
+   struct brw_oa_counter *reference2;
+   union {
+      int report_offset;
+      int config;
+   };
+
+   int accumulator_index;
+   void (*accumulate)(struct brw_oa_counter *counter,
+                      uint32_t *start,
+                      uint32_t *end,
+                      uint64_t *accumulator);
+   GLenum data_type;
+   void (*read)(struct brw_oa_counter *counter,
+                uint64_t *accumulated,
+                void *value_out);
+};
+
 struct brw_pipeline_stat
 {
    uint32_t reg;
@@ -74,7 +117,24 @@ struct brw_perf_query_counter
    size_t offset;
    size_t size;
 
-   struct brw_pipeline_stat pipeline_stat;
+   union {
+      struct brw_oa_counter *oa_counter;
+      struct brw_pipeline_stat pipeline_stat;
+   };
+};
+
+struct brw_query_builder
+{
+   struct brw_context *brw;
+   struct brw_perf_query *query;
+   size_t offset;
+   int next_accumulator_index;
+
+   int a_offset;
+   int b_offset;
+   int c_offset;
+
+   struct brw_oa_counter *gpu_core_clock;
 };
 
 /**
@@ -91,20 +151,86 @@ struct brw_perf_query_object
 
    const struct brw_perf_query *query;
 
-   struct {
-      /**
-       * BO containing starting and ending snapshots for the
-       * statistics counters.
-       */
-      drm_intel_bo *bo;
-
-      /**
-       * Storage for final pipeline statistics counter results.
-       */
-      uint64_t *results;
+   /* See query->kind to know which state below is in use... */
+   union {
+      struct {
+
+         /**
+          * BO containing OA counter snapshots at query Begin/End time.
+          */
+         drm_intel_bo *bo;
+         int current_report_id;
+
+         /**
+          * We collect periodic counter snapshots via perf so we can account
+          * for counter overflow; this is a tail offset into the circular
+          * perf buffer, tracking the snapshots that lie within the begin-end
+          * bounds of this query.
+          */
+         unsigned int perf_tail;
+
+         /**
+          * Storage for the final accumulated OA counters.
+          */
+         uint64_t accumulator[MAX_RAW_OA_COUNTERS];
+
+         /**
+          * false while in the unresolved_elements list, and set to true when
+          * the final, end MI_RPC snapshot has been accumulated.
+          */
+         bool results_accumulated;
+
+      } oa;
+
+      struct {
+         /**
+          * BO containing starting and ending snapshots for the
+          * statistics counters.
+          */
+         drm_intel_bo *bo;
+
+         /**
+          * Storage for final pipeline statistics counter results.
+          */
+         uint64_t *results;
+
+      } pipeline_stats;
+   };
+};
 
-   } pipeline_stats;
+/* Samples read from the perf circular buffer */
+struct oa_perf_sample {
+   struct perf_event_header header;
+   uint32_t raw_size;
+   uint8_t raw_data[];
 };
+#define MAX_OA_PERF_SAMPLE_SIZE (8 +   /* perf_event_header */       \
+                                 4 +   /* raw_size */                \
+                                 256 + /* raw OA counter snapshot */ \
+                                 4)    /* alignment padding */
+
+#define TAKEN(HEAD, TAIL, POT_SIZE)	(((HEAD) - (TAIL)) & (POT_SIZE - 1))
+
+/* Note: this will equate to 0 when the buffer is exactly full... */
+#define REMAINING(HEAD, TAIL, POT_SIZE) (POT_SIZE - TAKEN (HEAD, TAIL, POT_SIZE))
+
+#if defined(__i386__)
+#define rmb()           __asm__ volatile("lock; addl $0,0(%%esp)" ::: "memory")
+#define mb()            __asm__ volatile("lock; addl $0,0(%%esp)" ::: "memory")
+#endif
+
+#if defined(__x86_64__)
+#define rmb()           __asm__ volatile("lfence" ::: "memory")
+#define mb()            __asm__ volatile("mfence" ::: "memory")
+#endif
+
+/* Allow building for a more recent kernel than the system headers
+ * correspond to... */
+#ifndef PERF_EVENT_IOC_FLUSH
+#include <linux/ioctl.h>
+#define PERF_EVENT_IOC_FLUSH                 _IO ('$', 8)
+#define PERF_RECORD_DEVICE                   11
+#endif
 
 /** Downcasting convenience macro. */
 static inline struct brw_perf_query_object *
@@ -115,6 +241,12 @@ brw_perf_query(struct gl_perf_query_object *o)
 
 #define SECOND_SNAPSHOT_OFFSET_IN_BYTES 2048
 
+static inline size_t
+pot_align(size_t base, int pot_alignment)
+{
+    return (base + pot_alignment - 1) & ~(pot_alignment - 1);
+}
+
 /******************************************************************************/
 
 static GLboolean brw_is_perf_query_ready(struct gl_context *,
@@ -123,10 +255,20 @@ static GLboolean brw_is_perf_query_ready(struct gl_context *,
 static void
 dump_perf_query_callback(GLuint id, void *query_void, void *brw_void)
 {
+   struct gl_context *ctx = brw_void;
    struct gl_perf_query_object *o = query_void;
    struct brw_perf_query_object *obj = query_void;
 
    switch(obj->query->kind) {
+   case OA_COUNTERS:
+      DBG("%4d: %-6s %-8s BO: %-4s OA data: %-10s %-15s\n",
+          id,
+          o->Used ? "Dirty," : "New,",
+          o->Active ? "Active," : (o->Ready ? "Ready," : "Pending,"),
+          obj->oa.bo ? "yes," : "no,",
+          brw_is_perf_query_ready(ctx, o) ? "ready," : "not ready,",
+          obj->oa.results_accumulated ? "accumulated" : "not accumulated");
+      break;
    case PIPELINE_STATS:
       DBG("%4d: %-6s %-8s BO: %-4s\n",
           id,
@@ -141,8 +283,8 @@ void
 brw_dump_perf_queries(struct brw_context *brw)
 {
    struct gl_context *ctx = &brw->ctx;
-   DBG("Queries: (Open queries = %d)\n",
-       brw->perfquery.n_active_pipeline_stats_queries);
+   DBG("Queries: (Open queries = %d, OA users = %d)\n",
+       brw->perfquery.n_active_oa_queries, brw->perfquery.n_oa_users);
    _mesa_HashWalk(ctx->PerfQuery.Objects, dump_perf_query_callback, brw);
 }
 
@@ -164,6 +306,10 @@ brw_get_perf_query_info(struct gl_context *ctx,
    *n_counters = query->n_counters;
 
    switch(query->kind) {
+   case OA_COUNTERS:
+      *n_active = brw->perfquery.n_active_oa_queries;
+      break;
+
    case PIPELINE_STATS:
       *n_active = brw->perfquery.n_active_pipeline_stats_queries;
       break;
@@ -263,6 +409,502 @@ gather_statistics_results(struct brw_context *brw,
 /******************************************************************************/
 
 /**
+ * Emit an MI_REPORT_PERF_COUNT command packet.
+ *
+ * This writes the current OA counter values to buffer.
+ */
+static void
+emit_mi_report_perf_count(struct brw_context *brw,
+                          drm_intel_bo *bo,
+                          uint32_t offset_in_bytes,
+                          uint32_t report_id)
+{
+   assert(offset_in_bytes % 64 == 0);
+
+   /* Reports apparently don't always get written unless we flush first. */
+   intel_batchbuffer_emit_mi_flush(brw);
+
+   if (brw->gen == 7) {
+      BEGIN_BATCH(3);
+      OUT_BATCH(GEN6_MI_REPORT_PERF_COUNT);
+      OUT_RELOC(bo, I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
+                offset_in_bytes);
+      OUT_BATCH(report_id);
+      ADVANCE_BATCH();
+   } else
+      unreachable("Unsupported generation for OA performance counters.");
+
+   /* Reports apparently don't always get written unless we flush after. */
+   intel_batchbuffer_emit_mi_flush(brw);
+}
+
+static unsigned int
+read_perf_head(struct perf_event_mmap_page *mmap_page)
+{
+   unsigned int head = (*(volatile uint64_t *)&mmap_page->data_head);
+   rmb();
+
+   return head;
+}
+
+static void
+write_perf_tail(struct perf_event_mmap_page *mmap_page,
+                unsigned int tail)
+{
+   /* Make sure we've finished reading all the sample data we're
+    * consuming before updating the tail... */
+   mb();
+   mmap_page->data_tail = tail;
+}
+
+/* Update the real perf tail pointer according to the query tail that
+ * is currently furthest behind...
+ */
+static void
+update_perf_tail(struct brw_context *brw)
+{
+   unsigned int size = brw->perfquery.perf_oa_buffer_size;
+   unsigned int head = read_perf_head(brw->perfquery.perf_oa_mmap_page);
+   int straggler_taken = -1;
+   unsigned int straggler_tail;
+
+   for (int i = 0; i < brw->perfquery.unresolved_elements; i++) {
+      struct brw_perf_query_object *obj = brw->perfquery.unresolved[i];
+      int taken;
+
+      if (!obj->oa.bo)
+         continue;
+
+      taken = TAKEN(head, obj->oa.perf_tail, size);
+
+      if (taken > straggler_taken) {
+         straggler_taken = taken;
+         straggler_tail = obj->oa.perf_tail;
+      }
+   }
+
+   if (straggler_taken >= 0)
+      write_perf_tail(brw->perfquery.perf_oa_mmap_page, straggler_tail);
+}
+
+/**
+ * Add a query to the global list of "unresolved queries."
+ *
+ * Queries are "unresolved" until all the counter snapshots have been
+ * accumulated via accumulate_oa_snapshots() after the end MI_REPORT_PERF_COUNT
+ * has landed in query->oa.bo.
+ */
+static void
+add_to_unresolved_query_list(struct brw_context *brw,
+                             struct brw_perf_query_object *obj)
+{
+   if (brw->perfquery.unresolved_elements >=
+       brw->perfquery.unresolved_array_size) {
+      brw->perfquery.unresolved_array_size *= 1.5;
+      brw->perfquery.unresolved = reralloc(brw, brw->perfquery.unresolved,
+                                           struct brw_perf_query_object *,
+                                           brw->perfquery.unresolved_array_size);
+   }
+
+   brw->perfquery.unresolved[brw->perfquery.unresolved_elements++] = obj;
+
+   if (obj->oa.bo)
+      update_perf_tail(brw);
+}
+
+/**
+ * Remove a query from the global list of "unresolved queries" once
+ * the end MI_RPC OA counter snapshot has been accumulated, or when
+ * discarding unwanted query results.
+ */
+static void
+drop_from_unresolved_query_list(struct brw_context *brw,
+                                struct brw_perf_query_object *obj)
+{
+   for (int i = 0; i < brw->perfquery.unresolved_elements; i++) {
+      if (brw->perfquery.unresolved[i] == obj) {
+         int last_elt = --brw->perfquery.unresolved_elements;
+
+         if (i == last_elt)
+            brw->perfquery.unresolved[i] = NULL;
+         else
+            brw->perfquery.unresolved[i] = brw->perfquery.unresolved[last_elt];
+
+         break;
+      }
+   }
+
+   if (obj->oa.bo)
+      update_perf_tail(brw);
+}
+
+static int
+get_eu_count(struct brw_context *brw)
+{
+   const struct brw_device_info *info = brw->intelScreen->devinfo;
+
+   assert(info && info->is_haswell);
+
+   if (info->gt == 1)
+      return 10;
+   else if (info->gt == 2)
+      return 20;
+   else if (info->gt == 3)
+      return 40;
+
+   unreachable("Unexpected Haswell GT number");
+}
+
+static uint64_t
+read_report_timestamp(struct brw_context *brw, uint32_t *report)
+{
+   return brw->perfquery.read_oa_report_timestamp(report);
+}
+
+/**
+ * Given pointers to starting and ending OA snapshots, add the deltas for each
+ * counter to the results.
+ */
+static void
+add_deltas(struct brw_context *brw,
+           struct brw_perf_query_object *obj,
+           uint32_t *start, uint32_t *end)
+{
+   const struct brw_perf_query *query = obj->query;
+
+   for (int i = 0; i < query->n_oa_counters; i++) {
+      struct brw_oa_counter *oa_counter = &query->oa_counters[i];
+
+      if (!oa_counter->accumulate)
+         continue;
+
+      oa_counter->accumulate(oa_counter,
+                             start, end,
+                             obj->oa.accumulator);
+   }
+}
+
+/* Handle restarting ioctl if interrupted... */
+static int
+perf_ioctl(int fd, unsigned long request, void *arg)
+{
+   int ret;
+
+   do {
+      ret = ioctl(fd, request, arg);
+   } while (ret == -1 && (errno == EINTR || errno == EAGAIN));
+   return ret;
+}
+
+static bool
+inc_n_oa_users(struct brw_context *brw)
+{
+   if (brw->perfquery.n_oa_users == 0 &&
+       perf_ioctl(brw->perfquery.perf_oa_event_fd,
+                  PERF_EVENT_IOC_ENABLE, 0) < 0)
+   {
+      return false;
+   }
+   ++brw->perfquery.n_oa_users;
+
+   return true;
+}
+
+static void
+dec_n_oa_users(struct brw_context *brw)
+{
+   /* Disabling the i915_oa event will effectively disable the OA
+    * counters.  Note it's important to be sure there are no outstanding
+    * MI_RPC commands at this point since they could stall the CS
+    * indefinitely once OACONTROL is disabled.
+    */
+   --brw->perfquery.n_oa_users;
+   if (brw->perfquery.n_oa_users == 0 &&
+       perf_ioctl(brw->perfquery.perf_oa_event_fd,
+                  PERF_EVENT_IOC_DISABLE, 0) < 0)
+   {
+      DBG("WARNING: Error disabling i915_oa perf event: %m\n");
+   }
+}
+
+/* In general, if we see anything spurious while accumulating results,
+ * we don't try to continue accumulating the current query hoping for
+ * the best; instead we scrap anything outstanding and then hope for
+ * the best with new queries. */
+static void
+discard_all_queries(struct brw_context *brw)
+{
+   while (brw->perfquery.unresolved_elements) {
+      struct brw_perf_query_object *obj = brw->perfquery.unresolved[0];
+
+      obj->oa.results_accumulated = true;
+      drop_from_unresolved_query_list(brw, brw->perfquery.unresolved[0]);
+
+      dec_n_oa_users(brw);
+   }
+}
+
+/**
+ * Accumulate OA counter results from a series of snapshots.
+ *
+ * N.B. We write snapshots for the beginning and end of a query into
+ * query->oa.bo as well as collect periodic snapshots from the Linux
+ * perf interface.
+ *
+ * These periodic snapshots help to ensure we handle counter overflow
+ * correctly by being frequent enough to ensure we don't miss multiple
+ * overflows of a counter between snapshots.
+ */
+static void
+accumulate_oa_snapshots(struct brw_context *brw,
+                        struct brw_perf_query_object *obj)
+{
+   struct gl_perf_query_object *o = &obj->base;
+   uint32_t *query_buffer;
+   uint8_t *data = brw->perfquery.perf_oa_mmap_base + brw->perfquery.page_size;
+   const unsigned int size = brw->perfquery.perf_oa_buffer_size;
+   const uint64_t mask = size - 1;
+   uint64_t head;
+   uint64_t tail;
+   uint32_t *start;
+   uint64_t start_timestamp;
+   uint32_t *last;
+   uint32_t *end;
+   uint64_t end_timestamp;
+   uint8_t scratch[MAX_OA_PERF_SAMPLE_SIZE];
+
+   assert(o->Ready);
+
+   if (perf_ioctl(brw->perfquery.perf_oa_event_fd,
+                  PERF_EVENT_IOC_FLUSH, 0) < 0)
+      DBG("Failed to flush outstanding perf events: %m\n");
+
+   drm_intel_bo_map(obj->oa.bo, false);
+   query_buffer = obj->oa.bo->virtual;
+
+   start = last = query_buffer;
+   end = query_buffer + (SECOND_SNAPSHOT_OFFSET_IN_BYTES / sizeof(uint32_t));
+
+   if (start[0] != obj->oa.current_report_id) {
+      DBG("Spurious start report id=%"PRIu32"\n", start[0]);
+      goto error;
+   }
+   if (end[0] != (obj->oa.current_report_id + 1)) {
+      DBG("Spurious end report id=%"PRIu32"\n", end[0]);
+      goto error;
+   }
+
+   start_timestamp = read_report_timestamp(brw, start);
+   end_timestamp = read_report_timestamp(brw, end);
+
+   head = read_perf_head(brw->perfquery.perf_oa_mmap_page);
+   tail = obj->oa.perf_tail;
+
+   while (TAKEN(head, tail, size)) {
+      const struct perf_event_header *header =
+         (const struct perf_event_header *)(data + (tail & mask));
+
+      assert(header->size != 0);
+      assert(header->size <= (head - tail));
+
+      if ((const uint8_t *)header + header->size > data + size) {
+         int before;
+
+         if (header->size > sizeof(scratch)) {
+            DBG("Spurious sample larger than expected\n");
+            goto error;
+         }
+
+         before = data + size - (const uint8_t *)header;
+
+         memcpy(scratch, header, before);
+         memcpy(scratch + before, data, header->size - before);
+
+         header = (struct perf_event_header *)scratch;
+      }
+
+      switch (header->type) {
+         case PERF_RECORD_LOST: {
+            struct {
+               struct perf_event_header header;
+               uint64_t id;
+               uint64_t n_lost;
+            } *lost = (void *)header;
+
+            DBG("i915_oa: Lost %" PRIu64 " events\n", lost->n_lost);
+            goto error;
+         }
+
+         case PERF_RECORD_THROTTLE:
+            DBG("i915_oa: Sampling has been throttled\n");
+            break;
+
+         case PERF_RECORD_UNTHROTTLE:
+            DBG("i915_oa: Sampling has been unthrottled\n");
+            break;
+
+         case PERF_RECORD_SAMPLE: {
+            struct oa_perf_sample *perf_sample = (struct oa_perf_sample *)header;
+            uint32_t *report = (uint32_t *)perf_sample->raw_data;
+            uint64_t timestamp = read_report_timestamp(brw, report);
+
+            if (timestamp >= end_timestamp)
+               goto end;
+
+            if (timestamp > start_timestamp) {
+               add_deltas(brw, obj, last, report);
+               last = report;
+            }
+
+            break;
+         }
+
+         case PERF_RECORD_DEVICE: {
+            struct i915_oa_event {
+               struct perf_event_header header;
+               drm_i915_oa_event_header_t oa_header;
+            } *oa_event = (void *)header;
+
+            switch (oa_event->oa_header.type) {
+            case I915_OA_RECORD_BUFFER_OVERFLOW:
+               DBG("i915_oa: OA buffer overflow\n");
+               break;
+            case I915_OA_RECORD_REPORT_LOST:
+               DBG("i915_oa: OA report lost\n");
+               break;
+            }
+
+            break;
+         }
+
+         default:
+            DBG("i915_oa: Spurious header type = %d\n", header->type);
+      }
+
+      tail += header->size;
+   }
+
+end:
+
+   add_deltas(brw, obj, last, end);
+
+   DBG("Marking %d resolved - results gathered\n", o->Id);
+
+   drm_intel_bo_unmap(obj->oa.bo);
+   obj->oa.results_accumulated = true;
+   drop_from_unresolved_query_list(brw, obj);
+   dec_n_oa_users(brw);
+
+   return;
+
+error:
+
+   drm_intel_bo_unmap(obj->oa.bo);
+   discard_all_queries(brw);
+}
+
+/******************************************************************************/
+
+static uint64_t
+read_file_uint64 (const char *file)
+{
+   char buf[32];
+   int fd, n;
+
+   fd = open(file, 0);
+   if (fd < 0)
+      return 0;
+   n = read(fd, buf, sizeof (buf) - 1);
+   close(fd);
+   if (n < 0)
+      return 0;
+
+   buf[n] = '\0';
+   return strtoull(buf, 0, 0);
+}
+
+static uint64_t
+lookup_i915_oa_id (void)
+{
+   return read_file_uint64("/sys/bus/event_source/devices/i915_oa/type");
+}
+
+static long
+perf_event_open (struct perf_event_attr *hw_event,
+                 pid_t pid,
+                 int cpu,
+                 int group_fd,
+                 unsigned long flags)
+{
+   return syscall(__NR_perf_event_open, hw_event, pid, cpu, group_fd, flags);
+}
+
+static bool
+open_i915_oa_event(struct brw_context *brw,
+                   int metrics_set,
+                   int report_format,
+                   int period_exponent,
+                   int drm_fd,
+                   uint32_t ctx_id)
+{
+   struct perf_event_attr attr;
+   drm_i915_oa_attr_t oa_attr;
+   int event_fd;
+   void *mmap_base;
+
+   memset(&attr, 0, sizeof(attr));
+   attr.size = sizeof(attr);
+   attr.type = lookup_i915_oa_id();
+
+   attr.sample_type = PERF_SAMPLE_RAW;
+   attr.disabled = 1;
+   attr.sample_period = 1;
+
+   memset(&oa_attr, 0, sizeof(oa_attr));
+   oa_attr.size = sizeof(oa_attr);
+
+   oa_attr.format = report_format;
+   oa_attr.metrics_set = metrics_set;
+   oa_attr.timer_exponent = period_exponent;
+
+   oa_attr.single_context = true;
+   oa_attr.ctx_id = ctx_id;
+   oa_attr.drm_fd = drm_fd;
+
+   attr.config = (uint64_t)&oa_attr;
+
+   event_fd = perf_event_open(&attr,
+                              -1, /* pid */
+                              0, /* cpu */
+                              -1, /* group fd */
+                              PERF_FLAG_FD_CLOEXEC); /* flags */
+   if (event_fd == -1) {
+      DBG("Error opening i915_oa perf event: %m\n");
+      return false;
+   }
+
+   /* NB: A read-write mapping ensures the kernel will stop writing data when
+    * the buffer is full, and will report samples as lost. */
+   mmap_base = mmap(NULL,
+                    brw->perfquery.perf_oa_buffer_size + brw->perfquery.page_size,
+                    PROT_READ | PROT_WRITE, MAP_SHARED, event_fd, 0);
+   if (mmap_base == MAP_FAILED) {
+      DBG("Error mapping circular buffer, %m\n");
+      close (event_fd);
+      return false;
+   }
+
+   brw->perfquery.perf_oa_event_fd = event_fd;
+   brw->perfquery.perf_oa_mmap_base = mmap_base;
+   brw->perfquery.perf_oa_mmap_page = mmap_base;
+
+   brw->perfquery.perf_oa_metrics_set = metrics_set;
+   brw->perfquery.perf_oa_format = report_format;
+
+   return true;
+}
+
+/**
  * Driver hook for glBeginPerfQueryINTEL().
  */
 static GLboolean
@@ -271,6 +913,7 @@ brw_begin_perf_query(struct gl_context *ctx,
 {
    struct brw_context *brw = brw_context(ctx);
    struct brw_perf_query_object *obj = brw_perf_query(o);
+   const struct brw_perf_query *query = obj->query;
 
    assert(!o->Active);
    assert(!o->Used || o->Ready); /* no in-flight query to worry about */
@@ -278,6 +921,89 @@ brw_begin_perf_query(struct gl_context *ctx,
    DBG("Begin(%d)\n", o->Id);
 
    switch(obj->query->kind) {
+   case OA_COUNTERS:
+      /* If the OA counters aren't already on, enable them. */
+      if (brw->perfquery.perf_oa_event_fd == -1) {
+         __DRIscreen *screen = brw->intelScreen->driScrnPriv;
+         uint32_t ctx_id = drm_intel_gem_context_get_context_id(brw->hw_ctx);
+         int period_exponent;
+
+         /* The timestamp for HSW+ increments every 80ns
+          *
+          * The period_exponent gives a sampling period as follows:
+          *   sample_period = 80ns * 2^(period_exponent + 1)
+          *
+          * The overflow period for Haswell can be calculated as:
+          *
+          * 2^32 / (n_eus * max_gen_freq * 2)
+          * (E.g. 40 EUs @ 1GHz = ~53ms)
+          *
+          * We currently sample every 42 milliseconds...
+          */
+         period_exponent = 18;
+
+         if (!open_i915_oa_event(brw,
+                                 query->oa_metrics_set,
+                                 query->oa_format,
+                                 period_exponent,
+                                 screen->fd, /* drm fd */
+                                 ctx_id))
+            return false;
+      } else {
+         /* Opening an i915_oa event fd implies exclusive access to
+          * the OA unit which will generate counter reports for a
+          * specific counter set with a specific layout/format so we
+          * can't begin any OA based queries that require a different
+          * counter set or format unless we get an opportunity to
+          * close the event fd and open a new one...
+          */
+         if (brw->perfquery.perf_oa_metrics_set != query->oa_metrics_set ||
+             brw->perfquery.perf_oa_format != query->oa_format)
+         {
+            return false;
+         }
+      }
+
+      if (!inc_n_oa_users(brw)) {
+         DBG("WARNING: Error enabling i915_oa perf event: %m\n");
+         return false;
+      }
+
+      if (obj->oa.bo) {
+         drm_intel_bo_unreference(obj->oa.bo);
+         obj->oa.bo = NULL;
+      }
+
+      obj->oa.bo =
+         drm_intel_bo_alloc(brw->bufmgr, "perf. query OA bo", 4096, 64);
+#ifdef DEBUG
+      /* Pre-filling the BO helps debug whether writes landed. */
+      drm_intel_bo_map(obj->oa.bo, true);
+      memset((char *) obj->oa.bo->virtual, 0x80, 4096);
+      drm_intel_bo_unmap(obj->oa.bo);
+#endif
+
+      obj->oa.current_report_id = brw->perfquery.next_query_start_report_id;
+      brw->perfquery.next_query_start_report_id += 2;
+
+      /* Take a starting OA counter snapshot. */
+      emit_mi_report_perf_count(brw, obj->oa.bo, 0,
+                                obj->oa.current_report_id);
+      ++brw->perfquery.n_active_oa_queries;
+
+      /* Each unresolved query maintains a separate tail pointer into the
+       * circular perf sample buffer. The real tail pointer in
+       * perfquery.perf_oa_mmap_page.data_tail will correspond to the query
+       * tail that is furthest behind.
+       */
+      obj->oa.perf_tail = read_perf_head(brw->perfquery.perf_oa_mmap_page);
+
+      memset(obj->oa.accumulator, 0, sizeof(obj->oa.accumulator));
+      obj->oa.results_accumulated = false;
+
+      add_to_unresolved_query_list(brw, obj);
+      break;
+
    case PIPELINE_STATS:
       if (obj->pipeline_stats.bo) {
          drm_intel_bo_unreference(obj->pipeline_stats.bo);
@@ -297,6 +1023,9 @@ brw_begin_perf_query(struct gl_context *ctx,
       break;
    }
 
+   if (INTEL_DEBUG & DEBUG_PERFMON)
+      brw_dump_perf_queries(brw);
+
    return true;
 }
 
@@ -313,6 +1042,27 @@ brw_end_perf_query(struct gl_context *ctx,
    DBG("End(%d)\n", o->Id);
 
    switch(obj->query->kind) {
+   case OA_COUNTERS:
+
+      /* NB: It's possible that the query will have already been marked
+       * as 'accumulated' if an error was seen while reading samples
+       * from perf. In this case we mustn't try to emit a closing
+       * MI_RPC command in case the OA unit has already been disabled.
+       */
+      if (!obj->oa.results_accumulated) {
+         /* Take an ending OA counter snapshot. */
+         emit_mi_report_perf_count(brw, obj->oa.bo,
+                                   SECOND_SNAPSHOT_OFFSET_IN_BYTES,
+                                   obj->oa.current_report_id + 1);
+      }
+
+      --brw->perfquery.n_active_oa_queries;
+
+      /* NB: even though the query has now ended, it can't be resolved
+       * until the end MI_REPORT_PERF_COUNT snapshot has been written
+       * to query->oa.bo */
+      break;
+
    case PIPELINE_STATS:
       /* Take ending snapshots. */
       snapshot_statistics_registers(brw, obj,
@@ -332,6 +1082,10 @@ brw_wait_perf_query(struct gl_context *ctx, struct gl_perf_query_object *o)
    assert(!o->Ready);
 
    switch(obj->query->kind) {
+   case OA_COUNTERS:
+      bo = obj->oa.bo;
+      break;
+
    case PIPELINE_STATS:
       bo = obj->pipeline_stats.bo;
       break;
@@ -367,6 +1121,12 @@ brw_is_perf_query_ready(struct gl_context *ctx,
       return true;
 
    switch(obj->query->kind) {
+   case OA_COUNTERS:
+      return (obj->oa.results_accumulated ||
+              (obj->oa.bo &&
+               !drm_intel_bo_references(brw->batch.bo, obj->oa.bo) &&
+               !drm_intel_bo_busy(obj->oa.bo)));
+
    case PIPELINE_STATS:
       return (obj->pipeline_stats.bo &&
               !drm_intel_bo_references(brw->batch.bo, obj->pipeline_stats.bo) &&
@@ -378,6 +1138,34 @@ brw_is_perf_query_ready(struct gl_context *ctx,
 }
 
 static int
+get_oa_counter_data(struct brw_context *brw,
+                    struct brw_perf_query_object *obj,
+                    size_t data_size,
+                    uint8_t *data)
+{
+   const struct brw_perf_query *query = obj->query;
+   int n_counters = query->n_counters;
+   int written = 0;
+
+   if (!obj->oa.results_accumulated) {
+      accumulate_oa_snapshots(brw, obj);
+      assert(obj->oa.results_accumulated);
+   }
+
+   for (int i = 0; i < n_counters; i++) {
+      const struct brw_perf_query_counter *counter = &query->counters[i];
+
+      if (counter->size) {
+         counter->oa_counter->read(counter->oa_counter, obj->oa.accumulator,
+                                   data + counter->offset);
+         written = counter->offset + counter->size;
+      }
+   }
+
+   return written;
+}
+
+static int
 get_pipeline_stats_data(struct brw_context *brw,
                         struct brw_perf_query_object *obj,
                         size_t data_size,
@@ -420,12 +1208,18 @@ brw_get_perf_query_data(struct gl_context *ctx,
    assert(brw_is_perf_query_ready(ctx, o));
 
    DBG("GetData(%d)\n", o->Id);
-   brw_dump_perf_queries(brw);
+
+   if (INTEL_DEBUG & DEBUG_PERFMON)
+      brw_dump_perf_queries(brw);
 
    /* This hook should only be called when results are available. */
    assert(o->Ready);
 
    switch(obj->query->kind) {
+   case OA_COUNTERS:
+      written = get_oa_counter_data(brw, obj, data_size, (uint8_t *)data);
+      break;
+
    case PIPELINE_STATS:
       written = get_pipeline_stats_data(brw, obj, data_size, (uint8_t *)data);
       break;
@@ -448,9 +1242,28 @@ brw_new_perf_query_object(struct gl_context *ctx, int query_index)
 
    obj->query = query;
 
+   brw->perfquery.n_query_instances++;
+
    return &obj->base;
 }
 
+static void
+close_perf(struct brw_context *brw)
+{
+   if (brw->perfquery.perf_oa_event_fd != -1) {
+      if (brw->perfquery.perf_oa_mmap_base) {
+         size_t mapping_len =
+            brw->perfquery.perf_oa_buffer_size + brw->perfquery.page_size;
+
+         munmap(brw->perfquery.perf_oa_mmap_base, mapping_len);
+         brw->perfquery.perf_oa_mmap_base = NULL;
+      }
+
+      close(brw->perfquery.perf_oa_event_fd);
+      brw->perfquery.perf_oa_event_fd = -1;
+   }
+}
+
 /**
  * Delete a performance query object.
  */
@@ -458,6 +1271,7 @@ static void
 brw_delete_perf_query(struct gl_context *ctx,
                       struct gl_perf_query_object *o)
 {
+   struct brw_context *brw = brw_context(ctx);
    struct brw_perf_query_object *obj = brw_perf_query(o);
 
    assert(!o->Active);
@@ -466,6 +1280,20 @@ brw_delete_perf_query(struct gl_context *ctx,
    DBG("Delete(%d)\n", o->Id);
 
    switch(obj->query->kind) {
+   case OA_COUNTERS:
+      if (obj->oa.bo) {
+         if (!obj->oa.results_accumulated) {
+            drop_from_unresolved_query_list(brw, obj);
+            dec_n_oa_users(brw);
+         }
+
+         drm_intel_bo_unreference(obj->oa.bo);
+         obj->oa.bo = NULL;
+      }
+
+      obj->oa.results_accumulated = false;
+      break;
+
    case PIPELINE_STATS:
       if (obj->pipeline_stats.bo) {
          drm_intel_bo_unreference(obj->pipeline_stats.bo);
@@ -478,6 +1306,504 @@ brw_delete_perf_query(struct gl_context *ctx,
    }
 
    free(obj);
+
+   if (--brw->perfquery.n_query_instances == 0)
+      close_perf(brw);
+}
+
+/******************************************************************************/
+
+/* Type safe wrapper for reading OA counter values */
+
+static uint64_t
+read_uint64_oa_counter(struct brw_oa_counter *counter, uint64_t *accumulated)
+{
+   uint64_t value;
+
+   assert(counter->data_type == GL_PERFQUERY_COUNTER_DATA_UINT64_INTEL);
+
+   counter->read(counter, accumulated, &value);
+
+   return value;
+}
+
+/******************************************************************************/
+
+/*
+ * OA counter normalisation support...
+ */
+
+static void
+read_accumulated_oa_counter_cb(struct brw_oa_counter *counter,
+                               uint64_t *accumulator,
+                               void *value)
+{
+   *((uint64_t *)value) = accumulator[counter->accumulator_index];
+}
+
+static void
+accumulate_uint32_cb(struct brw_oa_counter *counter,
+                     uint32_t *report0,
+                     uint32_t *report1,
+                     uint64_t *accumulator)
+{
+   accumulator[counter->accumulator_index] +=
+      (uint32_t)(report1[counter->report_offset] -
+                 report0[counter->report_offset]);
+}
+
+static struct brw_oa_counter *
+add_raw_oa_counter(struct brw_query_builder *builder, int report_offset)
+{
+   struct brw_oa_counter *counter =
+      &builder->query->oa_counters[builder->query->n_oa_counters++];
+
+   counter->report_offset = report_offset;
+   counter->accumulator_index = builder->next_accumulator_index++;
+   counter->accumulate = accumulate_uint32_cb;
+   counter->read = read_accumulated_oa_counter_cb;
+   counter->data_type = GL_PERFQUERY_COUNTER_DATA_UINT64_INTEL;
+
+   return counter;
+}
+
+static uint64_t
+hsw_read_report_timestamp(uint32_t *report)
+{
+   /* The least significant timestamp bit represents 80ns on Haswell */
+   return ((uint64_t)report[1]) * 80;
+}
+
+static void
+accumulate_hsw_elapsed_cb(struct brw_oa_counter *counter,
+                          uint32_t *report0,
+                          uint32_t *report1,
+                          uint64_t *accumulator)
+{
+   uint64_t timestamp0 = hsw_read_report_timestamp(report0);
+   uint64_t timestamp1 = hsw_read_report_timestamp(report1);
+
+   accumulator[counter->accumulator_index] += (timestamp1 - timestamp0);
+}
+
+static struct brw_oa_counter *
+add_hsw_elapsed_oa_counter(struct brw_query_builder *builder)
+{
+   struct brw_oa_counter *counter =
+      &builder->query->oa_counters[builder->query->n_oa_counters++];
+
+   counter->accumulator_index = builder->next_accumulator_index++;
+   counter->accumulate = accumulate_hsw_elapsed_cb;
+   counter->read = read_accumulated_oa_counter_cb;
+   counter->data_type = GL_PERFQUERY_COUNTER_DATA_UINT64_INTEL;
+
+   return counter;
+}
+
+static void
+read_frequency_cb(struct brw_oa_counter *counter,
+                  uint64_t *accumulated,
+                  void *value) /* uint64 */
+{
+   uint64_t clk_delta = read_uint64_oa_counter(counter->reference0, accumulated);
+   uint64_t time_delta = read_uint64_oa_counter(counter->reference1, accumulated);
+   uint64_t *ret = value;
+
+   if (!clk_delta || !time_delta) {
+      *ret = 0;
+      return;
+   }
+
+   *ret = (clk_delta * 1000) / time_delta;
+}
+
+static struct brw_oa_counter *
+add_avg_frequency_oa_counter(struct brw_query_builder *builder,
+                             struct brw_oa_counter *timestamp)
+{
+   struct brw_oa_counter *counter =
+      &builder->query->oa_counters[builder->query->n_oa_counters++];
+
+   assert(timestamp->data_type == GL_PERFQUERY_COUNTER_DATA_UINT64_INTEL);
+
+   counter->reference0 = builder->gpu_core_clock;
+   counter->reference1 = timestamp;
+   counter->read = read_frequency_cb;
+   counter->data_type = GL_PERFQUERY_COUNTER_DATA_UINT64_INTEL;
+
+   return counter;
+}
+
+static void
+read_oa_counter_normalized_by_gpu_duration_cb(struct brw_oa_counter *counter,
+                                              uint64_t *accumulated,
+                                              void *value) /* float */
+{
+   uint64_t delta = read_uint64_oa_counter(counter->reference0, accumulated);
+   uint64_t clk_delta = read_uint64_oa_counter(counter->reference1, accumulated);
+   float *ret = value;
+
+   if (!clk_delta) {
+      *ret = 0;
+      return;
+   }
+
+   *ret = ((double)delta * 100.0) / (double)clk_delta;
+}
+
+static struct brw_oa_counter *
+add_oa_counter_normalised_by_gpu_duration(struct brw_query_builder *builder,
+                                          struct brw_oa_counter *raw)
+{
+   struct brw_oa_counter *counter =
+      &builder->query->oa_counters[builder->query->n_oa_counters++];
+
+   counter->reference0 = raw;
+   counter->reference1 = builder->gpu_core_clock;
+   counter->read = read_oa_counter_normalized_by_gpu_duration_cb;
+   counter->data_type = GL_PERFQUERY_COUNTER_DATA_FLOAT_INTEL;
+
+   return counter;
+}
+
+static void
+read_oa_counter_normalized_by_eu_duration_cb(struct brw_oa_counter *counter,
+                                             uint64_t *accumulated,
+                                             void *value) /* float */
+{
+   uint64_t delta = read_uint64_oa_counter(counter->reference0, accumulated);
+   uint64_t clk_delta = read_uint64_oa_counter(counter->reference1, accumulated);
+   float *ret = value;
+
+   if (!clk_delta) {
+      *ret = 0;
+      return;
+   }
+
+   delta /= counter->config; /* EU count */
+
+   *ret = (double)delta * 100.0 / (double)clk_delta;
+}
+
+static struct brw_oa_counter *
+add_oa_counter_normalised_by_eu_duration(struct brw_query_builder *builder,
+                                         struct brw_oa_counter *raw)
+{
+   struct brw_oa_counter *counter =
+      &builder->query->oa_counters[builder->query->n_oa_counters++];
+
+   counter->reference0 = raw;
+   counter->reference1 = builder->gpu_core_clock;
+   counter->config = builder->brw->perfquery.eu_count;
+   counter->read = read_oa_counter_normalized_by_eu_duration_cb;
+   counter->data_type = GL_PERFQUERY_COUNTER_DATA_FLOAT_INTEL;
+
+   return counter;
+}
+
+static void
+read_av_thread_cycles_counter_cb(struct brw_oa_counter *counter,
+                                 uint64_t *accumulated,
+                                 void *value) /* uint64 */
+{
+   uint64_t delta = read_uint64_oa_counter(counter->reference0, accumulated);
+   uint64_t spawned = read_uint64_oa_counter(counter->reference1, accumulated);
+   uint64_t *ret = value;
+
+   if (!spawned) {
+      *ret = 0;
+      return;
+   }
+
+   *ret = delta / spawned;
+}
+
+static struct brw_oa_counter *
+add_average_thread_cycles_oa_counter(struct brw_query_builder *builder,
+                                     struct brw_oa_counter *raw,
+                                     struct brw_oa_counter *denominator)
+{
+   struct brw_oa_counter *counter =
+      &builder->query->oa_counters[builder->query->n_oa_counters++];
+
+   counter->reference0 = raw;
+   counter->reference1 = denominator;
+   counter->read = read_av_thread_cycles_counter_cb;
+   counter->data_type = GL_PERFQUERY_COUNTER_DATA_UINT64_INTEL;
+
+   return counter;
+}
+
+static void
+report_uint64_oa_counter_as_raw_uint64(struct brw_query_builder *builder,
+                                       const char *name,
+                                       const char *desc,
+                                       struct brw_oa_counter *oa_counter)
+{
+   struct brw_perf_query_counter *counter =
+      &builder->query->counters[builder->query->n_counters++];
+
+   counter->oa_counter = oa_counter;
+   counter->name = name;
+   counter->desc = desc;
+   counter->type = GL_PERFQUERY_COUNTER_RAW_INTEL;
+   counter->data_type = GL_PERFQUERY_COUNTER_DATA_UINT64_INTEL;
+   counter->raw_max = 0; /* undefined range */
+   counter->offset = pot_align(builder->offset, 8);
+   counter->size = sizeof(uint64_t);
+
+   builder->offset = counter->offset + counter->size;
+}
+
+static void
+report_uint64_oa_counter_as_uint64_event(struct brw_query_builder *builder,
+                                         const char *name,
+                                         const char *desc,
+                                         struct brw_oa_counter *oa_counter)
+{
+   struct brw_perf_query_counter *counter =
+      &builder->query->counters[builder->query->n_counters++];
+
+   counter->oa_counter = oa_counter;
+   counter->name = name;
+   counter->desc = desc;
+   counter->type = GL_PERFQUERY_COUNTER_EVENT_INTEL;
+   counter->data_type = GL_PERFQUERY_COUNTER_DATA_UINT64_INTEL;
+   counter->offset = pot_align(builder->offset, 8);
+   counter->size = sizeof(uint64_t);
+
+   builder->offset = counter->offset + counter->size;
+}
+
+static void
+report_float_oa_counter_as_percentage_duration(struct brw_query_builder *builder,
+                                               const char *name,
+                                               const char *desc,
+                                               struct brw_oa_counter *oa_counter)
+{
+   struct brw_perf_query_counter *counter =
+      &builder->query->counters[builder->query->n_counters++];
+
+   counter->oa_counter = oa_counter;
+   counter->name = name;
+   counter->desc = desc;
+   counter->type = GL_PERFQUERY_COUNTER_DURATION_RAW_INTEL;
+   counter->data_type = GL_PERFQUERY_COUNTER_DATA_FLOAT_INTEL;
+   counter->raw_max = 100;
+   counter->offset = pot_align(builder->offset, 4);
+   counter->size = sizeof(float);
+
+   builder->offset = counter->offset + counter->size;
+}
+
+static void
+report_uint64_oa_counter_as_duration(struct brw_query_builder *builder,
+                                     const char *name,
+                                     const char *desc,
+                                     struct brw_oa_counter *oa_counter)
+{
+   struct brw_perf_query_counter *counter =
+      &builder->query->counters[builder->query->n_counters++];
+
+   counter->oa_counter = oa_counter;
+   counter->name = name;
+   counter->desc = desc;
+   counter->type = GL_PERFQUERY_COUNTER_DURATION_RAW_INTEL;
+   counter->data_type = GL_PERFQUERY_COUNTER_DATA_UINT64_INTEL;
+   counter->raw_max = 0;
+   counter->offset = pot_align(builder->offset, 8);
+   counter->size = sizeof(uint64_t);
+
+   builder->offset = counter->offset + counter->size;
+}
+
+static void
+add_pipeline_stage_counters(struct brw_query_builder *builder,
+                            const char *short_name,
+                            const char *long_name,
+                            int aggregate_active_counter,
+                            int aggregate_stall_counter,
+                            int n_threads_counter)
+{
+   struct brw_oa_counter *active, *stall, *n_threads, *c;
+   char *short_desc;
+   char *long_desc;
+
+
+   short_desc = ralloc_asprintf(builder->brw, "%s EU Active", short_name);
+   long_desc = ralloc_asprintf(builder->brw,
+                               "The percentage of time in which %s were "
+                               "processed actively on the EUs.", long_name);
+   active = add_raw_oa_counter(builder, aggregate_active_counter);
+   c = add_oa_counter_normalised_by_eu_duration(builder, active);
+   report_float_oa_counter_as_percentage_duration(builder, short_desc, long_desc, c);
+
+
+   short_desc = ralloc_asprintf(builder->brw, "%s EU Stall", short_name);
+   long_desc = ralloc_asprintf(builder->brw,
+                               "The percentage of time in which %s were "
+                               "stalled on the EUs.", long_name);
+   stall = add_raw_oa_counter(builder, aggregate_stall_counter);
+   c = add_oa_counter_normalised_by_eu_duration(builder, stall);
+   report_float_oa_counter_as_percentage_duration(builder, short_desc, long_desc, c);
+
+
+   n_threads = add_raw_oa_counter(builder, n_threads_counter);
+
+   short_desc = ralloc_asprintf(builder->brw, "%s AVG Active per Thread",
+                                short_name);
+   long_desc = ralloc_asprintf(builder->brw,
+                               "The average number of cycles per hardware "
+                               "thread run in which %s were processed actively "
+                               "on the EUs.", long_name);
+   c = add_average_thread_cycles_oa_counter(builder, active, n_threads);
+   report_uint64_oa_counter_as_raw_uint64(builder, short_desc, long_desc, c);
+
+
+   short_desc = ralloc_asprintf(builder->brw, "%s AVG Stall per Thread",
+                                short_name);
+   long_desc = ralloc_asprintf(builder->brw,
+                               "The average number of cycles per hardware "
+                               "thread run in which %s were stalled "
+                               "on the EUs.", long_name);
+   c = add_average_thread_cycles_oa_counter(builder, stall, n_threads);
+   report_uint64_oa_counter_as_raw_uint64(builder, short_desc, long_desc, c);
+}
+
+static void
+add_aggregate_counters(struct brw_query_builder *builder)
+{
+   struct brw_oa_counter *raw;
+   struct brw_oa_counter *c;
+   int a_offset = builder->a_offset;
+
+   raw = add_raw_oa_counter(builder, a_offset + 41);
+   c = add_oa_counter_normalised_by_gpu_duration(builder, raw);
+   report_float_oa_counter_as_percentage_duration(builder,
+                                                  "GPU Busy",
+                                                  "The percentage of time in which the GPU has been processing GPU commands.",
+                                                  c);
+
+   raw = add_raw_oa_counter(builder, a_offset); /* aggregate EU active */
+   c = add_oa_counter_normalised_by_eu_duration(builder, raw);
+   report_float_oa_counter_as_percentage_duration(builder,
+                                                   "EU Active",
+                                                   "The percentage of time in which the Execution Units were actively processing.",
+                                                   c);
+
+   raw = add_raw_oa_counter(builder, a_offset + 1); /* aggregate EU stall */
+   c = add_oa_counter_normalised_by_eu_duration(builder, raw);
+   report_float_oa_counter_as_percentage_duration(builder,
+                                                   "EU Stall",
+                                                   "The percentage of time in which the Execution Units were stalled.",
+                                                   c);
+
+   add_pipeline_stage_counters(builder,
+                               "VS",
+                               "vertex shaders",
+                               a_offset + 2, /* aggregate active */
+                               a_offset + 3, /* aggregate stall */
+                               a_offset + 5); /* n threads loaded */
+
+   /* Not currently supported by Mesa... */
+#if 0
+   add_pipeline_stage_counters(builder,
+                               "HS",
+                               "hull shaders",
+                               a_offset + 7, /* aggregate active */
+                               a_offset + 8, /* aggregate stall */
+                               a_offset + 10); /* n threads loaded */
+
+   add_pipeline_stage_counters(builder,
+                               "DS",
+                               "domain shaders",
+                               a_offset + 12, /* aggregate active */
+                               a_offset + 13, /* aggregate stall */
+                               a_offset + 15); /* n threads loaded */
+
+   add_pipeline_stage_counters(builder,
+                               "CS",
+                               "compute shaders",
+                               a_offset + 17, /* aggregate active */
+                               a_offset + 18, /* aggregate stall */
+                               a_offset + 20); /* n threads loaded */
+#endif
+
+   add_pipeline_stage_counters(builder,
+                               "GS",
+                               "geometry shaders",
+                               a_offset + 22, /* aggregate active */
+                               a_offset + 23, /* aggregate stall */
+                               a_offset + 25); /* n threads loaded */
+
+   add_pipeline_stage_counters(builder,
+                               "PS",
+                               "pixel shaders",
+                               a_offset + 27, /* aggregate active */
+                               a_offset + 28, /* aggregate stall */
+                               a_offset + 30); /* n threads loaded */
+
+   raw = add_raw_oa_counter(builder, a_offset + 32); /* hiz fast z passing */
+   raw = add_raw_oa_counter(builder, a_offset + 33); /* hiz fast z failing */
+
+   raw = add_raw_oa_counter(builder, a_offset + 42); /* vs bottleneck */
+   raw = add_raw_oa_counter(builder, a_offset + 43); /* gs bottleneck */
+}
+
+static void
+hsw_add_basic_oa_counter_query(struct brw_context *brw)
+{
+   struct brw_query_builder builder;
+   struct brw_perf_query *query =
+      &brw->perfquery.queries[brw->perfquery.n_queries++];
+   struct brw_oa_counter *elapsed;
+   struct brw_oa_counter *c;
+   struct brw_perf_query_counter *last;
+   int a_offset = 3; /* A0 */
+   int b_offset = a_offset + 45; /* B0 */
+
+   query->kind = OA_COUNTERS;
+   query->name = "Gen7 Basic Observability Architecture Counters";
+   query->counters = rzalloc_array(brw, struct brw_perf_query_counter,
+                                   MAX_PERF_QUERY_COUNTERS);
+   query->n_counters = 0;
+   query->oa_counters = rzalloc_array(brw, struct brw_oa_counter,
+                                      MAX_OA_QUERY_COUNTERS);
+   query->n_oa_counters = 0;
+   query->oa_metrics_set = 0; /* default */
+   query->oa_format = I915_OA_FORMAT_A45_B8_C8_HSW;
+
+   builder.brw = brw;
+   builder.query = query;
+   builder.offset = 0;
+   builder.next_accumulator_index = 0;
+
+   builder.a_offset = a_offset;
+   builder.b_offset = b_offset;
+   builder.c_offset = -1;
+
+   /* Can be referenced by other counters... */
+   builder.gpu_core_clock = add_raw_oa_counter(&builder, b_offset);
+
+   elapsed = add_hsw_elapsed_oa_counter(&builder);
+   report_uint64_oa_counter_as_duration(&builder,
+                                        "GPU Time Elapsed",
+                                        "Time elapsed on the GPU during the measurement.",
+                                        elapsed);
+
+   c = add_avg_frequency_oa_counter(&builder, elapsed);
+   report_uint64_oa_counter_as_uint64_event(&builder,
+                                            "AVG GPU Core Frequency",
+                                            "Average GPU Core Frequency in the measurement.",
+                                            c);
+
+   add_aggregate_counters(&builder);
+
+   assert(query->n_counters < MAX_PERF_QUERY_COUNTERS);
+   assert(query->n_oa_counters < MAX_OA_QUERY_COUNTERS);
+
+   last = &query->counters[query->n_counters - 1];
+   query->data_size = last->offset + last->size;
 }
 
 #define SCALED_NAMED_STAT(REG, NUM, DEN, NAME, DESC)        \
@@ -595,6 +1921,8 @@ brw_init_performance_queries(struct brw_context *brw)
    ctx->Driver.IsPerfQueryReady = brw_is_perf_query_ready;
    ctx->Driver.GetPerfQueryData = brw_get_perf_query_data;
 
+   brw->perfquery.eu_count = get_eu_count(brw);
+
    if (brw->gen == 6) {
       add_pipeline_statistics_query(brw, "Gen6 Pipeline Statistics Registers",
                                     gen6_pipeline_statistics,
@@ -605,7 +1933,24 @@ brw_init_performance_queries(struct brw_context *brw)
                                     gen7_pipeline_statistics,
                                     (sizeof(gen7_pipeline_statistics)/
                                      sizeof(gen7_pipeline_statistics[0])));
+
+      if (brw->is_haswell) {
+         brw->perfquery.read_oa_report_timestamp = hsw_read_report_timestamp;
+         hsw_add_basic_oa_counter_query(brw);
+      }
    }
 
    ctx->PerfQuery.NumQueries = brw->perfquery.n_queries;
+
+   brw->perfquery.unresolved =
+      ralloc_array(brw, struct brw_perf_query_object *, 2);
+   brw->perfquery.unresolved_elements = 0;
+   brw->perfquery.unresolved_array_size = 2;
+
+   brw->perfquery.page_size = sysconf(_SC_PAGE_SIZE);
+
+   brw->perfquery.perf_oa_event_fd = -1;
+   brw->perfquery.perf_oa_buffer_size = 1024 * 1024; /* NB: must be power of two */
+
+   brw->perfquery.next_query_start_report_id = 1000;
 }
-- 
2.3.2


