[Mesa-dev] [RFC 4/6] i965: Implement INTEL_performance_query extension

Mon May 11 08:28:02 PDT 2015

On 05/06/2015 02:53 AM, Robert Bragg wrote:
> This adds a bare-bones backend for the INTEL_performance_query extension
> that exposes the pipeline statistics on gen 6 and 7 hardware.
>
> Although this could be considered redundant given that the same
> statistics are now available via query objects, they are a simple
> starting point for this extension and it's expected to be convenient for
> tools wanting to have a single go to api to introspect what performance
> counters are available, along with names, descriptions and semantic/data
> types.
>
> This code is derived from Kenneth Graunke's work, temporarily removed
> while the frontend and backend interface were reworked.
>
> Signed-off-by: Robert Bragg <robert at sixbynine.org>
> ---
>   src/mesa/drivers/dri/i965/Makefile.sources        |   1 +
>   src/mesa/drivers/dri/i965/brw_context.c           |   3 +
>   src/mesa/drivers/dri/i965/brw_context.h           |  26 +
>   src/mesa/drivers/dri/i965/brw_performance_query.c | 611 ++++++++++++++++++++++
>   src/mesa/drivers/dri/i965/intel_extensions.c      |   3 +
>   5 files changed, 644 insertions(+)
>   create mode 100644 src/mesa/drivers/dri/i965/brw_performance_query.c
>
> diff --git a/src/mesa/drivers/dri/i965/Makefile.sources b/src/mesa/drivers/dri/i965/Makefile.sources
> index 210314b..066364a 100644
> --- a/src/mesa/drivers/dri/i965/Makefile.sources
> +++ b/src/mesa/drivers/dri/i965/Makefile.sources
> @@ -81,6 +81,7 @@ i965_FILES = \
>   	brw_nir_analyze_boolean_resolves.c \
>   	brw_object_purgeable.c \
>   	brw_packed_float.c \
> +	brw_performance_query.c \
>   	brw_primitive_restart.c \
>   	brw_program.c \
>   	brw_program.h \
> diff --git a/src/mesa/drivers/dri/i965/brw_context.c b/src/mesa/drivers/dri/i965/brw_context.c
> index 80a4b0a..1350bc1 100644
> --- a/src/mesa/drivers/dri/i965/brw_context.c
> +++ b/src/mesa/drivers/dri/i965/brw_context.c
> @@ -884,6 +884,9 @@ brwCreateContext(gl_api api,
>      _mesa_initialize_dispatch_tables(ctx);
>      _mesa_initialize_vbo_vtxfmt(ctx);
>   
> +   if (ctx->Extensions.INTEL_performance_query)
> +      brw_init_performance_queries(brw);
> +
>      vbo_use_buffer_objects(ctx);
>      vbo_always_unmap_buffers(ctx);
>   
> diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h
> index db65191..2cd963d 100644
> --- a/src/mesa/drivers/dri/i965/brw_context.h
> +++ b/src/mesa/drivers/dri/i965/brw_context.h
> @@ -953,6 +953,21 @@ struct brw_stage_state
>      uint32_t sampler_offset;
>   };
>   
> +enum brw_query_kind {
> +   PIPELINE_STATS
> +};
> +
> +struct brw_perf_query
> +{
> +   enum brw_query_kind kind;
> +   const char *name;
> +   struct brw_perf_query_counter *counters;
> +   int n_counters;
> +   size_t data_size;
> +};
> +
> +#define MAX_PERF_QUERIES 3
> +#define MAX_PERF_QUERY_COUNTERS 150
>   
>   /**
>    * brw_context is derived from gl_context.
> @@ -1380,6 +1395,13 @@ struct brw_context
>         bool begin_emitted;
>      } query;
>   
> +   struct {
> +      struct brw_perf_query queries[MAX_PERF_QUERIES];

Why is the number of active queries limited to 3? Is that a hardware 
limitation?

> +      int n_queries;
> +
> +      int n_active_pipeline_stats_queries;
> +   } perfquery;
> +
>      int num_atoms[BRW_NUM_PIPELINES];
>      const struct brw_tracked_state render_atoms[57];
>      const struct brw_tracked_state compute_atoms[1];
> @@ -1656,6 +1678,10 @@ bool brw_render_target_supported(struct brw_context *brw,
>                                    struct gl_renderbuffer *rb);
>   uint32_t brw_depth_format(struct brw_context *brw, mesa_format format);
>   
> +/* brw_performance_query.c */
> +void brw_init_performance_queries(struct brw_context *brw);
> +void brw_dump_perf_queries(struct brw_context *brw);
> +
>   /* intel_buffer_objects.c */
>   int brw_bo_map(struct brw_context *brw, drm_intel_bo *bo, int write_enable,
>                  const char *bo_name);
> diff --git a/src/mesa/drivers/dri/i965/brw_performance_query.c b/src/mesa/drivers/dri/i965/brw_performance_query.c
> new file mode 100644
> index 0000000..38447e8
> --- /dev/null
> +++ b/src/mesa/drivers/dri/i965/brw_performance_query.c
> @@ -0,0 +1,611 @@
> +/*
> + * Copyright © 2013 Intel Corporation
> + *
> + * Permission is hereby granted, free of charge, to any person obtaining a
> + * copy of this software and associated documentation files (the "Software"),
> + * to deal in the Software without restriction, including without limitation
> + * the rights to use, copy, modify, merge, publish, distribute, sublicense,
> + * and/or sell copies of the Software, and to permit persons to whom the
> + * Software is furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice (including the next
> + * paragraph) shall be included in all copies or substantial portions of the
> + * Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
> + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
> + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
> + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
> + * DEALINGS IN THE SOFTWARE.
> + */
> +
> +/**
> + * \file brw_performance_query.c
> + *
> + * Implementation of the GL_INTEL_performance_query extension.
> + *
> + * Currently this driver only exposes the 64bit Pipeline Statistics Registers
> + * available with Gen6 and Gen7.5, with support for Observability Counters
> + * to be added later for Gen7.5+
> + */
> +
> +#include <linux/perf_event.h>
> +
> +#include <limits.h>
> +
> +#include <asm/unistd.h>
> +#include <sys/types.h>
> +#include <sys/stat.h>
> +#include <fcntl.h>
> +#include <sys/mman.h>
> +#include <sys/ioctl.h>
> +
> +#include "main/hash.h"
> +#include "main/macros.h"
> +#include "main/mtypes.h"
> +#include "main/performance_query.h"
> +
> +#include "util/bitset.h"
> +#include "util/ralloc.h"
> +
> +#include "brw_context.h"
> +#include "brw_defines.h"
> +#include "intel_batchbuffer.h"
> +
> +#define FILE_DEBUG_FLAG DEBUG_PERFMON
> +
> +struct brw_pipeline_stat
> +{
> +   uint32_t reg;
> +   uint32_t numerator;
> +   uint32_t denominator;
> +};
> +
> +/* A counter that will be advertised and reported to applications */
> +struct brw_perf_query_counter
> +{
> +   const char *name;
> +   const char *desc;
> +   GLenum type;
> +   GLenum data_type;
> +   uint64_t raw_max;
> +   size_t offset;
> +   size_t size;
> +
> +   struct brw_pipeline_stat pipeline_stat;
> +};
> +
> +/**
> + * i965 representation of a performance query object.
> + *
> + * NB: We want to keep this structure relatively lean considering that
> + * applications may expect to allocate enough objects to be able to
> + * query around all draw calls in a frame.
> + */
> +struct brw_perf_query_object
> +{
> +   /** The base class. */
> +   struct gl_perf_query_object base;
> +
> +   const struct brw_perf_query *query;
> +
> +   struct {
> +      /**
> +       * BO containing starting and ending snapshots for the
> +       * statistics counters.
> +       */
> +      drm_intel_bo *bo;
> +
> +      /**
> +       * Storage for final pipeline statistics counter results.
> +       */
> +      uint64_t *results;
> +
> +   } pipeline_stats;
> +};
> +
> +/** Downcasting convenience macro. */
> +static inline struct brw_perf_query_object *
> +brw_perf_query(struct gl_perf_query_object *o)
> +{
> +   return (struct brw_perf_query_object *) o;
> +}
> +
> +#define SECOND_SNAPSHOT_OFFSET_IN_BYTES 2048
> +
> +/******************************************************************************/
> +
> +static GLboolean brw_is_perf_query_ready(struct gl_context *,
> +					 struct gl_perf_query_object *);
> +
> +static void
> +dump_perf_query_callback(GLuint id, void *query_void, void *brw_void)
> +{
> +   struct gl_perf_query_object *o = query_void;
> +   struct brw_perf_query_object *obj = query_void;
> +
> +   switch(obj->query->kind) {
> +   case PIPELINE_STATS:
> +      DBG("%4d: %-6s %-8s BO: %-4s\n",
> +          id,
> +          o->Used ? "Dirty," : "New,",
> +          o->Active ? "Active," : (o->Ready ? "Ready," : "Pending,"),
> +          obj->pipeline_stats.bo ? "yes" : "no");
> +      break;

Maybe you should add a call to unreachable() for the default case here.

> +   }
> +}
> +
> +void
> +brw_dump_perf_queries(struct brw_context *brw)
> +{
> +   struct gl_context *ctx = &brw->ctx;
> +   DBG("Queries: (Open queries = %d)\n",
> +       brw->perfquery.n_active_pipeline_stats_queries);
> +   _mesa_HashWalk(ctx->PerfQuery.Objects, dump_perf_query_callback, brw);
> +}
> +
> +/******************************************************************************/
> +
> +static void
> +brw_get_perf_query_info(struct gl_context *ctx,
> +                        int query_index,
> +                        const char **name,
> +                        GLuint *data_size,
> +                        GLuint *n_counters,
> +                        GLuint *n_active)
> +{
> +   struct brw_context *brw = brw_context(ctx);
> +   const struct brw_perf_query *query = &brw->perfquery.queries[query_index];

Are you sure that query_index can't index past the end of the queries array here?

> +
> +   *name = query->name;
> +   *data_size = query->data_size;
> +   *n_counters = query->n_counters;
> +
> +   switch(query->kind) {
> +   case PIPELINE_STATS:
> +      *n_active = brw->perfquery.n_active_pipeline_stats_queries;
> +      break;
> +   }
> +}
> +
> +static void
> +brw_get_perf_counter_info(struct gl_context *ctx,
> +                          int query_index,
> +                          int counter_index,
> +                          const char **name,
> +                          const char **desc,
> +                          GLuint *offset,
> +                          GLuint *data_size,
> +                          GLuint *type_enum,
> +                          GLuint *data_type_enum,
> +                          GLuint64 *raw_max)
> +{
> +   struct brw_context *brw = brw_context(ctx);
> +   const struct brw_perf_query *query = &brw->perfquery.queries[query_index];
> +   const struct brw_perf_query_counter *counter =
> +      &query->counters[counter_index];
> +
> +   *name = counter->name;
> +   *desc = counter->desc;
> +   *offset = counter->offset;
> +   *data_size = counter->size;
> +   *type_enum = counter->type;
> +   *data_type_enum = counter->data_type;
> +   *raw_max = counter->raw_max;
> +}
> +
> +/**
> + * Take a snapshot of any queried pipeline statistics counters.
> + */
> +static void
> +snapshot_statistics_registers(struct brw_context *brw,
> +                              struct brw_perf_query_object *obj,
> +                              uint32_t offset_in_bytes)
> +{
> +   const int offset = offset_in_bytes / sizeof(uint64_t);
> +   const struct brw_perf_query *query = obj->query;
> +   const int n_counters = query->n_counters;
> +
> +   intel_batchbuffer_emit_mi_flush(brw);
> +
> +   for (int i = 0; i < n_counters; i++) {
> +      const struct brw_perf_query_counter *counter = &query->counters[i];
> +
> +      assert(counter->data_type == GL_PERFQUERY_COUNTER_DATA_UINT64_INTEL);
> +
> +      brw_store_register_mem64(brw, obj->pipeline_stats.bo,
> +                               counter->pipeline_stat.reg,
> +                               offset + i);
> +   }
> +}
> +
> +/**
> + * Gather results from pipeline_stats_bo, storing the final values.
> + *
> + * This allows us to free pipeline_stats_bo (which is 4K) in favor of a much
> + * smaller array of final results.
> + */
> +static void
> +gather_statistics_results(struct brw_context *brw,
> +                          struct brw_perf_query_object *obj)
> +{
> +   const struct brw_perf_query *query = obj->query;
> +   const int n_counters = query->n_counters;
> +
> +   obj->pipeline_stats.results = calloc(n_counters, sizeof(uint64_t));
> +   if (obj->pipeline_stats.results == NULL) {
> +      _mesa_error_no_memory(__func__);
> +      return;
> +   }
> +
> +   drm_intel_bo_map(obj->pipeline_stats.bo, false);
> +   uint64_t *start = obj->pipeline_stats.bo->virtual;
> +   uint64_t *end = start + (SECOND_SNAPSHOT_OFFSET_IN_BYTES / sizeof(uint64_t));
> +
> +   for (int i = 0; i < n_counters; i++) {
> +      const struct brw_perf_query_counter *counter = &query->counters[i];
> +      obj->pipeline_stats.results[i] = end[i] - start[i];
> +
> +      if (counter->pipeline_stat.numerator !=
> +          counter->pipeline_stat.denominator) {
> +         obj->pipeline_stats.results[i] *= counter->pipeline_stat.numerator;
> +         obj->pipeline_stats.results[i] /= counter->pipeline_stat.denominator;
> +      }
> +   }
> +
> +   drm_intel_bo_unmap(obj->pipeline_stats.bo);
> +   drm_intel_bo_unreference(obj->pipeline_stats.bo);
> +   obj->pipeline_stats.bo = NULL;
> +}
> +
> +/******************************************************************************/
> +
> +/**
> + * Driver hook for glBeginPerfQueryINTEL().
> + */
> +static GLboolean
> +brw_begin_perf_query(struct gl_context *ctx,
> +                     struct gl_perf_query_object *o)
> +{
> +   struct brw_context *brw = brw_context(ctx);
> +   struct brw_perf_query_object *obj = brw_perf_query(o);
> +
> +   assert(!o->Active);
> +   assert(!o->Used || o->Ready); /* no in-flight query to worry about */
> +
> +   DBG("Begin(%d)\n", o->Id);
> +
> +   switch(obj->query->kind) {
> +   case PIPELINE_STATS:
> +      if (obj->pipeline_stats.bo) {
> +         drm_intel_bo_unreference(obj->pipeline_stats.bo);
> +         obj->pipeline_stats.bo = NULL;
> +      }
> +
> +      obj->pipeline_stats.bo =
> +         drm_intel_bo_alloc(brw->bufmgr, "perf. query stats bo", 4096, 64);
> +
> +      /* Take starting snapshots. */
> +      snapshot_statistics_registers(brw, obj, 0);
> +
> +      free(obj->pipeline_stats.results);
> +      obj->pipeline_stats.results = NULL;
> +
> +      ++brw->perfquery.n_active_pipeline_stats_queries;
> +      break;
> +   }
> +
> +   return true;
> +}
> +
> +/**
> + * Driver hook for glEndPerfQueryINTEL().
> + */
> +static void
> +brw_end_perf_query(struct gl_context *ctx,
> +                     struct gl_perf_query_object *o)
> +{
> +   struct brw_context *brw = brw_context(ctx);
> +   struct brw_perf_query_object *obj = brw_perf_query(o);
> +
> +   DBG("End(%d)\n", o->Id);
> +
> +   switch(obj->query->kind) {
> +   case PIPELINE_STATS:
> +      /* Take ending snapshots. */
> +      snapshot_statistics_registers(brw, obj,
> +                                    SECOND_SNAPSHOT_OFFSET_IN_BYTES);
> +      --brw->perfquery.n_active_pipeline_stats_queries;
> +      break;
> +   }
> +}
> +
> +static void
> +brw_wait_perf_query(struct gl_context *ctx, struct gl_perf_query_object *o)
> +{
> +   struct brw_context *brw = brw_context(ctx);
> +   struct brw_perf_query_object *obj = brw_perf_query(o);
> +   drm_intel_bo *bo = NULL;
> +
> +   assert(!o->Ready);
> +
> +   switch(obj->query->kind) {
> +   case PIPELINE_STATS:
> +      bo = obj->pipeline_stats.bo;
> +      break;
> +   }
> +
> +   if (bo == NULL)
> +      return;
> +
> +   /* If the current batch references our results bo then we need to
> +    * flush first... */
> +   if (drm_intel_bo_references(brw->batch.bo, bo))
> +      intel_batchbuffer_flush(brw);
> +
> +   if (unlikely(brw->perf_debug)) {
> +      if (drm_intel_bo_busy(bo))
> +         perf_debug("Stalling GPU waiting for a performance query object.\n");
> +   }
> +
> +   drm_intel_bo_wait_rendering(bo);
> +}
> +
> +/**
> + * Is a performance query result available?
> + */
> +static GLboolean
> +brw_is_perf_query_ready(struct gl_context *ctx,
> +                        struct gl_perf_query_object *o)
> +{
> +   struct brw_context *brw = brw_context(ctx);
> +   struct brw_perf_query_object *obj = brw_perf_query(o);
> +
> +   if (o->Ready)
> +      return true;
> +
> +   switch(obj->query->kind) {
> +   case PIPELINE_STATS:
> +      return (obj->pipeline_stats.bo &&
> +              !drm_intel_bo_references(brw->batch.bo, obj->pipeline_stats.bo) &&
> +              !drm_intel_bo_busy(obj->pipeline_stats.bo));
> +   }
> +
> +   unreachable("missing ready check for unknown query kind");
> +   return false;
> +}
> +
> +static int
> +get_pipeline_stats_data(struct brw_context *brw,
> +                        struct brw_perf_query_object *obj,
> +                        size_t data_size,
> +                        uint8_t *data)
> +
> +{
> +   int n_counters = obj->query->n_counters;
> +   uint8_t *p = data;
> +
> +   if (!obj->pipeline_stats.results) {
> +      gather_statistics_results(brw, obj);
> +
> +      /* Check if we did really get the results */
> +      if (!obj->pipeline_stats.results)
> +         return 0;
> +   }
> +
> +   for (int i = 0; i < n_counters; i++) {
> +      *((uint64_t *)p) = obj->pipeline_stats.results[i];
> +      p += 8;
> +   }
> +
> +   return p - data;
> +}
> +
> +/**
> + * Get the performance query result.
> + */
> +static void
> +brw_get_perf_query_data(struct gl_context *ctx,
> +                        struct gl_perf_query_object *o,
> +                        GLsizei data_size,
> +                        GLuint *data,
> +                        GLuint *bytes_written)
> +{
> +   struct brw_context *brw = brw_context(ctx);
> +   struct brw_perf_query_object *obj = brw_perf_query(o);
> +   int written = 0;
> +
> +   assert(brw_is_perf_query_ready(ctx, o));
> +
> +   DBG("GetData(%d)\n", o->Id);
> +   brw_dump_perf_queries(brw);
> +
> +   /* This hook should only be called when results are available. */
> +   assert(o->Ready);
> +
> +   switch(obj->query->kind) {
> +   case PIPELINE_STATS:
> +      written = get_pipeline_stats_data(brw, obj, data_size, (uint8_t *)data);

Do you check if data != NULL before calling this function?

> +      break;
> +   }
> +
> +   if (bytes_written)
> +      *bytes_written = written;
> +}
> +
> +static struct gl_perf_query_object *
> +brw_new_perf_query_object(struct gl_context *ctx, int query_index)
> +{
> +   struct brw_context *brw = brw_context(ctx);
> +   const struct brw_perf_query *query = &brw->perfquery.queries[query_index];
> +   struct brw_perf_query_object *obj =
> +      calloc(1, sizeof(struct brw_perf_query_object));
> +
> +   if (!obj)
> +      return NULL;
> +
> +   obj->query = query;
> +
> +   return &obj->base;
> +}
> +
> +/**
> + * Delete a performance query object.
> + */
> +static void
> +brw_delete_perf_query(struct gl_context *ctx,
> +                      struct gl_perf_query_object *o)
> +{
> +   struct brw_perf_query_object *obj = brw_perf_query(o);
> +
> +   assert(!o->Active);
> +   assert(!o->Used || o->Ready); /* no in-flight query to worry about */
> +
> +   DBG("Delete(%d)\n", o->Id);
> +
> +   switch(obj->query->kind) {
> +   case PIPELINE_STATS:
> +      if (obj->pipeline_stats.bo) {
> +         drm_intel_bo_unreference(obj->pipeline_stats.bo);
> +         obj->pipeline_stats.bo = NULL;
> +      }
> +
> +      free(obj->pipeline_stats.results);
> +      obj->pipeline_stats.results = NULL;
> +      break;
> +   }
> +
> +   free(obj);
> +}
> +
> +#define SCALED_NAMED_STAT(REG, NUM, DEN, NAME, DESC)        \
> +   {                                                        \
> +      .name = NAME,                                         \
> +      .desc = DESC,                                         \
> +      .type = GL_PERFQUERY_COUNTER_RAW_INTEL,               \
> +      .data_type = GL_PERFQUERY_COUNTER_DATA_UINT64_INTEL,  \
> +      .size = sizeof(uint64_t),                             \
> +      .pipeline_stat.reg = REG,                             \
> +      .pipeline_stat.numerator = NUM,                       \
> +      .pipeline_stat.denominator = DEN,                     \
> +   }
> +#define NAMED_STAT(REG, NAME, DESC)    SCALED_NAMED_STAT(REG, 1, 1, NAME, DESC)
> +#define STAT(REG, DESC)                SCALED_NAMED_STAT(REG, 1, 1, #REG, DESC)
> +#define SCALED_STAT(REG, N, D, DESC)   SCALED_NAMED_STAT(REG, N, D, #REG, DESC)
> +
> +static struct brw_perf_query_counter gen6_pipeline_statistics[] = {
> +   STAT(IA_VERTICES_COUNT,   "N vertices submitted"),
> +   STAT(IA_PRIMITIVES_COUNT, "N primitives submitted"),
> +   STAT(VS_INVOCATION_COUNT, "N vertex shader invocations"),
> +   STAT(GS_INVOCATION_COUNT, "N geometry shader invocations"),
> +   STAT(GS_PRIMITIVES_COUNT, "N geometry shader primitives emitted"),
> +   STAT(CL_INVOCATION_COUNT, "N primitives entering clipping"),
> +   STAT(CL_PRIMITIVES_COUNT, "N primitives leaving clipping"),
> +   STAT(PS_INVOCATION_COUNT, "N fragment shader invocations"),
> +   STAT(PS_DEPTH_COUNT,      "N z-pass fragments"),
> +
> +   NAMED_STAT(GEN6_SO_PRIM_STORAGE_NEEDED, "SO_PRIM_STORAGE_NEEDED",
> +              "N geometry shader stream-out primitives (total)"),
> +   NAMED_STAT(GEN6_SO_NUM_PRIMS_WRITTEN,   "SO_NUM_PRIMS_WRITTEN",
> +              "N geometry shader stream-out primitives (written)"),
> +};
> +
> +static struct brw_perf_query_counter gen7_pipeline_statistics[] = {
> +
> +   STAT(IA_VERTICES_COUNT,   "N vertices submitted"),
> +   STAT(IA_PRIMITIVES_COUNT, "N primitives submitted"),
> +   STAT(VS_INVOCATION_COUNT, "N vertex shader invocations"),
> +   STAT(HS_INVOCATION_COUNT, "N hull shader invocations"),
> +   STAT(DS_INVOCATION_COUNT, "N domain shader invocations"),
> +   STAT(GS_INVOCATION_COUNT, "N geometry shader invocations"),
> +   STAT(GS_PRIMITIVES_COUNT, "N geometry shader primitives emitted"),
> +   STAT(CL_INVOCATION_COUNT, "N primitives entering clipping"),
> +   STAT(CL_PRIMITIVES_COUNT, "N primitives leaving clipping"),
> +
> +   /* Implement the "WaDividePSInvocationCountBy4:HSW,BDW" workaround:
> +    * "Invocation counter is 4 times actual.  WA: SW to divide HW reported
> +    *  PS Invocations value by 4."
> +    *
> +    * Prior to Haswell, invocation count was counted by the WM, and it
> +    * buggily counted invocations in units of subspans (2x2 unit). To get the
> +    * correct value, the CS multiplied this by 4. With HSW the logic moved,
> +    * and correctly emitted the number of pixel shader invocations, but,
> +    * whomever forgot to undo the multiply by 4.
> +    */
> +   SCALED_STAT(PS_INVOCATION_COUNT, 1, 4, "N fragment shader invocations"),
> +
> +   STAT(PS_DEPTH_COUNT,      "N z-pass fragments"),
> +
> +   NAMED_STAT(GEN7_SO_PRIM_STORAGE_NEEDED(0), "SO_NUM_PRIMS_WRITTEN (Stream 0)",
> +              "N stream-out (stream 0) primitives (total)"),
> +   NAMED_STAT(GEN7_SO_PRIM_STORAGE_NEEDED(1), "SO_NUM_PRIMS_WRITTEN (Stream 1)",
> +              "N stream-out (stream 1) primitives (total)"),
> +   NAMED_STAT(GEN7_SO_PRIM_STORAGE_NEEDED(2), "SO_NUM_PRIMS_WRITTEN (Stream 2)",
> +              "N stream-out (stream 2) primitives (total)"),
> +   NAMED_STAT(GEN7_SO_PRIM_STORAGE_NEEDED(3), "SO_NUM_PRIMS_WRITTEN (Stream 3)",
> +              "N stream-out (stream 3) primitives (total)"),
> +   NAMED_STAT(GEN7_SO_NUM_PRIMS_WRITTEN(0), "SO_NUM_PRIMS_WRITTEN (Stream 0)",
> +              "N stream-out (stream 0) primitives (written)"),
> +   NAMED_STAT(GEN7_SO_NUM_PRIMS_WRITTEN(1), "SO_NUM_PRIMS_WRITTEN (Stream 1)",
> +              "N stream-out (stream 1) primitives (written)"),
> +   NAMED_STAT(GEN7_SO_NUM_PRIMS_WRITTEN(2), "SO_NUM_PRIMS_WRITTEN (Stream 2)",
> +              "N stream-out (stream 2) primitives (written)"),
> +   NAMED_STAT(GEN7_SO_NUM_PRIMS_WRITTEN(3), "SO_NUM_PRIMS_WRITTEN (Stream 3)",
> +              "N stream-out (stream 3) primitives (written)"),
> +};
> +
> +#undef STAT
> +#undef NAMED_STAT
> +
> +static void
> +add_pipeline_statistics_query(struct brw_context *brw,
> +                              const char *name,
> +                              struct brw_perf_query_counter *counters,
> +                              int n_counters)
> +{
> +   struct brw_perf_query *query =
> +      &brw->perfquery.queries[brw->perfquery.n_queries++];
> +
> +   query->kind = PIPELINE_STATS;
> +   query->name = name;
> +   query->data_size = sizeof(uint64_t) * n_counters;
> +   query->n_counters = n_counters;
> +   query->counters = counters;
> +
> +   for (int i = 0; i < n_counters; i++) {
> +      struct brw_perf_query_counter *counter = &counters[i];
> +      counter->offset = sizeof(uint64_t) * i;
> +   }
> +}
> +
> +void
> +brw_init_performance_queries(struct brw_context *brw)
> +{
> +   struct gl_context *ctx = &brw->ctx;
> +
> +   ctx->Driver.GetPerfQueryInfo = brw_get_perf_query_info;
> +   ctx->Driver.GetPerfCounterInfo = brw_get_perf_counter_info;
> +   ctx->Driver.NewPerfQueryObject = brw_new_perf_query_object;
> +   ctx->Driver.DeletePerfQuery = brw_delete_perf_query;
> +   ctx->Driver.BeginPerfQuery = brw_begin_perf_query;
> +   ctx->Driver.EndPerfQuery = brw_end_perf_query;
> +   ctx->Driver.WaitPerfQuery = brw_wait_perf_query;
> +   ctx->Driver.IsPerfQueryReady = brw_is_perf_query_ready;
> +   ctx->Driver.GetPerfQueryData = brw_get_perf_query_data;
> +
> +   if (brw->gen == 6) {
> +      add_pipeline_statistics_query(brw, "Gen6 Pipeline Statistics Registers",
> +                                    gen6_pipeline_statistics,
> +                                    (sizeof(gen6_pipeline_statistics)/
> +                                     sizeof(gen6_pipeline_statistics[0])));
> +   } else if (brw->gen == 7) {
> +      add_pipeline_statistics_query(brw, "Gen7 Pipeline Statistics Registers",
> +                                    gen7_pipeline_statistics,
> +                                    (sizeof(gen7_pipeline_statistics)/
> +                                     sizeof(gen7_pipeline_statistics[0])));
> +   }
> +
> +   ctx->PerfQuery.NumQueries = brw->perfquery.n_queries;
> +}
> diff --git a/src/mesa/drivers/dri/i965/intel_extensions.c b/src/mesa/drivers/dri/i965/intel_extensions.c
> index 73fb23a..53c4c8f 100644
> --- a/src/mesa/drivers/dri/i965/intel_extensions.c
> +++ b/src/mesa/drivers/dri/i965/intel_extensions.c
> @@ -264,6 +264,9 @@ intelInitExtensions(struct gl_context *ctx)
>         ctx->Extensions.ARB_stencil_texturing = true;
>      }
>   
> +   if (brw->gen >= 6 && brw->gen <= 7)
> +      ctx->Extensions.INTEL_performance_query = true;
> +
>      if (ctx->API == API_OPENGL_CORE)
>         ctx->Extensions.ARB_base_instance = true;
>      if (ctx->API != API_OPENGL_CORE)

Well, just some minor things for this one.
I won't review the rest of the series because I'm not an expert on 
performance counters for Intel GPUs. :-)