[Mesa-dev] [PATCH v2 5/7] nv50: add support for compute/graphics global performance counters

Martin Peres martin.peres at free.fr
Wed Jul 22 15:05:11 PDT 2015


On 01/07/15 01:01, Samuel Pitoiset wrote:
> This commit adds support for both compute and graphics global
> performance counters which have been reverse engineered with
> CUPTI (Linux) and PerfKit (Windows).
>
> Currently, only one query type can be monitored at a time because
> Gallium's HUD doesn't handle multiple query types very well. This will be improved later.
>
> Changes since v2:
> - replace \% by percentage
> - remove one extra call to PUSH_SPACE
> - use nouveau_fence instead of my hand-made fence mechanism
>
> Signed-off-by: Samuel Pitoiset <samuel.pitoiset at gmail.com>
> ---
>   src/gallium/drivers/nouveau/nv50/nv50_query.c  | 1066 +++++++++++++++++++++++-
>   src/gallium/drivers/nouveau/nv50/nv50_screen.h |   35 +
>   2 files changed, 1096 insertions(+), 5 deletions(-)
>
> diff --git a/src/gallium/drivers/nouveau/nv50/nv50_query.c b/src/gallium/drivers/nouveau/nv50/nv50_query.c
> index 81f7474..7fb6f3a 100644
> --- a/src/gallium/drivers/nouveau/nv50/nv50_query.c
> +++ b/src/gallium/drivers/nouveau/nv50/nv50_query.c
> @@ -27,6 +27,8 @@
>   #include "nv50/nv50_context.h"
>   #include "nv_object.xml.h"
>   
> +#include "nouveau_perfmon.h"
> +
>   #define NV50_QUERY_STATE_READY   0
>   #define NV50_QUERY_STATE_ACTIVE  1
>   #define NV50_QUERY_STATE_ENDED   2
> @@ -51,10 +53,25 @@ struct nv50_query {
>      boolean is64bit;
>      struct nouveau_mm_allocation *mm;
>      struct nouveau_fence *fence;
> +   struct nouveau_object *perfdom;
>   };
>   
>   #define NV50_QUERY_ALLOC_SPACE 256
>   
> +#ifdef DEBUG
No need to guard the definition of this function. The compiler will get 
rid of it if it has no users.
> +static void nv50_hw_pm_dump_perfdom(struct nvif_perfdom_v0 *args);
> +#endif
> +
> +static boolean
> +nv50_hw_pm_query_create(struct nv50_context *, struct nv50_query *);
> +static void
> +nv50_hw_pm_query_destroy(struct nv50_context *, struct nv50_query *);
> +static boolean
> +nv50_hw_pm_query_begin(struct nv50_context *, struct nv50_query *);
> +static void nv50_hw_pm_query_end(struct nv50_context *, struct nv50_query *);
> +static boolean nv50_hw_pm_query_result(struct nv50_context *,
> +                                    struct nv50_query *, boolean, void *);
> +
>   static INLINE struct nv50_query *
>   nv50_query(struct pipe_query *pipe)
>   {
> @@ -96,9 +113,15 @@ nv50_query_allocate(struct nv50_context *nv50, struct nv50_query *q, int size)
>   static void
>   nv50_query_destroy(struct pipe_context *pipe, struct pipe_query *pq)
>   {
> -   nv50_query_allocate(nv50_context(pipe), nv50_query(pq), 0);
> -   nouveau_fence_ref(NULL, &nv50_query(pq)->fence);
> -   FREE(nv50_query(pq));
> +   struct nv50_context *nv50 = nv50_context(pipe);
> +   struct nv50_query *q = nv50_query(pq);
> +
> +   if ((q->type >= NV50_HW_PM_QUERY(0) && q->type <= NV50_HW_PM_QUERY_LAST))
> +      nv50_hw_pm_query_destroy(nv50, q);
> +
> +   nv50_query_allocate(nv50, q, 0);
> +   nouveau_fence_ref(NULL, &q->fence);
> +   FREE(q);
>   }
>   
>   static struct pipe_query *
> @@ -120,6 +143,12 @@ nv50_query_create(struct pipe_context *pipe, unsigned type, unsigned index)
>                    type == PIPE_QUERY_PRIMITIVES_EMITTED ||
>                    type == PIPE_QUERY_SO_STATISTICS ||
>                    type == PIPE_QUERY_PIPELINE_STATISTICS);
> +   if (type >= NV50_HW_PM_QUERY(0) && type <= NV50_HW_PM_QUERY_LAST) {
> +      /* Hardware global performance counters are not 64 bits, but we also use
> +       * a fence to make sure the query is ready. */

I do not understand the logic of this comment.
> +      q->is64bit = TRUE;
> +   }
> +
>      q->type = type;
>   
>      if (q->type == PIPE_QUERY_OCCLUSION_COUNTER) {
> @@ -127,6 +156,11 @@ nv50_query_create(struct pipe_context *pipe, unsigned type, unsigned index)
>         q->data -= 32 / sizeof(*q->data); /* we advance before query_begin ! */
>      }
>   
> +   if ((q->type >= NV50_HW_PM_QUERY(0) && q->type <= NV50_HW_PM_QUERY_LAST)) {
> +      if (!nv50_hw_pm_query_create(nv50, q))
> +         return NULL;
> +   }
> +
>      return (struct pipe_query *)q;
>   }
>   
> @@ -151,6 +185,7 @@ nv50_query_begin(struct pipe_context *pipe, struct pipe_query *pq)
>      struct nv50_context *nv50 = nv50_context(pipe);
>      struct nouveau_pushbuf *push = nv50->base.pushbuf;
>      struct nv50_query *q = nv50_query(pq);
> +   boolean ret = TRUE;
>   
>      /* For occlusion queries we have to change the storage, because a previous
>       * query might set the initial render condition to FALSE even *after* we re-
> @@ -205,10 +240,13 @@ nv50_query_begin(struct pipe_context *pipe, struct pipe_query *pq)
>         nv50_query_get(push, q, 0x10, 0x00005002);
>         break;
>      default:
> +      if ((q->type >= NV50_HW_PM_QUERY(0) && q->type <= NV50_HW_PM_QUERY_LAST)) {
> +         ret = nv50_hw_pm_query_begin(nv50, q);
> +      }
>         break;
>      }
>      q->state = NV50_QUERY_STATE_ACTIVE;
> -   return true;
> +   return ret;
>   }
>   
>   static void
> @@ -265,7 +303,9 @@ nv50_query_end(struct pipe_context *pipe, struct pipe_query *pq)
>         q->state = NV50_QUERY_STATE_READY;
>         break;
>      default:
> -      assert(0);
> +      if ((q->type >= NV50_HW_PM_QUERY(0) && q->type <= NV50_HW_PM_QUERY_LAST)) {
> +         nv50_hw_pm_query_end(nv50, q);
> +      }
I get the idea, but deleting assert(0) is not acceptable. Why don't you 
move it to after your if and add a break at the end of the if block? 
This way, you preserve the old behaviour :)
>         break;
>      }
>   
> @@ -300,6 +340,10 @@ nv50_query_result(struct pipe_context *pipe, struct pipe_query *pq,
>      if (q->state != NV50_QUERY_STATE_READY)
>         nv50_query_update(q);
>   
> +   if ((q->type >= NV50_HW_PM_QUERY(0) && q->type <= NV50_HW_PM_QUERY_LAST)) {
> +      return nv50_hw_pm_query_result(nv50, q, wait, result);
> +   }
> +
>      if (q->state != NV50_QUERY_STATE_READY) {
>         if (!wait) {
>            /* for broken apps that spin on GL_QUERY_RESULT_AVAILABLE */
> @@ -476,6 +520,1018 @@ nva0_so_target_save_offset(struct pipe_context *pipe,
>      nv50_query_end(pipe, targ->pq);
>   }
>   
> +/* === HARDWARE GLOBAL PERFORMANCE COUNTERS for NV50 === */
> +
> +struct nv50_hw_pm_source_cfg
> +{
> +   const char *name;
> +   uint64_t value;
> +};
> +
> +struct nv50_hw_pm_signal_cfg
> +{
> +   const char *name;
> +   const struct nv50_hw_pm_source_cfg src[8];
> +};
> +
> +struct nv50_hw_pm_counter_cfg
> +{
> +   uint16_t logic_op;
> +   const struct nv50_hw_pm_signal_cfg sig[4];
> +};
> +
> +enum nv50_hw_pm_query_display
> +{
> +   NV50_HW_PM_EVENT_DISPLAY_RAW,
> +   NV50_HW_PM_EVENT_DISPLAY_RATIO,
> +};
> +
> +enum nv50_hw_pm_query_count
> +{
> +   NV50_HW_PM_EVENT_COUNT_SIMPLE,
> +   NV50_HW_PM_EVENT_COUNT_B4,
> +   NV50_HW_PM_EVENT_COUNT_B6,
> +};
> +
> +struct nv50_hw_pm_event_cfg
> +{
> +   const char *name;
> +   const char *desc;
> +   enum nv50_hw_pm_query_display display;
> +   enum nv50_hw_pm_query_count count;
> +   uint8_t domain;
> +};
> +
> +struct nv50_hw_pm_query_cfg
> +{
> +   const struct nv50_hw_pm_event_cfg *event;
> +   const struct nv50_hw_pm_counter_cfg ctr[4];
> +};
> +
> +#define SRC(name, val) { name, val }
> +#define SIG(name, ...) { name, { __VA_ARGS__ } }
> +#define CTR(func, ...) { func, { __VA_ARGS__ } }
> +
> +/*
> + * GPU
> + */
> +/* gpu_idle */
> +static const struct nv50_hw_pm_event_cfg
> +nv50_gpu_idle_event =
> +{
> +   .name    = "gpu_idle",
> +   .desc    = "The percentage of time the GPU is idle/busy since the last "
> +              "call. Having the GPU idle at all is a waste of valuable "
> +              "resources. You want to balance the GPU and CPU workloads so "
> +              "that no one processor is starved for work. Time management or "
> +              "using multithreading in your application can help balance CPU "
> +              "based tasks (world management, etc.) with the rendering "
> +              "pipeline.",
> +   .display = NV50_HW_PM_EVENT_DISPLAY_RATIO,
> +   .count   = NV50_HW_PM_EVENT_COUNT_SIMPLE,
> +   .domain  = 1,
> +};
> +
> +static const struct nv50_hw_pm_query_cfg
> +nv50_gpu_idle_query =
> +{
> +   .event  = &nv50_gpu_idle_event,
> +   .ctr[0] = CTR(0xaaaa, SIG("pc01_gr_idle")),
> +};
> +
> +/*
> + * INPUT ASSEMBLER
> + */
> +/* input_assembler_busy */
> +static const struct nv50_hw_pm_event_cfg
> +nv50_ia_busy_event =
> +{
> +   .name    = "input_assembler_busy",
> +   .desc    = "The percentage of time the input assembler unit is busy. This "
> +              "is mainly impacted by both the number of vertices processed as "
> +              "well as the size of the attributes on those vertices. You can "
> +              "optimize this by reducing vertex size as much as possible and "
> +              "using indexed primitives to take advantage of the vertex cache.",
> +   .display = NV50_HW_PM_EVENT_DISPLAY_RATIO,
> +   .count   = NV50_HW_PM_EVENT_COUNT_SIMPLE,
> +   .domain  = 1,
> +};
> +
> +static const struct nv50_hw_pm_query_cfg
> +nv50_ia_busy_query =
> +{
> +   .event   = &nv50_ia_busy_event,
> +   .ctr[0]  = CTR(0xf888, SIG("pc01_vfetch_18",
> +                              SRC("pgraph_vfetch_unk0c_unk0", 0x1)),
> +                          SIG("pc01_vfetch_17"),
> +                          SIG("pc01_vfetch_03"),
> +                          SIG("pc01_vfetch_02")),
> +};
> +
> +static const struct nv50_hw_pm_query_cfg
> +nva0_ia_busy_query =
> +{
> +   .event   = &nv50_ia_busy_event,
> +   .ctr[0]  = CTR(0xf888, SIG("pc01_vfetch_15",
> +                              SRC("pgraph_vfetch_unk0c_unk0", 0x1)),
> +                          SIG("pc01_vfetch_14"),
> +                          SIG("pc01_vfetch_03"),
> +                          SIG("pc01_vfetch_02")),
> +};
> +
> +/* input_assembler_waits_for_fb */
> +static const struct nv50_hw_pm_event_cfg
> +nv50_ia_waits_for_fb_event = {
> +   .name    = "input_assembler_waits_for_fb",
> +   .desc    = "This is the amount of time the input assembler unit was "
> +              "waiting for data from the frame buffer unit.",
> +   .display = NV50_HW_PM_EVENT_DISPLAY_RATIO,
> +   .count   = NV50_HW_PM_EVENT_COUNT_SIMPLE,
> +   .domain  = 1,
> +};
> +
> +static const struct nv50_hw_pm_query_cfg
> +nv50_ia_waits_for_fb_query =
> +{
> +   .event   = &nv50_ia_waits_for_fb_event,
> +   .ctr[0]  = CTR(0xaaaa, SIG("pc01_vfetch_0e",
> +                              SRC("pgraph_vfetch_unk0c_unk0", 0x1))),
> +};
> +
> +static const struct nv50_hw_pm_query_cfg
> +nva0_ia_waits_for_fb_query =
> +{
> +   .event   = &nv50_ia_waits_for_fb_event,
> +   .ctr[0]  = CTR(0xaaaa, SIG("pc01_vfetch_0b",
> +                              SRC("pgraph_vfetch_unk0c_unk0", 0x1))),
> +};
> +
> +/* vertex_attribute_count */
> +static const struct nv50_hw_pm_event_cfg
> +nv50_vertex_attr_count_event =
> +{
> +   .name    = "vertex_attribute_count",
> +   .desc    = "The number of vertex attributes that are fetched and passed to "
> +              "the geometry unit is returned in this counter. A large number "
> +              "of attributes (or unaligned vertices) can hurt vertex cache "
> +              "performance and reduce the overall vertex processing "
> +              "capabilities of the pipeline.",
> +   .display = NV50_HW_PM_EVENT_DISPLAY_RAW,
> +   .count   = NV50_HW_PM_EVENT_COUNT_SIMPLE,
> +   .domain  = 1,
> +};
> +
> +static const struct nv50_hw_pm_query_cfg
> +nv50_vertex_attr_count_query =
> +{
> +   .event = &nv50_vertex_attr_count_event,
> +   .ctr[0] = CTR(0xf888, SIG("pc01_vfetch_18",
> +                             SRC("pgraph_vfetch_unk0c_unk0", 0x1)),
> +                         SIG("pc01_vfetch_17"),
> +                         SIG("pc01_vfetch_03"),
> +                         SIG("pc01_vfetch_02")),
> +};
> +
> +static const struct nv50_hw_pm_query_cfg
> +nva0_vertex_attr_count_query =
> +{
> +   .event  = &nv50_vertex_attr_count_event,
> +   .ctr[0] = CTR(0xf888, SIG("pc01_vfetch_15",
> +                             SRC("pgraph_vfetch_unk0c_unk0", 0x1)),
> +                         SIG("pc01_vfetch_14"),
> +                         SIG("pc01_vfetch_03"),
> +                         SIG("pc01_vfetch_02")),
> +};
> +
> +/*
> + * GEOM
> + */
> +/* geom_vertex_in_count */
> +static const struct nv50_hw_pm_event_cfg
> +nv50_geom_vertex_in_count_event =
> +{
> +   .name    = "geom_vertex_in_count",
> +   .desc    = "The number of vertices input to the geom unit.",
> +   .display = NV50_HW_PM_EVENT_DISPLAY_RAW,
> +   .count   = NV50_HW_PM_EVENT_COUNT_B4,
> +   .domain  = 1,
> +};
> +
> +static const struct nv50_hw_pm_query_cfg
> +nv50_geom_vertex_in_count_query =
> +{
> +   .event  = &nv50_geom_vertex_in_count_event,
> +   .ctr[1] = CTR(0xffff, SIG("pc01_vfetch_0e",
> +                             SRC("pgraph_vfetch_unk0c_unk0", 0x0)),
> +                         SIG("pc01_vfetch_0f"),
> +                         SIG("pc01_vfetch_10"),
> +                         SIG("pc01_trailer")),
> +   .ctr[2] = CTR(0x5555, SIG("pc01_trailer"),
> +                         SIG("pc01_trailer"),
> +                         SIG("pc01_trailer"),
> +                         SIG("pc01_trailer")),
> +};
> +
> +/* geom_vertex_out_count */
> +static const struct nv50_hw_pm_event_cfg
> +nv50_geom_vertex_out_count_event =
> +{
> +   .name    = "geom_vertex_out_count",
> +   .desc    = "The number of vertices coming out of the geom unit after any "
> +              "geometry shader expansion.",
> +   .display = NV50_HW_PM_EVENT_DISPLAY_RAW,
> +   .count   = NV50_HW_PM_EVENT_COUNT_SIMPLE,
> +   .domain  = 1,
> +};
> +
> +static const struct nv50_hw_pm_query_cfg
> +nv50_geom_vertex_out_count_query =
> +{
> +   .event  = &nv50_geom_vertex_out_count_event,
> +   .ctr[0] = CTR(0xaaaa, SIG("pc01_vattr_01")),
> +};
> +
> +/* geom_primitive_in_count */
> +static const struct nv50_hw_pm_event_cfg
> +nv50_geom_primitive_in_count_event =
> +{
> +   .name    = "geom_primitive_in_count",
> +   .desc    = "The number of primitives input to the geom unit.",
> +   .display = NV50_HW_PM_EVENT_DISPLAY_RAW,
> +   .count   = NV50_HW_PM_EVENT_COUNT_SIMPLE,
> +   .domain  = 1,
> +};
> +
> +static const struct nv50_hw_pm_query_cfg
> +nv50_geom_primitive_in_count_query =
> +{
> +   .event  = &nv50_geom_primitive_in_count_event,
> +   .ctr[0] = CTR(0xaaaa, SIG("pc01_vfetch_08",
> +                             SRC("pgraph_vfetch_unk0c_unk0", 0x0))),
> +};
> +
> +/* geom_primitive_out_count */
> +static const struct nv50_hw_pm_event_cfg
> +nv50_geom_primitive_out_count_event =
> +{
> +   .name    = "geom_primitive_out_count",
> +   .desc    = "The number of primitives coming out the geom unit after any "
> +              "geometry shader expansion.",
> +   .display = NV50_HW_PM_EVENT_DISPLAY_RAW,
> +   .count   = NV50_HW_PM_EVENT_COUNT_SIMPLE,
> +   .domain  = 1,
> +};
> +
> +static const struct nv50_hw_pm_query_cfg
> +nv50_geom_primitive_out_count_query =
> +{
> +   .event  = &nv50_geom_primitive_out_count_event,
> +   .ctr[0] = CTR(0xaaaa, SIG("pc01_vattr_00")),
> +};
> +
> +/*
> + * STREAM OUT
> + */
> +/* stream_out_busy */
> +static const struct nv50_hw_pm_event_cfg
> +nv50_so_busy_event =
> +{
> +   .name    = "stream_out_busy",
> +   .desc    = "This unit manages the writing of vertices to the frame buffer "
> +              "when using stream out. If a significant number of vertices are "
> +              "written, this can become a bottleneck.",
> +   .display = NV50_HW_PM_EVENT_DISPLAY_RATIO,
> +   .count   = NV50_HW_PM_EVENT_COUNT_SIMPLE,
> +   .domain  = 1,
> +};
> +
> +static const struct nv50_hw_pm_query_cfg
> +nv50_so_busy_query =
> +{
> +   .event  = &nv50_so_busy_event,
> +   .ctr[0] = CTR(0x8888, SIG("pc01_strmout_00"),
> +                         SIG("pc01_strmout_01")),
> +};
> +
> +/*
> + * SETUP
> + */
> +/* setup_primitive_count */
> +static const struct nv50_hw_pm_event_cfg
> +nv50_setup_primitive_count_event =
> +{
> +   .name    = "setup_primitive_count",
> +   .desc    = "Returns the number of primitives processed in the geometry "
> +              "subsystem. This experiment counts points, lines and triangles. "
> +              "To count only triangles, use the setup_triangle_count counter. "
> +              "Balance these counts with the number of pixels being drawn to "
> +              "see if you could simplify your geometry and use "
> +              "bump/displacement maps, for example.",
> +   .display = NV50_HW_PM_EVENT_DISPLAY_RAW,
> +   .count   = NV50_HW_PM_EVENT_COUNT_SIMPLE,
> +   .domain  = 1,
> +};
> +
> +static const struct nv50_hw_pm_query_cfg
> +nv50_setup_primitive_count_query =
> +{
> +   .event  = &nv50_setup_primitive_count_event,
> +   .ctr[0] = CTR(0xaaaa, SIG("pc01_trast_00")),
> +};
> +
> +/* setup_point_count */
> +static const struct nv50_hw_pm_event_cfg
> +nv50_setup_point_count_event =
> +{
> +   .name    = "setup_point_count",
> +   .desc    = "The number of points seen by the primitive setup unit (just "
> +              "before rasterization).",
> +   .display = NV50_HW_PM_EVENT_DISPLAY_RAW,
> +   .count   = NV50_HW_PM_EVENT_COUNT_SIMPLE,
> +   .domain  = 1,
> +};
> +
> +static const struct nv50_hw_pm_query_cfg
> +nv50_setup_point_count_query =
> +{
> +   .event  = &nv50_setup_point_count_event,
> +   .ctr[0] = CTR(0x8080, SIG("pc01_trast_01"),
> +                         SIG("pc01_trast_04"),
> +                         SIG("pc01_trast_05")),
> +};
> +
> +/* setup_line_count */
> +static const struct nv50_hw_pm_event_cfg
> +nv50_setup_line_count_event =
> +{
> +   .name    = "setup_line_count",
> +   .desc    = "The number of lines seen by the primitive setup unit (just "
> +              "before rasterization).",
> +   .display = NV50_HW_PM_EVENT_DISPLAY_RAW,
> +   .count   = NV50_HW_PM_EVENT_COUNT_SIMPLE,
> +   .domain  = 1,
> +};
> +
> +static const struct nv50_hw_pm_query_cfg
> +nv50_setup_line_count_query =
> +{
> +   .event  = &nv50_setup_line_count_event,
> +   .ctr[0] = CTR(0x8080, SIG("pc01_trast_02"),
> +                         SIG("pc01_trast_04"),
> +                         SIG("pc01_trast_05")),
> +};
> +
> +/* setup_triangle_count */
> +static const struct nv50_hw_pm_event_cfg
> +nv50_setup_triangle_count_event =
> +{
> +   .name    = "setup_triangle_count",
> +   .desc    = "Returns the number of triangles processed in the geometry "
> +              "subsystem.",
> +   .display = NV50_HW_PM_EVENT_DISPLAY_RAW,
> +   .count   = NV50_HW_PM_EVENT_COUNT_SIMPLE,
> +   .domain  = 1,
> +};
> +
> +static const struct nv50_hw_pm_query_cfg
> +nv50_setup_triangle_count_query =
> +{
> +   .event  = &nv50_setup_triangle_count_event,
> +   .ctr[0] = CTR(0x8080, SIG("pc01_trast_03"),
> +                         SIG("pc01_trast_04"),
> +                         SIG("pc01_trast_05")),
> +};
> +
> +/* setup_primitive_culled_count */
> +static const struct nv50_hw_pm_event_cfg
> +nv50_setup_primitive_culled_count_event =
> +{
> +   .name    = "setup_primitive_culled_count",
> +   .desc    = "Returns the number of primitives culled in primitive setup. If "
> +              "you are performing viewport culling, this gives you an "
> +              "indication of the accuracy of the algorithm being used, and can "
> +              "give you an idea of whether you need to improve this culling. This "
> +              "includes primitives culled when using backface culling. Drawing "
> +              "a fully visible sphere on the screen should cull half of the "
> +              "triangles if backface culling is turned on and all the "
> +              "triangles are ordered consistently (CW or CCW).",
> +   .display = NV50_HW_PM_EVENT_DISPLAY_RAW,
> +   .count   = NV50_HW_PM_EVENT_COUNT_SIMPLE,
> +   .domain  = 1,
> +};
> +
> +static const struct nv50_hw_pm_query_cfg
> +nv50_setup_primitive_culled_count_query =
> +{
> +   .event  = &nv50_setup_primitive_culled_count_event,
> +   .ctr[0] = CTR(0xaaaa, SIG("pc01_unk00")),
> +};
> +
> +/*
> + * RASTERIZER
> + */
> +/* rast_tiles_killed_by_zcull_count */
> +static const struct nv50_hw_pm_event_cfg
> +nv50_rast_tiles_killed_by_zcull_event =
> +{
> +   .name    = "rasterizer_tiles_killed_by_zcull_count",
> +   .desc    = "The number of pixels killed by the zcull unit in the rasterizer.",
> +   .display = NV50_HW_PM_EVENT_DISPLAY_RAW,
> +   .count   = NV50_HW_PM_EVENT_COUNT_B6,
> +   .domain  = 1,
> +};
> +
> +static const struct nv50_hw_pm_query_cfg
> +nv50_rast_tiles_killed_by_zcull_query =
> +{
> +   .event  = &nv50_rast_tiles_killed_by_zcull_event,
> +   .ctr[1] = CTR(0xffff, SIG("pc01_zcull_00",
> +                             SRC("pgraph_zcull_pm_unka4_unk0", 0x7)),
> +                         SIG("pc01_zcull_01"),
> +                         SIG("pc01_zcull_02"),
> +                         SIG("pc01_zcull_03")),
> +   .ctr[2] = CTR(0x5555, SIG("pc01_trailer"),
> +                         SIG("pc01_trailer"),
> +                         SIG("pc01_zcull_04"),
> +                         SIG("pc01_zcull_05")),
> +};
> +
> +/* rast_tiles_in_count */
> +static const struct nv50_hw_pm_event_cfg
> +nv50_rast_tiles_in_count_event =
> +{
> +   .name    = "rasterizer_tiles_in_count",
> +   .desc    = "Count of tiles (each of which contain 1-8 pixels) seen by the "
> +              "rasterizer stage.",
> +   .display = NV50_HW_PM_EVENT_DISPLAY_RAW,
> +   .count   = NV50_HW_PM_EVENT_COUNT_B6,
> +   .domain  = 1,
> +};
> +
> +static const struct nv50_hw_pm_query_cfg
> +nv50_rast_tiles_in_count_query =
> +{
> +   .event  = &nv50_rast_tiles_in_count_event,
> +   .ctr[1] = CTR(0xffff, SIG("pc01_zcull_00",
> +                             SRC("pgraph_zcull_pm_unka4_unk0", 0x0)),
> +                         SIG("pc01_zcull_01"),
> +                         SIG("pc01_zcull_02"),
> +                         SIG("pc01_zcull_03")),
> +   .ctr[2] = CTR(0x5555, SIG("pc01_trailer"),
> +                         SIG("pc01_trailer"),
> +                         SIG("pc01_zcull_04"),
> +                         SIG("pc01_zcull_05")),
> +};
> +
> +/*
> + * ROP
> + */
> +/* rop_busy */
> +static const struct nv50_hw_pm_event_cfg
> +nv50_rop_busy_event =
> +{
> +   .name    = "rop_busy",
> +   .desc    = "Percentage of time that the ROP unit is actively doing work. "
> +              "This can be high if alpha blending is turned on, or overdraw "
> +              "is high, etc.",
> +   .display = NV50_HW_PM_EVENT_DISPLAY_RATIO,
> +   .count   = NV50_HW_PM_EVENT_COUNT_SIMPLE,
> +   .domain  = 2,
> +};
> +
> +static const struct nv50_hw_pm_query_cfg
> +nv50_rop_busy_query =
> +{
> +   .event  = &nv50_rop_busy_event,
> +   .ctr[0] = CTR(0xf888, SIG("pc02_prop_02",
> +                             SRC("pgraph_tpc0_prop_pm_mux_sel", 0x0)),
> +                         SIG("pc02_prop_03"),
> +                         SIG("pc02_prop_04"),
> +                         SIG("pc02_prop_05")),
> +};
> +
> +/* rop_waits_for_fb */
> +static const struct nv50_hw_pm_event_cfg
> +nv50_rop_waits_for_fb_event =
> +{
> +   .name    = "rop_waits_for_fb",
> +   .desc    = "The amount of time the blending unit spent waiting for data "
> +              "from the frame buffer unit. If blending is enabled and there "
> +              "is a lot of traffic here (since this is a read/modify/write "
> +              "operation) this can become a bottleneck.",
> +   .display = NV50_HW_PM_EVENT_DISPLAY_RATIO,
> +   .count   = NV50_HW_PM_EVENT_COUNT_SIMPLE,
> +   .domain  = 2,
> +};
> +
> +static const struct nv50_hw_pm_query_cfg
> +nv50_rop_waits_for_fb_query =
> +{
> +   .event  = &nv50_rop_waits_for_fb_event,
> +   .ctr[0] = CTR(0x22f2, SIG("pc02_crop_03",
> +                             SRC("pgraph_rop0_crop_pm_mux_sel0", 0x0)),
> +                         SIG("pc02_crop_02"),
> +                         SIG("pc02_zrop_03",
> +                             SRC("pgraph_rop0_zrop_pm_mux_sel0", 0x0)),
> +                         SIG("pc02_zrop_02")),
> +};
> +
> +/* rop_waits_for_shader */
> +static const struct nv50_hw_pm_event_cfg
> +nv50_rop_waits_for_shader_event =
> +{
> +   .name    = "rop_waits_for_shader",
> +   .desc    = "This is a measurement of how often the blending unit was "
> +              "waiting on new work (fragments to be placed into the render "
> +              "target). If the pixel shaders are particularly expensive, the "
> +              "ROP unit could be starved waiting for results.",
> +   .display = NV50_HW_PM_EVENT_DISPLAY_RATIO,
> +   .count   = NV50_HW_PM_EVENT_COUNT_SIMPLE,
> +   .domain  = 2,
> +};
> +
> +static const struct nv50_hw_pm_query_cfg
> +nv50_rop_waits_for_shader_query =
> +{
> +   .event  = &nv50_rop_waits_for_shader_event,
> +   .ctr[0] = CTR(0x2222, SIG("pc02_prop_6",
> +                             SRC("pgraph_tpc0_prop_pm_mux_sel", 0x0)),
> +                         SIG("pc02_prop_7")),
> +};
> +
> +/* rop_samples_killed_by_earlyz_count */
> +static const struct nv50_hw_pm_event_cfg
> +nv50_rop_samples_killed_by_earlyz_event =
> +{
> +   .name    = "rop_samples_killed_by_earlyz_count",
> +   .desc    = "This returns the number of pixels that were killed in the "
> +              "earlyZ hardware. This signal will give you an idea of, for "
> +              "instance, a Z only pass was successful in setting up the depth "
> +              "buffer.",
> +   .display = NV50_HW_PM_EVENT_DISPLAY_RAW,
> +   .count   = NV50_HW_PM_EVENT_COUNT_B6,
> +   .domain  = 2,
> +};
> +
> +static const struct nv50_hw_pm_query_cfg
> +nv50_rop_samples_killed_by_earlyz_query =
> +{
> +   .event  = &nv50_rop_samples_killed_by_earlyz_event,
> +   .ctr[1] = CTR(0xffff, SIG("pc02_prop_00",
> +                             SRC("pgraph_tpc0_prop_pm_mux_sel", 0x1a)),
> +                         SIG("pc02_prop_01"),
> +                         SIG("pc02_prop_02"),
> +                         SIG("pc02_prop_03")),
> +   .ctr[2] = CTR(0x5555, SIG("pc02_prop_07"),
> +                         SIG("pc02_trailer"),
> +                         SIG("pc02_prop_04"),
> +                         SIG("pc02_prop_05")),
> +};
> +
> +/* rop_samples_killed_by_latez_count */
> +static const struct nv50_hw_pm_event_cfg
> +nv50_rop_samples_killed_by_latez_event =
> +{
> +   .name    = "rop_samples_killed_by_latez_count",
> +   .desc    = "This returns the number of pixels that were killed after the "
> +              "pixel shader ran. This can happen if the early Z is unable to "
> +              "cull the pixel because of an API setup issue like changing the "
> +              "Z direction or modifying Z in the pixel shader.",
> +   .display = NV50_HW_PM_EVENT_DISPLAY_RAW,
> +   .count   = NV50_HW_PM_EVENT_COUNT_B6,
> +   .domain  = 2,
> +};
> +
> +static const struct nv50_hw_pm_query_cfg
> +nv50_rop_samples_killed_by_latez_query =
> +{
> +   .event  = &nv50_rop_samples_killed_by_latez_event,
> +   .ctr[1] = CTR(0xffff, SIG("pc02_prop_00",
> +                             SRC("pgraph_tpc0_prop_pm_mux_sel", 0x1b)),
> +                         SIG("pc02_prop_01"),
> +                         SIG("pc02_prop_02"),
> +                         SIG("pc02_prop_03")),
> +   .ctr[2] = CTR(0x5555, SIG("pc02_prop_07"),
> +                         SIG("pc02_trailer"),
> +                         SIG("pc02_prop_04"),
> +                         SIG("pc02_prop_05")),
> +};
> +
> +/*
> + * TEXTURE
> + */
> +/* tex_cache_miss */
> +static const struct nv50_hw_pm_event_cfg
> +nv50_tex_cache_miss_event =
> +{
> +   .name    = "tex_cache_miss",
> +   .desc    = "Number of texture cache misses.",
> +   .display = NV50_HW_PM_EVENT_DISPLAY_RAW,
> +   .count   = NV50_HW_PM_EVENT_COUNT_SIMPLE,
> +   .domain  = 2,
> +};
> +
> +static const struct nv50_hw_pm_query_cfg
> +nv50_tex_cache_miss_query =
> +{
> +   .event  = &nv50_tex_cache_miss_event,
> +   .ctr[0] = CTR(0xaaaa, SIG("pc02_tex_04",
> +                             SRC("pgraph_tpc0_tex_unk08_unk0", 0x200))),
> +};
> +
> +static const struct nv50_hw_pm_query_cfg
> +nv84_tex_cache_miss_query =
> +{
> +   .event  = &nv50_tex_cache_miss_event,
> +   .ctr[0] = CTR(0xaaaa, SIG("pc02_tex_04",
> +                             SRC("pgraph_tpc0_tex_unk08_unk0", 0x800))),
> +};
> +
> +/* tex_cache_hit */
> +static const struct nv50_hw_pm_event_cfg
> +nv50_tex_cache_hit_event =
> +{
> +   .name    = "tex_cache_hit",
> +   .desc    = "Number of texture cache hits.",
> +   .display = NV50_HW_PM_EVENT_DISPLAY_RAW,
> +   .count   = NV50_HW_PM_EVENT_COUNT_SIMPLE,
> +   .domain  = 2,
> +};
> +
> +static const struct nv50_hw_pm_query_cfg
> +nv50_tex_cache_hit_query =
> +{
> +   .event  = &nv50_tex_cache_hit_event,
> +   .ctr[0] = CTR(0xaaaa, SIG("pc02_tex_05",
> +                             SRC("pgraph_tpc0_tex_unk08_unk0", 0x200))),
> +};
> +
> +static const struct nv50_hw_pm_query_cfg
> +nv84_tex_cache_hit_query =
> +{
> +   .event  = &nv50_tex_cache_hit_event,
> +   .ctr[0] = CTR(0xaaaa, SIG("pc02_tex_05",
> +                             SRC("pgraph_tpc0_tex_unk08_unk0", 0x800))),
> +};
> +
> +/* tex_waits_for_fb */
> +static const struct nv50_hw_pm_event_cfg
> +nv50_tex_waits_for_fb_event =
> +{
> +   .name    = "tex_waits_for_fb",
> +   .desc    = "This is the amount of time the texture unit spent waiting on "
> +              "samples to return from the frame buffer unit. It is a potential "
> +              "indication of poor texture cache utilization.",
> +   .display = NV50_HW_PM_EVENT_DISPLAY_RATIO,
> +   .count   = NV50_HW_PM_EVENT_COUNT_SIMPLE,
> +   .domain  = 2,
> +};
> +
> +static const struct nv50_hw_pm_query_cfg
> +nv50_tex_waits_for_fb_query =
> +{
> +   .event  = &nv50_tex_waits_for_fb_event,
> +   .ctr[0] = CTR(0xaaaa, SIG("pc02_tex_06",
> +                             SRC("pgraph_tpc0_tex_unk08_unk0", 0x200))),
> +};
> +
> +static const struct nv50_hw_pm_query_cfg
> +nv84_tex_waits_for_fb_query =
> +{
> +   .event  = &nv50_tex_waits_for_fb_event,
> +   .ctr[0] = CTR(0xaaaa, SIG("pc02_tex_06",
> +                             SRC("pgraph_tpc0_tex_unk08_unk0", 0x800))),
> +};
> +
> +static const struct nv50_hw_pm_query_cfg *nv50_hw_pm_queries[NV50_HW_PM_QUERY_COUNT];
> +
> +#define _Q(n, q) nv50_hw_pm_queries[NV50_HW_PM_QUERY_##n] = &q;
> +
> +static void
> +nv50_identify_events(struct nv50_screen *screen)
> +{
> +  _Q(GPU_IDLE,                      nv50_gpu_idle_query);
> +  _Q(IA_BUSY,                       nv50_ia_busy_query);
> +  _Q(IA_WAITS_FOR_FB,               nv50_ia_waits_for_fb_query);
> +  _Q(VERTEX_ATTR_COUNT,             nv50_vertex_attr_count_query);
> +  _Q(GEOM_VERTEX_IN_COUNT,          nv50_geom_vertex_in_count_query);
> +  _Q(GEOM_VERTEX_OUT_COUNT,         nv50_geom_vertex_out_count_query);
> +  _Q(GEOM_PRIMITIVE_IN_COUNT,       nv50_geom_primitive_in_count_query);
> +  _Q(GEOM_PRIMITIVE_OUT_COUNT,      nv50_geom_primitive_out_count_query);
> +  _Q(SO_BUSY,                       nv50_so_busy_query);
> +  _Q(SETUP_PRIMITIVE_COUNT,         nv50_setup_primitive_count_query);
> +  _Q(SETUP_POINT_COUNT,             nv50_setup_point_count_query);
> +  _Q(SETUP_LINE_COUNT,              nv50_setup_line_count_query);
> +  _Q(SETUP_TRIANGLE_COUNT,          nv50_setup_triangle_count_query);
> +  _Q(SETUP_PRIMITIVE_CULLED_COUNT,  nv50_setup_primitive_culled_count_query);
> +  _Q(RAST_TILES_KILLED_BY_ZCULL,    nv50_rast_tiles_killed_by_zcull_query);
> +  _Q(RAST_TILES_IN_COUNT,           nv50_rast_tiles_in_count_query);
> +  _Q(ROP_BUSY,                      nv50_rop_busy_query);
> +  _Q(ROP_WAITS_FOR_FB,              nv50_rop_waits_for_fb_query);
> +  _Q(ROP_WAITS_FOR_SHADER,          nv50_rop_waits_for_shader_query);
> +  _Q(ROP_SAMPLES_KILLED_BY_EARLYZ,  nv50_rop_samples_killed_by_earlyz_query);
> +  _Q(ROP_SAMPLES_KILLED_BY_LATEZ,   nv50_rop_samples_killed_by_latez_query );
> +  _Q(TEX_CACHE_MISS,                nv50_tex_cache_miss_query);
> +  _Q(TEX_CACHE_HIT,                 nv50_tex_cache_hit_query);
> +  _Q(TEX_WAITS_FOR_FB,              nv50_tex_waits_for_fb_query);
> +
> +   if (screen->base.class_3d >= NV84_3D_CLASS) {
> +      /* Variants for NV84+ */
> +      _Q(TEX_CACHE_MISS,   nv84_tex_cache_miss_query);
> +      _Q(TEX_CACHE_HIT,    nv84_tex_cache_hit_query);
> +      _Q(TEX_WAITS_FOR_FB, nv84_tex_waits_for_fb_query);
> +   }
> +
> +   if (screen->base.class_3d >= NVA0_3D_CLASS) {
> +      /* Variants for NVA0+ */
> +      _Q(IA_BUSY,           nva0_ia_busy_query);
> +      _Q(IA_WAITS_FOR_FB,   nva0_ia_waits_for_fb_query);
> +      _Q(VERTEX_ATTR_COUNT, nva0_vertex_attr_count_query);
> +   }
> +}
> +
> +#undef _Q
> +
> +#ifdef DEBUG
Same as above: there is no need to guard this function with #ifdef DEBUG, so get rid of this test.
> +static void
> +nv50_hw_pm_dump_perfdom(struct nvif_perfdom_v0 *args)
> +{
> +   int i, j, k;
> +
> +   debug_printf("PERFDOM CONFIGURATION:\n");
> +   debug_printf("domaine: 0x%02x\n", args->domain);
> +   debug_printf("mode: 0x%02x\n", args->mode);
> +   for (i = 0; i < 4; i++) {
> +      uint32_t signal = 0;
> +      for (j = 0; j < 4; j++)
> +         signal |= args->ctr[i].signal[j] << (j * 8);
> +
> +      debug_printf("ctr[%d]: func = 0x%04x, signal=0x%08x\n",
> +                   i, args->ctr[i].logic_op, signal);
> +
> +      for (j = 0; j < 4; j++) {
> +         for (k = 0; k < 8; k++) {
> +            uint32_t source, value;
> +            if (!args->ctr[i].source[j][k])
> +               continue;
> +
> +            source = args->ctr[i].source[j][k];
> +            value  = args->ctr[i].source[j][k] >> 32;
> +            debug_printf("  src[%d][%d]: source = 0x%08x, value = 0x%08x\n",
> +                         j, k, source, value);
> +         }
> +      }
> +   }
> +}
> +#endif
> +
> +static const struct nv50_hw_pm_query_cfg *
> +nv50_hw_pm_query_get_cfg(struct nv50_screen *screen, uint32_t query_type)
> +{
> +   return nv50_hw_pm_queries[query_type - NV50_HW_PM_QUERY(0)];
No check that you have a valid query_type? As in, verify that
query_type >= NV50_HW_PM_QUERY(0) and query_type <= NV50_HW_PM_QUERY_LAST
(i.e. the computed index is within [0, NV50_HW_PM_QUERY_COUNT)).
> +}
> +
> +static boolean
> +nv50_hw_pm_query_create(struct nv50_context *nv50, struct nv50_query *q)
> +{
> +   struct nv50_screen *screen = nv50->screen;
> +   struct nouveau_perfmon *perfmon = screen->base.perfmon;
> +   static const struct nv50_hw_pm_query_cfg *cfg;
> +   struct nvif_perfdom_v0 args = {};
> +   struct nouveau_perfmon_dom *dom;
> +   int i, j, k;
> +   int ret;
> +
> +   if (!screen->pm.num_active) {
> +      /* TODO: Currently, only one query type can be monitored simultaneously
> +       * because the Gallium's HUD doesn't fit well with the perfdom interface.
> +       *
> +       * With two different query types, the current scenario is as follows:
> +       * CREATE Q1, BEGIN Q1, CREATE Q2, BEGIN Q2, END Q1, RESULT Q1, BEGIN Q1,
> +       * END Q2, RESULT Q2, BEGIN Q2, END Q1, and so on.
> +       *
> +       * This behaviour doesn't allow to schedule multiple counters because
> +       * we have to do that at query creation (ie. when a perfdom is created).
> +       *
> +       * To get rid of this limitation, a better scenario would be:
> +       * CREATE Q1, CREATE Q2, BEGIN Q1, BEGIN Q2, END Q1, END Q2, RESULT Q1,
> +       * RESULT Q2, BEGIN Q1, BEGIN Q2, END Q1, and so on.
> +       *
> +       * With this kind of behaviour, we could introduce
> +       * {create,begin,end}_all_queries() functions to be able to configure
> +       * all queries in one shot.
> +       */
> +      screen->pm.query_type = q->type;
> +   }
> +   screen->pm.num_active++;
> +
> +   if (screen->pm.query_type != q->type) {
> +      NOUVEAU_ERR("Only one query type can be monitored at the same time!");
> +      return FALSE;
> +   }
> +
> +   cfg = nv50_hw_pm_query_get_cfg(nv50->screen, q->type);
> +
> +   dom = nouveau_perfmon_get_dom_by_id(perfmon, cfg->event->domain);
> +   if (!dom) {
> +      NOUVEAU_ERR("Failed to find domain %d\n", cfg->event->domain);
> +      return FALSE;
> +   }
> +
> +   /* configure domain and counting mode */
> +   args.domain = dom->id;
> +   args.mode   = cfg->event->count;
> +
> +   /* configure counters for this hardware event */
> +   for (i = 0; i < ARRAY_SIZE(cfg->ctr); i++) {
> +      const struct nv50_hw_pm_counter_cfg *sctr = &cfg->ctr[i];
> +
> +      if (!sctr->logic_op)
> +         continue;
> +      args.ctr[i].logic_op = sctr->logic_op;
> +
> +      /* configure signals for this counter */
> +      for (j = 0; j < ARRAY_SIZE(sctr->sig); j++) {
> +         const struct nv50_hw_pm_signal_cfg *ssig = &sctr->sig[j];
> +         struct nouveau_perfmon_sig *sig;
> +
> +         if (!ssig->name)
> +            continue;
> +
> +         sig = nouveau_perfmon_get_sig_by_name(dom, ssig->name);
> +         if (!sig) {
> +            NOUVEAU_ERR("Failed to find signal %s\n", ssig->name);
> +            return FALSE;
> +         }
> +         args.ctr[i].signal[j] = sig->signal;
> +
> +         /* configure sources for this signal */
> +         for (k = 0; k < ARRAY_SIZE(ssig->src); k++) {
> +            const struct nv50_hw_pm_source_cfg *ssrc = &ssig->src[k];
> +            struct nouveau_perfmon_src *src;
> +
> +            if (!ssrc->name)
> +               continue;
> +
> +            src = nouveau_perfmon_get_src_by_name(sig, ssrc->name);
> +            if (!src) {
> +               NOUVEAU_ERR("Failed to find source %s\n", ssrc->name);
> +               return FALSE;
> +            }
> +            args.ctr[i].source[j][k] = (ssrc->value << 32) | src->id;
> +         }
> +      }
> +   }
> +
> +#ifdef DEBUG
> +   if (debug_get_num_option("NV50_PM_DEBUG", 0))
> +      nv50_hw_pm_dump_perfdom(&args);
> +#endif
> +
> +   ret = nouveau_object_new(perfmon->object, perfmon->handle++,
> +                            NVIF_IOCTL_NEW_V0_PERFDOM,
> +                            &args, sizeof(args), &q->perfdom);
> +   if (ret) {
> +      NOUVEAU_ERR("Failed to create perfdom object: %d\n", ret);
> +      return FALSE;
> +   }
> +
> +   return TRUE;
> +}
> +
> +static void
> +nv50_hw_pm_query_destroy(struct nv50_context *nv50, struct nv50_query *q)
> +{
> +   struct nv50_screen *screen = nv50->screen;
> +
> +   nouveau_object_del(&q->perfdom);
> +   screen->pm.num_active--;
> +}
> +
> +static boolean
> +nv50_hw_pm_query_begin(struct nv50_context *nv50, struct nv50_query *q)
> +{
> +   struct nouveau_pushbuf *push = nv50->base.pushbuf;
> +
> +   /* start the next batch of counters */
> +   PUSH_SPACE(push, 2);
> +   BEGIN_NV04(push, SUBC_SW(0x0608), 1);
Could this software method (0x0608) be defined in libdrm instead of being hardcoded here?
> +   PUSH_DATA (push, q->perfdom->handle);
> +
> +   return TRUE;
> +}
> +
> +static void
> +nv50_hw_pm_query_end(struct nv50_context *nv50, struct nv50_query *q)
> +{
> +   struct nouveau_pushbuf *push = nv50->base.pushbuf;
> +   struct nv50_screen *screen = nv50->screen;
> +
> +   /* set sequence field (used to check if result is available) */
> +   q->sequence = ++screen->pm.sequence;
> +
> +   /* sample the previous batch of counters */
> +   PUSH_SPACE(push, 4);
> +   BEGIN_NV04(push, SUBC_SW(0x060c), 1);
> +   PUSH_DATA (push, q->perfdom->handle);
> +
> +   /* read back counters values */
> +   BEGIN_NV04(push, SUBC_SW(0x0700), 1);
> +   PUSH_DATA (push, screen->pm.sequence);
> +}
> +
> +static volatile void *
> +nv50_ntfy(struct nv50_screen *screen)
> +{
> +   struct nv04_notify *query = screen->query->data;
> +   struct nouveau_bo *notify = screen->notify_bo;
> +
> +   return (char *)notify->map + query->offset;
> +}
> +
> +static INLINE uint32_t
> +nv50_hw_pm_query_get_offset(struct nv50_query *q)
> +{
> +   return (1 + (q->sequence % NV50_HW_PM_RING_BUFFER_MAX_QUERIES) *
> +           NV50_HW_PM_RING_BUFFER_NUM_DOMAINS * 6);
> +}
> +
> +static INLINE boolean
> +nv50_hw_pm_query_read_data(struct nv50_context *nv50, struct nv50_query *q,
> +                           boolean wait, uint32_t ctr[4], uint32_t *clk)
> +{
> +   volatile uint32_t *ntfy = nv50_ntfy(nv50->screen);
> +   uint32_t offset = nv50_hw_pm_query_get_offset(q);
> +   boolean found = FALSE;
> +   int i;
> +
> +   if (q->state != NV50_QUERY_STATE_READY) {
> +      if (!wait)
> +         return FALSE;
> +      if (!nouveau_fence_wait(q->fence))
> +         return FALSE;
> +   }
> +
> +   if (ntfy[0] > q->sequence + NV50_HW_PM_RING_BUFFER_MAX_QUERIES - 1) {
> +      /* Results in the ring buffer are too old, throw away that query. */
> +      return FALSE;
> +   }
> +
> +   for (i = 0; i < NV50_HW_PM_RING_BUFFER_NUM_DOMAINS; i++) {
> +      if (ntfy[offset + i * 6] == q->perfdom->handle) {
> +         found = TRUE;
> +         break;
> +      }
> +   }
> +
> +   if (!found) {
> +      NOUVEAU_ERR("Failed to find perfdom object %" PRIu64 "!\n",
> +                  q->perfdom->handle);
> +      return FALSE;
> +   }
> +
> +   for (i = 0; i < 4; i++)
> +      ctr[i] = ntfy[offset + i + 1];
> +   *clk = ntfy[offset + 5];
> +
> +   return TRUE;
> +}
> +
> +static boolean
> +nv50_hw_pm_query_result(struct nv50_context *nv50, struct nv50_query *q,
> +                        boolean wait, void *result)
> +{
> +   struct nv50_screen *screen = nv50->screen;
> +   const struct nv50_hw_pm_query_cfg *cfg;
> +   uint32_t ctr[4], clk;
> +   uint64_t value = 0;
> +   int ret;
> +
> +   ret = nv50_hw_pm_query_read_data(nv50, q, wait, ctr, &clk);
> +   if (!ret)
> +      return FALSE;
> +
> +   cfg = nv50_hw_pm_query_get_cfg(screen, q->type);
> +   if (cfg->event->count == NV50_HW_PM_EVENT_COUNT_SIMPLE) {
> +      /* SIMPLE hardware events are sampled on PRE_CTR. */
> +      value = ctr[0];
> +   } else {
> +      /* EVENT_B4/EVENT_B6 hardware events are sampled on EVENT_CTR. */
> +      value = ctr[2];
> +   }
> +
> +   if (cfg->event->display == NV50_HW_PM_EVENT_DISPLAY_RATIO) {
> +      if (clk)
> +         value = (value * 100) / (float)clk;
> +   }
> +
> +   fprintf(stderr, "ctr[0]=%d, ctr[1]=%d, ctr[2]=%d, ctr[3]=%d, clk=%d, val=%d\n",
> +           ctr[0], ctr[1], ctr[2], ctr[3], clk, value);
The fprintf above is likely a debugging leftover, right? :p (Note it also uses %d for the uint64_t value.)
> +
> +   *(uint64_t *)result = value;
> +   return TRUE;
> +}
> +
>   void
>   nv50_init_query_functions(struct nv50_context *nv50)
>   {
> diff --git a/src/gallium/drivers/nouveau/nv50/nv50_screen.h b/src/gallium/drivers/nouveau/nv50/nv50_screen.h
> index 71a5247..0449659 100644
> --- a/src/gallium/drivers/nouveau/nv50/nv50_screen.h
> +++ b/src/gallium/drivers/nouveau/nv50/nv50_screen.h
> @@ -89,6 +89,12 @@ struct nv50_screen {
>         struct nouveau_bo *bo;
>      } fence;
>   
> +   struct {
> +      uint32_t sequence;
> +      uint32_t query_type;
> +      uint32_t num_active;
> +   } pm;
> +
>      struct nouveau_object *sync;
>      struct nouveau_object *query;
>   
> @@ -108,6 +114,35 @@ nv50_screen(struct pipe_screen *screen)
>      return (struct nv50_screen *)screen;
>   }
>   
> +/* Hardware global performance counters. */
> +#define NV50_HW_PM_QUERY_COUNT  24
> +#define NV50_HW_PM_QUERY(i)    (PIPE_QUERY_DRIVER_SPECIFIC + (i))
> +#define NV50_HW_PM_QUERY_LAST   NV50_HW_PM_QUERY(NV50_HW_PM_QUERY_COUNT - 1)
> +#define NV50_HW_PM_QUERY_GPU_IDLE                            0
> +#define NV50_HW_PM_QUERY_IA_BUSY                             1
> +#define NV50_HW_PM_QUERY_IA_WAITS_FOR_FB                     2
> +#define NV50_HW_PM_QUERY_VERTEX_ATTR_COUNT                   3
> +#define NV50_HW_PM_QUERY_GEOM_VERTEX_IN_COUNT                4
> +#define NV50_HW_PM_QUERY_GEOM_VERTEX_OUT_COUNT               5
> +#define NV50_HW_PM_QUERY_GEOM_PRIMITIVE_IN_COUNT             6
> +#define NV50_HW_PM_QUERY_GEOM_PRIMITIVE_OUT_COUNT            7
> +#define NV50_HW_PM_QUERY_SO_BUSY                             8
> +#define NV50_HW_PM_QUERY_SETUP_PRIMITIVE_COUNT               9
> +#define NV50_HW_PM_QUERY_SETUP_POINT_COUNT                  10
> +#define NV50_HW_PM_QUERY_SETUP_LINE_COUNT                   11
> +#define NV50_HW_PM_QUERY_SETUP_TRIANGLE_COUNT               12
> +#define NV50_HW_PM_QUERY_SETUP_PRIMITIVE_CULLED_COUNT       13
> +#define NV50_HW_PM_QUERY_RAST_TILES_KILLED_BY_ZCULL         14
> +#define NV50_HW_PM_QUERY_RAST_TILES_IN_COUNT                15
> +#define NV50_HW_PM_QUERY_ROP_BUSY                           16
> +#define NV50_HW_PM_QUERY_ROP_WAITS_FOR_FB                   17
> +#define NV50_HW_PM_QUERY_ROP_WAITS_FOR_SHADER               18
> +#define NV50_HW_PM_QUERY_ROP_SAMPLES_KILLED_BY_EARLYZ       19
> +#define NV50_HW_PM_QUERY_ROP_SAMPLES_KILLED_BY_LATEZ        20
> +#define NV50_HW_PM_QUERY_TEX_CACHE_MISS                     21
> +#define NV50_HW_PM_QUERY_TEX_CACHE_HIT                      22
> +#define NV50_HW_PM_QUERY_TEX_WAITS_FOR_FB                   23
> +
>   boolean nv50_blitter_create(struct nv50_screen *);
>   void nv50_blitter_destroy(struct nv50_screen *);
>   
Congrats, it looks really clean! With the above fixed, this patch is
Reviewed-by: Martin Peres <martin.peres at free.fr>


More information about the mesa-dev mailing list