[Nouveau] [RFC PATCH 6/8] nv50: add support for compute/graphics global performance counters

Thu Jun 25 16:09:43 PDT 2015

What's with the \%'s everywhere?

On Mon, Jun 22, 2015 at 4:53 PM, Samuel Pitoiset
<samuel.pitoiset at gmail.com> wrote:
> This commit adds support for both compute and graphics global
> performance counters which have been reverse engineered with
> CUPTI (Linux) and PerfKit (Windows).
>
> Currently, only one query type can be monitored at a time because
> Gallium's HUD doesn't fit well with the perfdom interface. This will be
> improved later.
>
> Signed-off-by: Samuel Pitoiset <samuel.pitoiset at gmail.com>
> ---
>  src/gallium/drivers/nouveau/nv50/nv50_query.c  | 1057 +++++++++++++++++++++++-
>  src/gallium/drivers/nouveau/nv50/nv50_screen.h |   35 +
>  2 files changed, 1087 insertions(+), 5 deletions(-)
>
> diff --git a/src/gallium/drivers/nouveau/nv50/nv50_query.c b/src/gallium/drivers/nouveau/nv50/nv50_query.c
> index 1162110..b9d2914 100644
> --- a/src/gallium/drivers/nouveau/nv50/nv50_query.c
> +++ b/src/gallium/drivers/nouveau/nv50/nv50_query.c
> @@ -27,6 +27,8 @@
>  #include "nv50/nv50_context.h"
>  #include "nv_object.xml.h"
>
> +#include "nouveau_perfmon.h"
> +
>  #define NV50_QUERY_STATE_READY   0
>  #define NV50_QUERY_STATE_ACTIVE  1
>  #define NV50_QUERY_STATE_ENDED   2
> @@ -51,10 +53,25 @@ struct nv50_query {
>     boolean is64bit;
>     struct nouveau_mm_allocation *mm;
>     struct nouveau_fence *fence;
> +   struct nouveau_object *perfdom;
>  };
>
>  #define NV50_QUERY_ALLOC_SPACE 256
>
> +#ifdef DEBUG
> +static void nv50_hw_pm_dump_perfdom(struct nvif_perfdom_v0 *args);
> +#endif
> +
> +static boolean
> +nv50_hw_pm_query_create(struct nv50_context *, struct nv50_query *);
> +static void
> +nv50_hw_pm_query_destroy(struct nv50_context *, struct nv50_query *);
> +static boolean
> +nv50_hw_pm_query_begin(struct nv50_context *, struct nv50_query *);
> +static void nv50_hw_pm_query_end(struct nv50_context *, struct nv50_query *);
> +static boolean nv50_hw_pm_query_result(struct nv50_context *,
> +                                    struct nv50_query *, boolean, void *);
> +
>  static INLINE struct nv50_query *
>  nv50_query(struct pipe_query *pipe)
>  {
> @@ -96,12 +113,18 @@ nv50_query_allocate(struct nv50_context *nv50, struct nv50_query *q, int size)
>  static void
>  nv50_query_destroy(struct pipe_context *pipe, struct pipe_query *pq)
>  {
> +   struct nv50_context *nv50 = nv50_context(pipe);
> +   struct nv50_query *q = nv50_query(pq);
> +
>     if (!pq)
>        return;
>
> -   nv50_query_allocate(nv50_context(pipe), nv50_query(pq), 0);
> -   nouveau_fence_ref(NULL, &nv50_query(pq)->fence);
> -   FREE(nv50_query(pq));
> +   if ((q->type >= NV50_HW_PM_QUERY(0) && q->type <= NV50_HW_PM_QUERY_LAST))
> +      nv50_hw_pm_query_destroy(nv50, q);
> +
> +   nv50_query_allocate(nv50, q, 0);
> +   nouveau_fence_ref(NULL, &q->fence);
> +   FREE(q);
>  }
>
>  static struct pipe_query *
> @@ -130,6 +153,11 @@ nv50_query_create(struct pipe_context *pipe, unsigned type, unsigned index)
>        q->data -= 32 / sizeof(*q->data); /* we advance before query_begin ! */
>     }
>
> +   if ((q->type >= NV50_HW_PM_QUERY(0) && q->type <= NV50_HW_PM_QUERY_LAST)) {
> +      if (!nv50_hw_pm_query_create(nv50, q))
> +         return NULL;
> +   }
> +
>     return (struct pipe_query *)q;
>  }
>
> @@ -154,6 +182,7 @@ nv50_query_begin(struct pipe_context *pipe, struct pipe_query *pq)
>     struct nv50_context *nv50 = nv50_context(pipe);
>     struct nouveau_pushbuf *push = nv50->base.pushbuf;
>     struct nv50_query *q = nv50_query(pq);
> +   boolean ret = TRUE;
>
>     if (!pq)
>        return FALSE;
> @@ -211,10 +240,13 @@ nv50_query_begin(struct pipe_context *pipe, struct pipe_query *pq)
>        nv50_query_get(push, q, 0x10, 0x00005002);
>        break;
>     default:
> +      if ((q->type >= NV50_HW_PM_QUERY(0) && q->type <= NV50_HW_PM_QUERY_LAST)) {
> +         ret = nv50_hw_pm_query_begin(nv50, q);
> +      }
>        break;
>     }
>     q->state = NV50_QUERY_STATE_ACTIVE;
> -   return true;
> +   return ret;
>  }
>
>  static void
> @@ -274,7 +306,9 @@ nv50_query_end(struct pipe_context *pipe, struct pipe_query *pq)
>        q->state = NV50_QUERY_STATE_READY;
>        break;
>     default:
> -      assert(0);
> +      if ((q->type >= NV50_HW_PM_QUERY(0) && q->type <= NV50_HW_PM_QUERY_LAST)) {
> +         nv50_hw_pm_query_end(nv50, q);
> +      }
>        break;
>     }
>
> @@ -309,6 +343,10 @@ nv50_query_result(struct pipe_context *pipe, struct pipe_query *pq,
>     if (!pq)
>        return FALSE;
>
> +   if ((q->type >= NV50_HW_PM_QUERY(0) && q->type <= NV50_HW_PM_QUERY_LAST)) {
> +      return nv50_hw_pm_query_result(nv50, q, wait, result);
> +   }
> +
>     if (q->state != NV50_QUERY_STATE_READY)
>        nv50_query_update(q);
>
> @@ -488,6 +526,1015 @@ nva0_so_target_save_offset(struct pipe_context *pipe,
>     nv50_query_end(pipe, targ->pq);
>  }
>
> +/* === HARDWARE GLOBAL PERFORMANCE COUNTERS for NV50 === */
> +
> +struct nv50_hw_pm_source_cfg
> +{
> +   const char *name;
> +   uint64_t value;
> +};
> +
> +struct nv50_hw_pm_signal_cfg
> +{
> +   const char *name;
> +   const struct nv50_hw_pm_source_cfg src[8];
> +};
> +
> +struct nv50_hw_pm_counter_cfg
> +{
> +   uint16_t logic_op;
> +   const struct nv50_hw_pm_signal_cfg sig[4];
> +};
> +
> +enum nv50_hw_pm_query_display
> +{
> +   NV50_HW_PM_EVENT_DISPLAY_RAW,
> +   NV50_HW_PM_EVENT_DISPLAY_RATIO,
> +};
> +
> +enum nv50_hw_pm_query_count
> +{
> +   NV50_HW_PM_EVENT_COUNT_SIMPLE,
> +   NV50_HW_PM_EVENT_COUNT_B4,
> +   NV50_HW_PM_EVENT_COUNT_B6,
> +};
> +
> +struct nv50_hw_pm_event_cfg
> +{
> +   const char *name;
> +   const char *desc;
> +   enum nv50_hw_pm_query_display display;
> +   enum nv50_hw_pm_query_count count;
> +   uint8_t domain;
> +};
> +
> +struct nv50_hw_pm_query_cfg
> +{
> +   const struct nv50_hw_pm_event_cfg *event;
> +   const struct nv50_hw_pm_counter_cfg ctr[4];
> +};
> +
> +#define SRC(name, val) { name, val }
> +#define SIG(name, ...) { name, { __VA_ARGS__ } }
> +#define CTR(func, ...) { func, { __VA_ARGS__ } }
> +
> +/*
> + * GPU
> + */
> +/* gpu_idle */
> +static const struct nv50_hw_pm_event_cfg
> +nv50_gpu_idle_event =
> +{
> +   .name    = "gpu_idle",
> +   .desc    = "The \% of time the GPU is idle/busy since the last call. "
> +              "Having the GPU idle at all is a waste of valuable resources. "
> +              "You want to balance the GPU and CPU workloads so that no one "
> +              "processor is starved for work. Time management or using "
> +              "multithreading in your application can help balance CPU based "
> +              "tasks (world management, etc.) with the rendering pipeline.",
> +   .display = NV50_HW_PM_EVENT_DISPLAY_RATIO,
> +   .count   = NV50_HW_PM_EVENT_COUNT_SIMPLE,
> +   .domain  = 1,
> +};
> +
> +static const struct nv50_hw_pm_query_cfg
> +nv50_gpu_idle_query =
> +{
> +   .event  = &nv50_gpu_idle_event,
> +   .ctr[0] = CTR(0xaaaa, SIG("pc01_gr_idle")),
> +};
> +
> +/*
> + * INPUT ASSEMBLER
> + */
> +/* input_assembler_busy */
> +static const struct nv50_hw_pm_event_cfg
> +nv50_ia_busy_event =
> +{
> +   .name    = "input_assembler_busy",
> +   .desc    = "The \% of time the input assembler unit is busy. This is mainly "
> +              "impacted by both the number of vertices processed as well as "
> +              "the size of the attributes on those vertices. You can optimize "
> +              "this by reducing vertex size as much as possible and using "
> +              "indexed primitives to take advantage of the vertex cache.",
> +   .display = NV50_HW_PM_EVENT_DISPLAY_RATIO,
> +   .count   = NV50_HW_PM_EVENT_COUNT_SIMPLE,
> +   .domain  = 1,
> +};
> +
> +static const struct nv50_hw_pm_query_cfg
> +nv50_ia_busy_query =
> +{
> +   .event   = &nv50_ia_busy_event,
> +   .ctr[0]  = CTR(0xf888, SIG("pc01_vfetch_18",
> +                              SRC("pgraph_vfetch_unk0c_unk0", 0x1)),
> +                          SIG("pc01_vfetch_17"),
> +                          SIG("pc01_vfetch_03"),
> +                          SIG("pc01_vfetch_02")),
> +};
> +
> +static const struct nv50_hw_pm_query_cfg
> +nva0_ia_busy_query =
> +{
> +   .event   = &nv50_ia_busy_event,
> +   .ctr[0]  = CTR(0xf888, SIG("pc01_vfetch_15",
> +                              SRC("pgraph_vfetch_unk0c_unk0", 0x1)),
> +                          SIG("pc01_vfetch_14"),
> +                          SIG("pc01_vfetch_03"),
> +                          SIG("pc01_vfetch_02")),
> +};
> +
> +/* input_assembler_waits_for_fb */
> +static const struct nv50_hw_pm_event_cfg
> +nv50_ia_waits_for_fb_event = {
> +   .name    = "input_assembler_waits_for_fb",
> +   .desc    = "This is the amount of time the input assembler unit was "
> +              "waiting for data from the frame buffer unit.",
> +   .display = NV50_HW_PM_EVENT_DISPLAY_RATIO,
> +   .count   = NV50_HW_PM_EVENT_COUNT_SIMPLE,
> +   .domain  = 1,
> +};
> +
> +static const struct nv50_hw_pm_query_cfg
> +nv50_ia_waits_for_fb_query =
> +{
> +   .event   = &nv50_ia_waits_for_fb_event,
> +   .ctr[0]  = CTR(0xaaaa, SIG("pc01_vfetch_0e",
> +                              SRC("pgraph_vfetch_unk0c_unk0", 0x1))),
> +};
> +
> +static const struct nv50_hw_pm_query_cfg
> +nva0_ia_waits_for_fb_query =
> +{
> +   .event   = &nv50_ia_waits_for_fb_event,
> +   .ctr[0]  = CTR(0xaaaa, SIG("pc01_vfetch_0b",
> +                              SRC("pgraph_vfetch_unk0c_unk0", 0x1))),
> +};
> +
> +/* vertex_attribute_count */
> +static const struct nv50_hw_pm_event_cfg
> +nv50_vertex_attr_count_event =
> +{
> +   .name    = "vertex_attribute_count",
> +   .desc    = "The number of vertex attributes that are fetched and passed to "
> +              "the geometry unit is returned in this counter. A large number "
> +              "of attributes (or unaligned vertices) can hurt vertex cache "
> +              "performance and reduce the overall vertex processing "
> +              "capabilities of the pipeline.",
> +   .display = NV50_HW_PM_EVENT_DISPLAY_RAW,
> +   .count   = NV50_HW_PM_EVENT_COUNT_SIMPLE,
> +   .domain  = 1,
> +};
> +
> +static const struct nv50_hw_pm_query_cfg
> +nv50_vertex_attr_count_query =
> +{
> +   .event = &nv50_vertex_attr_count_event,
> +   .ctr[0] = CTR(0xf888, SIG("pc01_vfetch_18",
> +                             SRC("pgraph_vfetch_unk0c_unk0", 0x1)),
> +                         SIG("pc01_vfetch_17"),
> +                         SIG("pc01_vfetch_03"),
> +                         SIG("pc01_vfetch_02")),
> +};
> +
> +static const struct nv50_hw_pm_query_cfg
> +nva0_vertex_attr_count_query =
> +{
> +   .event  = &nv50_vertex_attr_count_event,
> +   .ctr[0] = CTR(0xf888, SIG("pc01_vfetch_15",
> +                             SRC("pgraph_vfetch_unk0c_unk0", 0x1)),
> +                         SIG("pc01_vfetch_14"),
> +                         SIG("pc01_vfetch_03"),
> +                         SIG("pc01_vfetch_02")),
> +};
> +
> +/*
> + * GEOM
> + */
> +/* geom_vertex_in_count */
> +static const struct nv50_hw_pm_event_cfg
> +nv50_geom_vertex_in_count_event =
> +{
> +   .name    = "geom_vertex_in_count",
> +   .desc    = "The number of vertices input to the geom unit.",
> +   .display = NV50_HW_PM_EVENT_DISPLAY_RAW,
> +   .count   = NV50_HW_PM_EVENT_COUNT_B4,
> +   .domain  = 1,
> +};
> +
> +static const struct nv50_hw_pm_query_cfg
> +nv50_geom_vertex_in_count_query =
> +{
> +   .event  = &nv50_geom_vertex_in_count_event,
> +   .ctr[1] = CTR(0xffff, SIG("pc01_vfetch_0e",
> +                             SRC("pgraph_vfetch_unk0c_unk0", 0x0)),
> +                         SIG("pc01_vfetch_0f"),
> +                         SIG("pc01_vfetch_10"),
> +                         SIG("pc01_trailer")),
> +   .ctr[2] = CTR(0x5555, SIG("pc01_trailer"),
> +                         SIG("pc01_trailer"),
> +                         SIG("pc01_trailer"),
> +                         SIG("pc01_trailer")),
> +};
> +
> +/* geom_vertex_out_count */
> +static const struct nv50_hw_pm_event_cfg
> +nv50_geom_vertex_out_count_event =
> +{
> +   .name    = "geom_vertex_out_count",
> +   .desc    = "The number of vertices coming out of the geom unit after any "
> +              "geometry shader expansion.",
> +   .display = NV50_HW_PM_EVENT_DISPLAY_RAW,
> +   .count   = NV50_HW_PM_EVENT_COUNT_SIMPLE,
> +   .domain  = 1,
> +};
> +
> +static const struct nv50_hw_pm_query_cfg
> +nv50_geom_vertex_out_count_query =
> +{
> +   .event  = &nv50_geom_vertex_out_count_event,
> +   .ctr[0] = CTR(0xaaaa, SIG("pc01_vattr_01")),
> +};
> +
> +/* geom_primitive_in_count */
> +static const struct nv50_hw_pm_event_cfg
> +nv50_geom_primitive_in_count_event =
> +{
> +   .name    = "geom_primitive_in_count",
> +   .desc    = "The number of primitives input to the geom unit.",
> +   .display = NV50_HW_PM_EVENT_DISPLAY_RAW,
> +   .count   = NV50_HW_PM_EVENT_COUNT_SIMPLE,
> +   .domain  = 1,
> +};
> +
> +static const struct nv50_hw_pm_query_cfg
> +nv50_geom_primitive_in_count_query =
> +{
> +   .event  = &nv50_geom_primitive_in_count_event,
> +   .ctr[0] = CTR(0xaaaa, SIG("pc01_vfetch_08",
> +                             SRC("pgraph_vfetch_unk0c_unk0", 0x0))),
> +};
> +
> +/* geom_primitive_out_count */
> +static const struct nv50_hw_pm_event_cfg
> +nv50_geom_primitive_out_count_event =
> +{
> +   .name    = "geom_primitive_out_count",
> +   .desc    = "The number of primitives coming out the geom unit after any "
> +              "geometry shader expansion.",
> +   .display = NV50_HW_PM_EVENT_DISPLAY_RAW,
> +   .count   = NV50_HW_PM_EVENT_COUNT_SIMPLE,
> +   .domain  = 1,
> +};
> +
> +static const struct nv50_hw_pm_query_cfg
> +nv50_geom_primitive_out_count_query =
> +{
> +   .event  = &nv50_geom_primitive_out_count_event,
> +   .ctr[0] = CTR(0xaaaa, SIG("pc01_vattr_00")),
> +};
> +
> +/*
> + * STREAM OUT
> + */
> +/* stream_out_busy */
> +static const struct nv50_hw_pm_event_cfg
> +nv50_so_busy_event =
> +{
> +   .name    = "stream_out_busy",
> +   .desc    = "This unit manages the writing of vertices to the frame buffer "
> +              "when using stream out. If a significant number of vertices are "
> +              "written, this can become a bottleneck.",
> +   .display = NV50_HW_PM_EVENT_DISPLAY_RATIO,
> +   .count   = NV50_HW_PM_EVENT_COUNT_SIMPLE,
> +   .domain  = 1,
> +};
> +
> +static const struct nv50_hw_pm_query_cfg
> +nv50_so_busy_query =
> +{
> +   .event  = &nv50_so_busy_event,
> +   .ctr[0] = CTR(0x8888, SIG("pc01_strmout_00"),
> +                         SIG("pc01_strmout_01")),
> +};
> +
> +/*
> + * SETUP
> + */
> +/* setup_primitive_count */
> +static const struct nv50_hw_pm_event_cfg
> +nv50_setup_primitive_count_event =
> +{
> +   .name    = "setup_primitive_count",
> +   .desc    = "Returns the number of primitives processed in the geometry "
> +              "subsystem. This experiments counts points, lines and triangles. "
> +              "To count only triangles, use the setup_triangle_count counter. "
> +              "Balance these counts with the number of pixels being drawn to "
> +              "see if you could simplify your geometry and use "
> +              "bump/displacement maps, for example.",
> +   .display = NV50_HW_PM_EVENT_DISPLAY_RAW,
> +   .count   = NV50_HW_PM_EVENT_COUNT_SIMPLE,
> +   .domain  = 1,
> +};
> +
> +static const struct nv50_hw_pm_query_cfg
> +nv50_setup_primitive_count_query =
> +{
> +   .event  = &nv50_setup_primitive_count_event,
> +   .ctr[0] = CTR(0xaaaa, SIG("pc01_trast_00")),
> +};
> +
> +/* setup_point_count */
> +static const struct nv50_hw_pm_event_cfg
> +nv50_setup_point_count_event =
> +{
> +   .name    = "setup_point_count",
> +   .desc    = "The number of points seen by the primitive setup unit (just "
> +              "before rasterization).",
> +   .display = NV50_HW_PM_EVENT_DISPLAY_RAW,
> +   .count   = NV50_HW_PM_EVENT_COUNT_SIMPLE,
> +   .domain  = 1,
> +};
> +
> +static const struct nv50_hw_pm_query_cfg
> +nv50_setup_point_count_query =
> +{
> +   .event  = &nv50_setup_point_count_event,
> +   .ctr[0] = CTR(0x8080, SIG("pc01_trast_01"),
> +                         SIG("pc01_trast_04"),
> +                         SIG("pc01_trast_05")),
> +};
> +
> +/* setup_line_count */
> +static const struct nv50_hw_pm_event_cfg
> +nv50_setup_line_count_event =
> +{
> +   .name    = "setup_line_count",
> +   .desc    = "The number of lines seen by the primitive setup unit (just "
> +              "before rasterization).",
> +   .display = NV50_HW_PM_EVENT_DISPLAY_RAW,
> +   .count   = NV50_HW_PM_EVENT_COUNT_SIMPLE,
> +   .domain  = 1,
> +};
> +
> +static const struct nv50_hw_pm_query_cfg
> +nv50_setup_line_count_query =
> +{
> +   .event  = &nv50_setup_line_count_event,
> +   .ctr[0] = CTR(0x8080, SIG("pc01_trast_02"),
> +                         SIG("pc01_trast_04"),
> +                         SIG("pc01_trast_05")),
> +};
> +
> +/* setup_triangle_count */
> +static const struct nv50_hw_pm_event_cfg
> +nv50_setup_triangle_count_event =
> +{
> +   .name    = "setup_triangle_count",
> +   .desc    = "Returns the number of triangles processed in the geometry "
> +              "subsystem.",
> +   .display = NV50_HW_PM_EVENT_DISPLAY_RAW,
> +   .count   = NV50_HW_PM_EVENT_COUNT_SIMPLE,
> +   .domain  = 1,
> +};
> +
> +static const struct nv50_hw_pm_query_cfg
> +nv50_setup_triangle_count_query =
> +{
> +   .event  = &nv50_setup_triangle_count_event,
> +   .ctr[0] = CTR(0x8080, SIG("pc01_trast_03"),
> +                         SIG("pc01_trast_04"),
> +                         SIG("pc01_trast_05")),
> +};
> +
> +/* setup_primitive_culled_count */
> +static const struct nv50_hw_pm_event_cfg
> +nv50_setup_primitive_culled_count_event =
> +{
> +   .name    = "setup_primitive_culled_count",
> +   .desc    = "Returns the number of primitives culled in primitive setup. If "
> +              "you are performing viewport culling, this gives you an "
> +              "indication of the accuracy of the algorithm being used, and can "
> +              "give you and idea if you need to improves this culling. This "
> +              "includes primitives culled when using backface culling. Drawing "
> +              "a fully visible sphere on the screen should cull half of the "
> +              "triangles if backface culling is turned on and all the "
> +              "triangles are ordered consistently (CW or CCW).",
> +   .display = NV50_HW_PM_EVENT_DISPLAY_RAW,
> +   .count   = NV50_HW_PM_EVENT_COUNT_SIMPLE,
> +   .domain  = 1,
> +};
> +
> +static const struct nv50_hw_pm_query_cfg
> +nv50_setup_primitive_culled_count_query =
> +{
> +   .event  = &nv50_setup_primitive_culled_count_event,
> +   .ctr[0] = CTR(0xaaaa, SIG("pc01_unk00")),
> +};
> +
> +/*
> + * RASTERIZER
> + */
> +/* rast_tiles_killed_by_zcull_count */
> +static const struct nv50_hw_pm_event_cfg
> +nv50_rast_tiles_killed_by_zcull_event =
> +{
> +   .name    = "rasterizer_tiles_killed_by_zcull_count",
> +   .desc    = "The number of pixels killed by the zcull unit in the rasterizer.",
> +   .display = NV50_HW_PM_EVENT_DISPLAY_RAW,
> +   .count   = NV50_HW_PM_EVENT_COUNT_B6,
> +   .domain  = 1,
> +};
> +
> +static const struct nv50_hw_pm_query_cfg
> +nv50_rast_tiles_killed_by_zcull_query =
> +{
> +   .event  = &nv50_rast_tiles_killed_by_zcull_event,
> +   .ctr[1] = CTR(0xffff, SIG("pc01_zcull_00",
> +                             SRC("pgraph_zcull_pm_unka4_unk0", 0x7)),
> +                         SIG("pc01_zcull_01"),
> +                         SIG("pc01_zcull_02"),
> +                         SIG("pc01_zcull_03")),
> +   .ctr[2] = CTR(0x5555, SIG("pc01_trailer"),
> +                         SIG("pc01_trailer"),
> +                         SIG("pc01_zcull_04"),
> +                         SIG("pc01_zcull_05")),
> +};
> +
> +/* rast_tiles_in_count */
> +static const struct nv50_hw_pm_event_cfg
> +nv50_rast_tiles_in_count_event =
> +{
> +   .name    = "rasterizer_tiles_in_count",
> +   .desc    = "Count of tiles (each of which contain 1-8 pixels) seen by the "
> +              "rasterizer stage.",
> +   .display = NV50_HW_PM_EVENT_DISPLAY_RAW,
> +   .count   = NV50_HW_PM_EVENT_COUNT_B6,
> +   .domain  = 1,
> +};
> +
> +static const struct nv50_hw_pm_query_cfg
> +nv50_rast_tiles_in_count_query =
> +{
> +   .event  = &nv50_rast_tiles_in_count_event,
> +   .ctr[1] = CTR(0xffff, SIG("pc01_zcull_00",
> +                             SRC("pgraph_zcull_pm_unka4_unk0", 0x0)),
> +                         SIG("pc01_zcull_01"),
> +                         SIG("pc01_zcull_02"),
> +                         SIG("pc01_zcull_03")),
> +   .ctr[2] = CTR(0x5555, SIG("pc01_trailer"),
> +                         SIG("pc01_trailer"),
> +                         SIG("pc01_zcull_04"),
> +                         SIG("pc01_zcull_05")),
> +};
> +
> +/*
> + * ROP
> + */
> +/* rop_busy */
> +static const struct nv50_hw_pm_event_cfg
> +nv50_rop_busy_event =
> +{
> +   .name    = "rop_busy",
> +   .desc    = "\% of time that the ROP unit is actively doing work. "
> +              "This can be high if alpha blending is turned on, of overdraw "
> +              "is high, etc.",
> +   .display = NV50_HW_PM_EVENT_DISPLAY_RATIO,
> +   .count   = NV50_HW_PM_EVENT_COUNT_SIMPLE,
> +   .domain  = 2,
> +};
> +
> +static const struct nv50_hw_pm_query_cfg
> +nv50_rop_busy_query =
> +{
> +   .event  = &nv50_rop_busy_event,
> +   .ctr[0] = CTR(0xf888, SIG("pc02_prop_02",
> +                             SRC("pgraph_tpc0_prop_pm_mux_sel", 0x0)),
> +                         SIG("pc02_prop_03"),
> +                         SIG("pc02_prop_04"),
> +                         SIG("pc02_prop_05")),
> +};
> +
> +/* rop_waits_for_fb */
> +static const struct nv50_hw_pm_event_cfg
> +nv50_rop_waits_for_fb_event =
> +{
> +   .name    = "rop_waits_for_fb",
> +   .desc    = "The amount of time the blending unit spent waiting for data "
> +              "from the frame buffer unit. If blending is enabled and there "
> +              "is a lot of traffic here (since this is a read/modify/write "
> +              "operation) this can become a bottleneck.",
> +   .display = NV50_HW_PM_EVENT_DISPLAY_RATIO,
> +   .count   = NV50_HW_PM_EVENT_COUNT_SIMPLE,
> +   .domain  = 2,
> +};
> +
> +static const struct nv50_hw_pm_query_cfg
> +nv50_rop_waits_for_fb_query =
> +{
> +   .event  = &nv50_rop_waits_for_fb_event,
> +   .ctr[0] = CTR(0x22f2, SIG("pc02_crop_03",
> +                             SRC("pgraph_rop0_crop_pm_mux_sel0", 0x0)),
> +                         SIG("pc02_crop_02"),
> +                         SIG("pc02_zrop_03",
> +                             SRC("pgraph_rop0_zrop_pm_mux_sel0", 0x0)),
> +                         SIG("pc02_zrop_02")),
> +};
> +
> +/* rop_waits_for_shader */
> +static const struct nv50_hw_pm_event_cfg
> +nv50_rop_waits_for_shader_event =
> +{
> +   .name    = "rop_waits_for_shader",
> +   .desc    = "This is a measurement of how often the blending unit was "
> +              "waiting on new work (fragments to be placed into the render "
> +              "target). If the pixel shaders are particularly expensive, the "
> +              "ROP unit could be starved waiting for results.",
> +   .display = NV50_HW_PM_EVENT_DISPLAY_RATIO,
> +   .count   = NV50_HW_PM_EVENT_COUNT_SIMPLE,
> +   .domain  = 2,
> +};
> +
> +static const struct nv50_hw_pm_query_cfg
> +nv50_rop_waits_for_shader_query =
> +{
> +   .event  = &nv50_rop_waits_for_shader_event,
> +   .ctr[0] = CTR(0x2222, SIG("pc02_prop_6",
> +                             SRC("pgraph_tpc0_prop_pm_mux_sel", 0x0)),
> +                         SIG("pc02_prop_7")),
> +};
> +
> +/* rop_samples_killed_by_earlyz_count */
> +static const struct nv50_hw_pm_event_cfg
> +nv50_rop_samples_killed_by_earlyz_event =
> +{
> +   .name    = "rop_samples_killed_by_earlyz_count",
> +   .desc    = "This returns the number of pixels that were killed in the "
> +              "earlyZ hardware. This signal will give you an idea of, for "
> +              "instance, a Z only pass was successful in setting up the depth "
> +              "buffer.",
> +   .display = NV50_HW_PM_EVENT_DISPLAY_RAW,
> +   .count   = NV50_HW_PM_EVENT_COUNT_B6,
> +   .domain  = 2,
> +};
> +
> +static const struct nv50_hw_pm_query_cfg
> +nv50_rop_samples_killed_by_earlyz_query =
> +{
> +   .event  = &nv50_rop_samples_killed_by_earlyz_event,
> +   .ctr[1] = CTR(0xffff, SIG("pc02_prop_00",
> +                             SRC("pgraph_tpc0_prop_pm_mux_sel", 0x1a)),
> +                         SIG("pc02_prop_01"),
> +                         SIG("pc02_prop_02"),
> +                         SIG("pc02_prop_03")),
> +   .ctr[2] = CTR(0x5555, SIG("pc02_prop_07"),
> +                         SIG("pc02_trailer"),
> +                         SIG("pc02_prop_04"),
> +                         SIG("pc02_prop_05")),
> +};
> +
> +/* rop_samples_killed_by_latez_count */
> +static const struct nv50_hw_pm_event_cfg
> +nv50_rop_samples_killed_by_latez_event =
> +{
> +   .name    = "rop_samples_killed_by_latez_count",
> +   .desc    = "This returns the number of pixels that were killed after the "
> +              "pixel shader ran. This can happen if the early Z is unable to "
> +              "cull the pixel because of an API setup issue like changing the "
> +              "Z direction or modifying Z in the pixel shader.",
> +   .display = NV50_HW_PM_EVENT_DISPLAY_RAW,
> +   .count   = NV50_HW_PM_EVENT_COUNT_B6,
> +   .domain  = 2,
> +};
> +
> +static const struct nv50_hw_pm_query_cfg
> +nv50_rop_samples_killed_by_latez_query =
> +{
> +   .event  = &nv50_rop_samples_killed_by_latez_event,
> +   .ctr[1] = CTR(0xffff, SIG("pc02_prop_00",
> +                             SRC("pgraph_tpc0_prop_pm_mux_sel", 0x1b)),
> +                         SIG("pc02_prop_01"),
> +                         SIG("pc02_prop_02"),
> +                         SIG("pc02_prop_03")),
> +   .ctr[2] = CTR(0x5555, SIG("pc02_prop_07"),
> +                         SIG("pc02_trailer"),
> +                         SIG("pc02_prop_04"),
> +                         SIG("pc02_prop_05")),
> +};
> +
> +/*
> + * TEXTURE
> + */
> +/* tex_cache_miss */
> +static const struct nv50_hw_pm_event_cfg
> +nv50_tex_cache_miss_event =
> +{
> +   .name    = "tex_cache_miss",
> +   .desc    = "Number of texture cache misses.",
> +   .display = NV50_HW_PM_EVENT_DISPLAY_RAW,
> +   .count   = NV50_HW_PM_EVENT_COUNT_SIMPLE,
> +   .domain  = 2,
> +};
> +
> +static const struct nv50_hw_pm_query_cfg
> +nv50_tex_cache_miss_query =
> +{
> +   .event  = &nv50_tex_cache_miss_event,
> +   .ctr[0] = CTR(0xaaaa, SIG("pc02_tex_04",
> +                             SRC("pgraph_tpc0_tex_unk08_unk0", 0x200))),
> +};
> +
> +static const struct nv50_hw_pm_query_cfg
> +nv84_tex_cache_miss_query =
> +{
> +   .event  = &nv50_tex_cache_miss_event,
> +   .ctr[0] = CTR(0xaaaa, SIG("pc02_tex_04",
> +                             SRC("pgraph_tpc0_tex_unk08_unk0", 0x800))),
> +};
> +
> +/* tex_cache_hit */
> +static const struct nv50_hw_pm_event_cfg
> +nv50_tex_cache_hit_event =
> +{
> +   .name    = "tex_cache_hit",
> +   .desc    = "Number of texture cache hits.",
> +   .display = NV50_HW_PM_EVENT_DISPLAY_RAW,
> +   .count   = NV50_HW_PM_EVENT_COUNT_SIMPLE,
> +   .domain  = 2,
> +};
> +
> +static const struct nv50_hw_pm_query_cfg
> +nv50_tex_cache_hit_query =
> +{
> +   .event  = &nv50_tex_cache_hit_event,
> +   .ctr[0] = CTR(0xaaaa, SIG("pc02_tex_05",
> +                             SRC("pgraph_tpc0_tex_unk08_unk0", 0x200))),
> +};
> +
> +static const struct nv50_hw_pm_query_cfg
> +nv84_tex_cache_hit_query =
> +{
> +   .event  = &nv50_tex_cache_hit_event,
> +   .ctr[0] = CTR(0xaaaa, SIG("pc02_tex_05",
> +                             SRC("pgraph_tpc0_tex_unk08_unk0", 0x800))),
> +};
> +
> +/* tex_waits_for_fb */
> +static const struct nv50_hw_pm_event_cfg
> +nv50_tex_waits_for_fb_event =
> +{
> +   .name    = "tex_waits_for_fb",
> +   .desc    = "This is the amount of time the texture unit spent waiting on "
> +              "samples to return from the frame buffer unit. It is a potential "
> +              "indication of poor texture cache utilization.",
> +   .display = NV50_HW_PM_EVENT_DISPLAY_RATIO,
> +   .count   = NV50_HW_PM_EVENT_COUNT_SIMPLE,
> +   .domain  = 2,
> +};
> +
> +static const struct nv50_hw_pm_query_cfg
> +nv50_tex_waits_for_fb_query =
> +{
> +   .event  = &nv50_tex_waits_for_fb_event,
> +   .ctr[0] = CTR(0xaaaa, SIG("pc02_tex_06",
> +                             SRC("pgraph_tpc0_tex_unk08_unk0", 0x200))),
> +};
> +
> +static const struct nv50_hw_pm_query_cfg
> +nv84_tex_waits_for_fb_query =
> +{
> +   .event  = &nv50_tex_waits_for_fb_event,
> +   .ctr[0] = CTR(0xaaaa, SIG("pc02_tex_06",
> +                             SRC("pgraph_tpc0_tex_unk08_unk0", 0x800))),
> +};
> +
> +static const struct nv50_hw_pm_query_cfg *nv50_hw_pm_queries[NV50_HW_PM_QUERY_COUNT];
> +
> +#define _Q(n, q) nv50_hw_pm_queries[NV50_HW_PM_QUERY_##n] = &q;
> +
> +static void
> +nv50_identify_events(struct nv50_screen *screen)
> +{
> +  _Q(GPU_IDLE,                      nv50_gpu_idle_query);
> +  _Q(IA_BUSY,                       nv50_ia_busy_query);
> +  _Q(IA_WAITS_FOR_FB,               nv50_ia_waits_for_fb_query);
> +  _Q(VERTEX_ATTR_COUNT,             nv50_vertex_attr_count_query);
> +  _Q(GEOM_VERTEX_IN_COUNT,          nv50_geom_vertex_in_count_query);
> +  _Q(GEOM_VERTEX_OUT_COUNT,         nv50_geom_vertex_out_count_query);
> +  _Q(GEOM_PRIMITIVE_IN_COUNT,       nv50_geom_primitive_in_count_query);
> +  _Q(GEOM_PRIMITIVE_OUT_COUNT,      nv50_geom_primitive_out_count_query);
> +  _Q(SO_BUSY,                       nv50_so_busy_query);
> +  _Q(SETUP_PRIMITIVE_COUNT,         nv50_setup_primitive_count_query);
> +  _Q(SETUP_POINT_COUNT,             nv50_setup_point_count_query);
> +  _Q(SETUP_LINE_COUNT,              nv50_setup_line_count_query);
> +  _Q(SETUP_TRIANGLE_COUNT,          nv50_setup_triangle_count_query);
> +  _Q(SETUP_PRIMITIVE_CULLED_COUNT,  nv50_setup_primitive_culled_count_query);
> +  _Q(RAST_TILES_KILLED_BY_ZCULL,    nv50_rast_tiles_killed_by_zcull_query);
> +  _Q(RAST_TILES_IN_COUNT,           nv50_rast_tiles_in_count_query);
> +  _Q(ROP_BUSY,                      nv50_rop_busy_query);
> +  _Q(ROP_WAITS_FOR_FB,              nv50_rop_waits_for_fb_query);
> +  _Q(ROP_WAITS_FOR_SHADER,          nv50_rop_waits_for_shader_query);
> +  _Q(ROP_SAMPLES_KILLED_BY_EARLYZ,  nv50_rop_samples_killed_by_earlyz_query);
> +  _Q(ROP_SAMPLES_KILLED_BY_LATEZ,   nv50_rop_samples_killed_by_latez_query );
> +  _Q(TEX_CACHE_MISS,                nv50_tex_cache_miss_query);
> +  _Q(TEX_CACHE_HIT,                 nv50_tex_cache_hit_query);
> +  _Q(TEX_WAITS_FOR_FB,              nv50_tex_waits_for_fb_query);
> +
> +   if (screen->base.class_3d >= NV84_3D_CLASS) {
> +      /* Variants for NV84+ */
> +      _Q(TEX_CACHE_MISS,   nv84_tex_cache_miss_query);
> +      _Q(TEX_CACHE_HIT,    nv84_tex_cache_hit_query);
> +      _Q(TEX_WAITS_FOR_FB, nv84_tex_waits_for_fb_query);
> +   }
> +
> +   if (screen->base.class_3d >= NVA0_3D_CLASS) {
> +      /* Variants for NVA0+ */
> +      _Q(IA_BUSY,           nva0_ia_busy_query);
> +      _Q(IA_WAITS_FOR_FB,   nva0_ia_waits_for_fb_query);
> +      _Q(VERTEX_ATTR_COUNT, nva0_vertex_attr_count_query);
> +   }
> +}
> +
> +#undef _Q
> +
> +#ifdef DEBUG
> +static void
> +nv50_hw_pm_dump_perfdom(struct nvif_perfdom_v0 *args)
> +{
> +   int i, j, k;
> +
> +   debug_printf("PERFDOM CONFIGURATION:\n");
> +   debug_printf("domaine: 0x%02x\n", args->domain);
> +   debug_printf("mode: 0x%02x\n", args->mode);
> +   for (i = 0; i < 4; i++) {
> +      uint32_t signal = 0;
> +      for (j = 0; j < 4; j++)
> +         signal |= args->ctr[i].signal[j] << (j * 8);
> +
> +      debug_printf("ctr[%d]: func = 0x%04x, signal=0x%08x\n",
> +                   i, args->ctr[i].logic_op, signal);
> +
> +      for (j = 0; j < 4; j++) {
> +         for (k = 0; k < 8; k++) {
> +            uint32_t source, value;
> +            if (!args->ctr[i].source[j][k])
> +               continue;
> +
> +            source = args->ctr[i].source[j][k];
> +            value  = args->ctr[i].source[j][k] >> 32;
> +            debug_printf("  src[%d][%d]: source = 0x%08x, value = 0x%08x\n",
> +                         j, k, source, value);
> +         }
> +      }
> +   }
> +}
> +#endif
> +
> +static const struct nv50_hw_pm_query_cfg *
> +nv50_hw_pm_query_get_cfg(struct nv50_screen *screen, uint32_t query_type)
> +{
> +   return nv50_hw_pm_queries[query_type - NV50_HW_PM_QUERY(0)];
> +}
> +
> +static boolean
> +nv50_hw_pm_query_create(struct nv50_context *nv50, struct nv50_query *q)
> +{
> +   struct nv50_screen *screen = nv50->screen;
> +   struct nouveau_perfmon *perfmon = screen->base.perfmon;
> +   static const struct nv50_hw_pm_query_cfg *cfg;
> +   struct nvif_perfdom_v0 args = {};
> +   struct nouveau_perfmon_dom *dom;
> +   int i, j, k;
> +   int ret;
> +
> +   if (!screen->pm.num_active) {
> +      /* TODO: Currently, only one query type can be monitored simultaneously
> +       * because the Gallium's HUD doesn't fit well with the perfdom interface.
> +       *
> +       * With two different query types, the current scenario is as follows:
> +       * CREATE Q1, BEGIN Q1, CREATE Q2, BEGIN Q2, END Q1, RESULT Q1, BEGIN Q1,
> +       * END Q2, RESULT Q2, BEGIN Q2, END Q1, and so on.
> +       *
> +       * This behaviour doesn't allow to schedule multiple counters because
> +       * we have to do that at query creation (ie. when a perfdom is created).
> +       *
> +       * To get rid of this limitation, a better scenario would be:
> +       * CREATE Q1, CREATE Q2, BEGIN Q1, BEGIN Q2, END Q1, END Q2, RESULT Q1,
> +       * RESULT Q2, BEGIN Q1, BEGIN Q2, END Q1, and so on.
> +       *
> +       * With this kind of behaviour, we could introduce
> +       * {create,begin,end}_all_queries() functions to be able to configure
> +       * all queries in one shot.
> +       */
> +      screen->pm.query_type = q->type;
> +   }
> +   screen->pm.num_active++;
> +
> +   if (screen->pm.query_type != q->type) {
> +      NOUVEAU_ERR("Only one query type can be monitored at the same time!");
> +      return FALSE;
> +   }
> +
> +   cfg = nv50_hw_pm_query_get_cfg(nv50->screen, q->type);
> +
> +   dom = nouveau_perfmon_get_dom_by_id(perfmon, cfg->event->domain);
> +   if (!dom) {
> +      NOUVEAU_ERR("Failed to find domain %d\n", cfg->event->domain);
> +      return FALSE;
> +   }
> +
> +   /* configure domain and counting mode */
> +   args.domain = dom->id;
> +   args.mode   = cfg->event->count;
> +
> +   /* configure counters for this hardware event */
> +   for (i = 0; i < ARRAY_SIZE(cfg->ctr); i++) {
> +      const struct nv50_hw_pm_counter_cfg *sctr = &cfg->ctr[i];
> +
> +      if (!sctr->logic_op)
> +         continue;
> +      args.ctr[i].logic_op = sctr->logic_op;
> +
> +      /* configure signals for this counter */
> +      for (j = 0; j < ARRAY_SIZE(sctr->sig); j++) {
> +         const struct nv50_hw_pm_signal_cfg *ssig = &sctr->sig[j];
> +         struct nouveau_perfmon_sig *sig;
> +
> +         if (!ssig->name)
> +            continue;
> +
> +         sig = nouveau_perfmon_get_sig_by_name(dom, ssig->name);
> +         if (!sig) {
> +            NOUVEAU_ERR("Failed to find signal %s\n", ssig->name);
> +            return FALSE;
> +         }
> +         args.ctr[i].signal[j] = sig->signal;
> +
> +         /* configure sources for this signal */
> +         for (k = 0; k < ARRAY_SIZE(ssig->src); k++) {
> +            const struct nv50_hw_pm_source_cfg *ssrc = &ssig->src[k];
> +            struct nouveau_perfmon_src *src;
> +
> +            if (!ssrc->name)
> +               continue;
> +
> +            src = nouveau_perfmon_get_src_by_name(sig, ssrc->name);
> +            if (!src) {
> +               NOUVEAU_ERR("Failed to find source %s\n", ssrc->name);
> +               return FALSE;
> +            }
> +            args.ctr[i].source[j][k] = (ssrc->value << 32) | src->id;
> +         }
> +      }
> +   }
> +
> +#ifdef DEBUG
> +   if (debug_get_num_option("NV50_PM_DEBUG", 0))
> +      nv50_hw_pm_dump_perfdom(&args);
> +#endif
> +
> +   ret = nouveau_object_new(perfmon->object, perfmon->handle++,
> +                            NVIF_IOCTL_NEW_V0_PERFDOM,
> +                            &args, sizeof(args), &q->perfdom);
> +   if (ret) {
> +      NOUVEAU_ERR("Failed to create perfdom object: %d\n", ret);
> +      return FALSE;
> +   }
> +
> +   return TRUE;
> +}
> +
> +static void
> +nv50_hw_pm_query_destroy(struct nv50_context *nv50, struct nv50_query *q)
> +{
> +   struct nv50_screen *screen = nv50->screen;
> +
> +   nouveau_object_del(&q->perfdom);
> +   screen->pm.num_active--;
> +}
> +
> +static boolean
> +nv50_hw_pm_query_begin(struct nv50_context *nv50, struct nv50_query *q)
> +{
> +   struct nouveau_pushbuf *push = nv50->base.pushbuf;
> +
> +   /* start the next batch of counters */
> +   PUSH_SPACE(push, 2);
> +   BEGIN_NV04(push, SUBC_SW(0x0608), 1);
> +   PUSH_DATA (push, q->perfdom->handle);
> +
> +   return TRUE;
> +}
> +
> +static void
> +nv50_hw_pm_query_end(struct nv50_context *nv50, struct nv50_query *q)
> +{
> +   struct nouveau_pushbuf *push = nv50->base.pushbuf;
> +   struct nv50_screen *screen = nv50->screen;
> +
> +   /* set sequence field (used to check if result is available) */
> +   q->sequence = ++screen->pm.sequence;
> +
> +   /* sample the previous batch of counters */
> +   PUSH_SPACE(push, 2);
> +   BEGIN_NV04(push, SUBC_SW(0x060c), 1);
> +   PUSH_DATA (push, q->perfdom->handle);
> +
> +   /* read back counters values */
> +   PUSH_SPACE(push, 2);

Do this once as PUSH_SPACE(4). Or even better, use PUSH_SPACE(3) and
do only one BEGIN_NV04 with length 2.

> +   BEGIN_NV04(push, SUBC_SW(0x0700), 1);
> +   PUSH_DATA (push, screen->pm.sequence);
> +}
> +
> +static volatile void *
> +nv50_ntfy(struct nv50_screen *screen)
> +{
> +   struct nv04_notify *query = screen->query->data;
> +   struct nouveau_bo *notify = screen->notify_bo;
> +
> +   return (char *)notify->map + query->offset;
> +}
> +
> +static INLINE uint32_t
> +nv50_hw_pm_query_get_offset(struct nv50_query *q)
> +{
> +   return (1 + (q->sequence % NV50_HW_PM_RING_BUFFER_MAX_QUERIES) *
> +           NV50_HW_PM_RING_BUFFER_NUM_DOMAINS * 6);
> +}
> +
> +static INLINE boolean
> +nv50_hw_pm_query_read_data(struct nv50_context *nv50, struct nv50_query *q,
> +                           boolean wait, uint32_t ctr[4], uint32_t *clk)
> +{
> +   volatile uint32_t *ntfy = nv50_ntfy(nv50->screen);
> +   uint32_t offset = nv50_hw_pm_query_get_offset(q);
> +   boolean found = FALSE;
> +   int i;
> +
> +   while (ntfy[0] < q->sequence) {
> +      if (!wait)
> +         return FALSE;
> +      usleep(100);
> +   }

Yeah, this won't fly. Take a look at nouveau_fence_wait for how it does
that. I don't suppose you can hook into the fence mechanism for all
this, instead of implementing your own version, right?

BTW, what makes sure that the query has been kicked out? You never do
a PUSH_KICK that I can see...

> +
> +   if (ntfy[0] > q->sequence + NV50_HW_PM_RING_BUFFER_MAX_QUERIES - 1)
> +      return FALSE;
> +
> +   for (i = 0; i < NV50_HW_PM_RING_BUFFER_NUM_DOMAINS; i++) {
> +      if (ntfy[offset + i * 6] == q->perfdom->handle) {
> +         found = TRUE;
> +         break;
> +      }
> +   }
> +
> +   if (!found) {
> +      NOUVEAU_ERR("Failed to find perfdom object %" PRIu64 "!\n",
> +                  q->perfdom->handle);
> +      return FALSE;
> +   }
> +
> +   for (i = 0; i < 4; i++)
> +      ctr[i] = ntfy[offset + i + 1];
> +   *clk = ntfy[offset + 5];
> +
> +   return TRUE;
> +}
> +
> +static boolean
> +nv50_hw_pm_query_result(struct nv50_context *nv50, struct nv50_query *q,
> +                        boolean wait, void *result)
> +{
> +   struct nv50_screen *screen = nv50->screen;
> +   const struct nv50_hw_pm_query_cfg *cfg;
> +   uint32_t ctr[4], clk;
> +   uint64_t value = 0;
> +   int ret;
> +
> +   ret = nv50_hw_pm_query_read_data(nv50, q, wait, ctr, &clk);
> +   if (!ret)
> +      return FALSE;
> +
> +   cfg = nv50_hw_pm_query_get_cfg(screen, q->type);
> +   if (cfg->event->count == NV50_HW_PM_EVENT_COUNT_SIMPLE) {
> +      /* SIMPLE hardware events are sampled on PRE_CTR. */
> +      value = ctr[0];
> +   } else {
> +      /* EVENT_B4/EVENT_B6 hardware events are sampled on EVENT_CTR. */
> +      value = ctr[2];
> +   }
> +
> +   if (cfg->event->display == NV50_HW_PM_EVENT_DISPLAY_RATIO) {
> +      if (clk)
> +         value = (value * 100) / (float)clk;
> +   }
> +
> +   fprintf(stderr, "ctr[0]=%d, ctr[1]=%d, ctr[2]=%d, ctr[3]=%d, clk=%d, val=%d\n",
> +           ctr[0], ctr[1], ctr[2], ctr[3], clk, value);
> +
> +   *(uint64_t *)result = value;
> +   return TRUE;
> +}
> +
>  void
>  nv50_init_query_functions(struct nv50_context *nv50)
>  {
> diff --git a/src/gallium/drivers/nouveau/nv50/nv50_screen.h b/src/gallium/drivers/nouveau/nv50/nv50_screen.h
> index 71a5247..0449659 100644
> --- a/src/gallium/drivers/nouveau/nv50/nv50_screen.h
> +++ b/src/gallium/drivers/nouveau/nv50/nv50_screen.h
> @@ -89,6 +89,12 @@ struct nv50_screen {
>        struct nouveau_bo *bo;
>     } fence;
>
> +   struct {
> +      uint32_t sequence;
> +      uint32_t query_type;
> +      uint32_t num_active;
> +   } pm;
> +
>     struct nouveau_object *sync;
>     struct nouveau_object *query;
>
> @@ -108,6 +114,35 @@ nv50_screen(struct pipe_screen *screen)
>     return (struct nv50_screen *)screen;
>  }
>
> +/* Hardware global performance counters. */
> +#define NV50_HW_PM_QUERY_COUNT  24
> +#define NV50_HW_PM_QUERY(i)    (PIPE_QUERY_DRIVER_SPECIFIC + (i))
> +#define NV50_HW_PM_QUERY_LAST   NV50_HW_PM_QUERY(NV50_HW_PM_QUERY_COUNT - 1)
> +#define NV50_HW_PM_QUERY_GPU_IDLE                            0
> +#define NV50_HW_PM_QUERY_IA_BUSY                             1
> +#define NV50_HW_PM_QUERY_IA_WAITS_FOR_FB                     2
> +#define NV50_HW_PM_QUERY_VERTEX_ATTR_COUNT                   3
> +#define NV50_HW_PM_QUERY_GEOM_VERTEX_IN_COUNT                4
> +#define NV50_HW_PM_QUERY_GEOM_VERTEX_OUT_COUNT               5
> +#define NV50_HW_PM_QUERY_GEOM_PRIMITIVE_IN_COUNT             6
> +#define NV50_HW_PM_QUERY_GEOM_PRIMITIVE_OUT_COUNT            7
> +#define NV50_HW_PM_QUERY_SO_BUSY                             8
> +#define NV50_HW_PM_QUERY_SETUP_PRIMITIVE_COUNT               9
> +#define NV50_HW_PM_QUERY_SETUP_POINT_COUNT                  10
> +#define NV50_HW_PM_QUERY_SETUP_LINE_COUNT                   11
> +#define NV50_HW_PM_QUERY_SETUP_TRIANGLE_COUNT               12
> +#define NV50_HW_PM_QUERY_SETUP_PRIMITIVE_CULLED_COUNT       13
> +#define NV50_HW_PM_QUERY_RAST_TILES_KILLED_BY_ZCULL         14
> +#define NV50_HW_PM_QUERY_RAST_TILES_IN_COUNT                15
> +#define NV50_HW_PM_QUERY_ROP_BUSY                           16
> +#define NV50_HW_PM_QUERY_ROP_WAITS_FOR_FB                   17
> +#define NV50_HW_PM_QUERY_ROP_WAITS_FOR_SHADER               18
> +#define NV50_HW_PM_QUERY_ROP_SAMPLES_KILLED_BY_EARLYZ       19
> +#define NV50_HW_PM_QUERY_ROP_SAMPLES_KILLED_BY_LATEZ        20
> +#define NV50_HW_PM_QUERY_TEX_CACHE_MISS                     21
> +#define NV50_HW_PM_QUERY_TEX_CACHE_HIT                      22
> +#define NV50_HW_PM_QUERY_TEX_WAITS_FOR_FB                   23
> +
>  boolean nv50_blitter_create(struct nv50_screen *);
>  void nv50_blitter_destroy(struct nv50_screen *);
>
> --
> 2.4.4
>
> _______________________________________________
> Nouveau mailing list
> Nouveau at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/nouveau