[Mesa-dev] [PATCH v2 5/7] nv50: add support for compute/graphics global performance counters
Samuel Pitoiset
samuel.pitoiset at gmail.com
Thu Jul 23 08:21:45 PDT 2015
On 07/23/2015 12:05 AM, Martin Peres wrote:
> On 01/07/15 01:01, Samuel Pitoiset wrote:
>> This commit adds support for both compute and graphics global
>> performance counters which have been reverse engineered with
>> CUPTI (Linux) and PerfKit (Windows).
>>
>> Currently, only one query type can be monitored at the same time because
>> the Gallium's HUD doesn't fit pretty well. This will be improved later.
>>
>> Changes since v2:
>> - replace \% by percentage
>> - remove one extra call to PUSH_SPACE
>> - use nouveau_fence instead of my hand-made fence mechanism
>>
>> Signed-off-by: Samuel Pitoiset <samuel.pitoiset at gmail.com>
>> ---
>> src/gallium/drivers/nouveau/nv50/nv50_query.c | 1066
>> +++++++++++++++++++++++-
>> src/gallium/drivers/nouveau/nv50/nv50_screen.h | 35 +
>> 2 files changed, 1096 insertions(+), 5 deletions(-)
>>
>> diff --git a/src/gallium/drivers/nouveau/nv50/nv50_query.c
>> b/src/gallium/drivers/nouveau/nv50/nv50_query.c
>> index 81f7474..7fb6f3a 100644
>> --- a/src/gallium/drivers/nouveau/nv50/nv50_query.c
>> +++ b/src/gallium/drivers/nouveau/nv50/nv50_query.c
>> @@ -27,6 +27,8 @@
>> #include "nv50/nv50_context.h"
>> #include "nv_object.xml.h"
>> +#include "nouveau_perfmon.h"
>> +
>> #define NV50_QUERY_STATE_READY 0
>> #define NV50_QUERY_STATE_ACTIVE 1
>> #define NV50_QUERY_STATE_ENDED 2
>> @@ -51,10 +53,25 @@ struct nv50_query {
>> boolean is64bit;
>> struct nouveau_mm_allocation *mm;
>> struct nouveau_fence *fence;
>> + struct nouveau_object *perfdom;
>> };
>> #define NV50_QUERY_ALLOC_SPACE 256
>> +#ifdef DEBUG
> No need to guard the definition of this function. The compiler will
> get rid of it if it has no users.
Fixed.
>> +static void nv50_hw_pm_dump_perfdom(struct nvif_perfdom_v0 *args);
>> +#endif
>> +
>> +static boolean
>> +nv50_hw_pm_query_create(struct nv50_context *, struct nv50_query *);
>> +static void
>> +nv50_hw_pm_query_destroy(struct nv50_context *, struct nv50_query *);
>> +static boolean
>> +nv50_hw_pm_query_begin(struct nv50_context *, struct nv50_query *);
>> +static void nv50_hw_pm_query_end(struct nv50_context *, struct
>> nv50_query *);
>> +static boolean nv50_hw_pm_query_result(struct nv50_context *,
>> + struct nv50_query *, boolean,
>> void *);
>> +
>> static INLINE struct nv50_query *
>> nv50_query(struct pipe_query *pipe)
>> {
>> @@ -96,9 +113,15 @@ nv50_query_allocate(struct nv50_context *nv50,
>> struct nv50_query *q, int size)
>> static void
>> nv50_query_destroy(struct pipe_context *pipe, struct pipe_query *pq)
>> {
>> - nv50_query_allocate(nv50_context(pipe), nv50_query(pq), 0);
>> - nouveau_fence_ref(NULL, &nv50_query(pq)->fence);
>> - FREE(nv50_query(pq));
>> + struct nv50_context *nv50 = nv50_context(pipe);
>> + struct nv50_query *q = nv50_query(pq);
>> +
>> + if ((q->type >= NV50_HW_PM_QUERY(0) && q->type <=
>> NV50_HW_PM_QUERY_LAST))
>> + nv50_hw_pm_query_destroy(nv50, q);
>> +
>> + nv50_query_allocate(nv50, q, 0);
>> + nouveau_fence_ref(NULL, &q->fence);
>> + FREE(q);
>> }
>> static struct pipe_query *
>> @@ -120,6 +143,12 @@ nv50_query_create(struct pipe_context *pipe,
>> unsigned type, unsigned index)
>> type == PIPE_QUERY_PRIMITIVES_EMITTED ||
>> type == PIPE_QUERY_SO_STATISTICS ||
>> type == PIPE_QUERY_PIPELINE_STATISTICS);
>> + if (type >= NV50_HW_PM_QUERY(0) && q->type <=
>> NV50_HW_PM_QUERY_LAST) {
>> + /* Hardware global performance counters are not 64 bits, but
>> we also use
>> + * a fence to make sure the query is ready. */
>
> I do not understand the logic of this comment.
Only 64-bit queries use a nouveau_fence to make sure the result is
available; 32-bit queries use a hand-made sequence number instead.
Global performance counters are declared as 32-bit queries, but we also
use a nouveau_fence to check the result.
I'll rewrite that comment.
>> + q->is64bit = TRUE;
>> + }
>> +
>> q->type = type;
>> if (q->type == PIPE_QUERY_OCCLUSION_COUNTER) {
>> @@ -127,6 +156,11 @@ nv50_query_create(struct pipe_context *pipe,
>> unsigned type, unsigned index)
>> q->data -= 32 / sizeof(*q->data); /* we advance before
>> query_begin ! */
>> }
>> + if ((q->type >= NV50_HW_PM_QUERY(0) && q->type <=
>> NV50_HW_PM_QUERY_LAST)) {
>> + if (!nv50_hw_pm_query_create(nv50, q))
>> + return NULL;
>> + }
>> +
>> return (struct pipe_query *)q;
>> }
>> @@ -151,6 +185,7 @@ nv50_query_begin(struct pipe_context *pipe,
>> struct pipe_query *pq)
>> struct nv50_context *nv50 = nv50_context(pipe);
>> struct nouveau_pushbuf *push = nv50->base.pushbuf;
>> struct nv50_query *q = nv50_query(pq);
>> + boolean ret = TRUE;
>> /* For occlusion queries we have to change the storage,
>> because a previous
>> * query might set the initial render conition to FALSE even
>> *after* we re-
>> @@ -205,10 +240,13 @@ nv50_query_begin(struct pipe_context *pipe,
>> struct pipe_query *pq)
>> nv50_query_get(push, q, 0x10, 0x00005002);
>> break;
>> default:
>> + if ((q->type >= NV50_HW_PM_QUERY(0) && q->type <=
>> NV50_HW_PM_QUERY_LAST)) {
>> + ret = nv50_hw_pm_query_begin(nv50, q);
>> + }
>> break;
>> }
>> q->state = NV50_QUERY_STATE_ACTIVE;
>> - return true;
>> + return ret;
>> }
>> static void
>> @@ -265,7 +303,9 @@ nv50_query_end(struct pipe_context *pipe, struct
>> pipe_query *pq)
>> q->state = NV50_QUERY_STATE_READY;
>> break;
>> default:
>> - assert(0);
>> + if ((q->type >= NV50_HW_PM_QUERY(0) && q->type <=
>> NV50_HW_PM_QUERY_LAST)) {
>> + nv50_hw_pm_query_end(nv50, q);
>> + }
> I get the idea, but deleting assert(0) is not acceptable. Why don't
> you move it to after your if and add a break at the end of the if
> block? This way, you preserve the old behaviour :)
I'll keep that assertion, but adding it in end_query() doesn't make any
sense. It should be added (at least) in begin_query().
>> break;
>> }
>> @@ -300,6 +340,10 @@ nv50_query_result(struct pipe_context *pipe,
>> struct pipe_query *pq,
>> if (q->state != NV50_QUERY_STATE_READY)
>> nv50_query_update(q);
>> + if ((q->type >= NV50_HW_PM_QUERY(0) && q->type <=
>> NV50_HW_PM_QUERY_LAST)) {
>> + return nv50_hw_pm_query_result(nv50, q, wait, result);
>> + }
>> +
>> if (q->state != NV50_QUERY_STATE_READY) {
>> if (!wait) {
>> /* for broken apps that spin on GL_QUERY_RESULT_AVAILABLE */
>> @@ -476,6 +520,1018 @@ nva0_so_target_save_offset(struct pipe_context
>> *pipe,
>> nv50_query_end(pipe, targ->pq);
>> }
>> +/* === HARDWARE GLOBAL PERFORMANCE COUNTERS for NV50 === */
>> +
>> +struct nv50_hw_pm_source_cfg
>> +{
>> + const char *name;
>> + uint64_t value;
>> +};
>> +
>> +struct nv50_hw_pm_signal_cfg
>> +{
>> + const char *name;
>> + const struct nv50_hw_pm_source_cfg src[8];
>> +};
>> +
>> +struct nv50_hw_pm_counter_cfg
>> +{
>> + uint16_t logic_op;
>> + const struct nv50_hw_pm_signal_cfg sig[4];
>> +};
>> +
>> +enum nv50_hw_pm_query_display
>> +{
>> + NV50_HW_PM_EVENT_DISPLAY_RAW,
>> + NV50_HW_PM_EVENT_DISPLAY_RATIO,
>> +};
>> +
>> +enum nv50_hw_pm_query_count
>> +{
>> + NV50_HW_PM_EVENT_COUNT_SIMPLE,
>> + NV50_HW_PM_EVENT_COUNT_B4,
>> + NV50_HW_PM_EVENT_COUNT_B6,
>> +};
>> +
>> +struct nv50_hw_pm_event_cfg
>> +{
>> + const char *name;
>> + const char *desc;
>> + enum nv50_hw_pm_query_display display;
>> + enum nv50_hw_pm_query_count count;
>> + uint8_t domain;
>> +};
>> +
>> +struct nv50_hw_pm_query_cfg
>> +{
>> + const struct nv50_hw_pm_event_cfg *event;
>> + const struct nv50_hw_pm_counter_cfg ctr[4];
>> +};
>> +
>> +#define SRC(name, val) { name, val }
>> +#define SIG(name, ...) { name, { __VA_ARGS__ } }
>> +#define CTR(func, ...) { func, { __VA_ARGS__ } }
>> +
>> +/*
>> + * GPU
>> + */
>> +/* gpu_idle */
>> +static const struct nv50_hw_pm_event_cfg
>> +nv50_gpu_idle_event =
>> +{
>> + .name = "gpu_idle",
>> + .desc = "The percentage of time the GPU is idle/busy since the
>> last "
>> + "call. Having the GPU idle at all is a waste of
>> valuable "
>> + "resources. You want to balance the GPU and CPU
>> workloads so "
>> + "that no one processor is starved for work. Time
>> management or "
>> + "using multithreading in your application can help
>> balance CPU "
>> + "based tasks (world management, etc.) with the
>> rendering "
>> + "pipeline.",
>> + .display = NV50_HW_PM_EVENT_DISPLAY_RATIO,
>> + .count = NV50_HW_PM_EVENT_COUNT_SIMPLE,
>> + .domain = 1,
>> +};
>> +
>> +static const struct nv50_hw_pm_query_cfg
>> +nv50_gpu_idle_query =
>> +{
>> + .event = &nv50_gpu_idle_event,
>> + .ctr[0] = CTR(0xaaaa, SIG("pc01_gr_idle")),
>> +};
>> +
>> +/*
>> + * INPUT ASSEMBLER
>> + */
>> +/* input_assembler_busy */
>> +static const struct nv50_hw_pm_event_cfg
>> +nv50_ia_busy_event =
>> +{
>> + .name = "input_assembler_busy",
>> + .desc = "The percentage of time the input assembler unit is
>> busy. This "
>> + "is mainly impacted by both the number of vertices
>> processed as "
>> + "well as the size of the attributes on those vertices.
>> You can "
>> + "optimize this by reducing vertex size as much as
>> possible and "
>> + "using indexed primitives to take advantage of the
>> vertex cache.",
>> + .display = NV50_HW_PM_EVENT_DISPLAY_RATIO,
>> + .count = NV50_HW_PM_EVENT_COUNT_SIMPLE,
>> + .domain = 1,
>> +};
>> +
>> +static const struct nv50_hw_pm_query_cfg
>> +nv50_ia_busy_query =
>> +{
>> + .event = &nv50_ia_busy_event,
>> + .ctr[0] = CTR(0xf888, SIG("pc01_vfetch_18",
>> + SRC("pgraph_vfetch_unk0c_unk0", 0x1)),
>> + SIG("pc01_vfetch_17"),
>> + SIG("pc01_vfetch_03"),
>> + SIG("pc01_vfetch_02")),
>> +};
>> +
>> +static const struct nv50_hw_pm_query_cfg
>> +nva0_ia_busy_query =
>> +{
>> + .event = &nv50_ia_busy_event,
>> + .ctr[0] = CTR(0xf888, SIG("pc01_vfetch_15",
>> + SRC("pgraph_vfetch_unk0c_unk0", 0x1)),
>> + SIG("pc01_vfetch_14"),
>> + SIG("pc01_vfetch_03"),
>> + SIG("pc01_vfetch_02")),
>> +};
>> +
>> +/* input_assembler_waits_for_fb */
>> +static const struct nv50_hw_pm_event_cfg
>> +nv50_ia_waits_for_fb_event = {
>> + .name = "input_assembler_waits_for_fb",
>> + .desc = "This is the amount of time the input assembler unit
>> was "
>> + "waiting for data from the frame buffer unit.",
>> + .display = NV50_HW_PM_EVENT_DISPLAY_RATIO,
>> + .count = NV50_HW_PM_EVENT_COUNT_SIMPLE,
>> + .domain = 1,
>> +};
>> +
>> +static const struct nv50_hw_pm_query_cfg
>> +nv50_ia_waits_for_fb_query =
>> +{
>> + .event = &nv50_ia_waits_for_fb_event,
>> + .ctr[0] = CTR(0xaaaa, SIG("pc01_vfetch_0e",
>> + SRC("pgraph_vfetch_unk0c_unk0", 0x1))),
>> +};
>> +
>> +static const struct nv50_hw_pm_query_cfg
>> +nva0_ia_waits_for_fb_query =
>> +{
>> + .event = &nv50_ia_waits_for_fb_event,
>> + .ctr[0] = CTR(0xaaaa, SIG("pc01_vfetch_0b",
>> + SRC("pgraph_vfetch_unk0c_unk0", 0x1))),
>> +};
>> +
>> +/* vertex_attribute_count */
>> +static const struct nv50_hw_pm_event_cfg
>> +nv50_vertex_attr_count_event =
>> +{
>> + .name = "vertex_attribute_count",
>> + .desc = "The number of vertex attributes that are fetched and
>> passed to "
>> + "the geometry unit is returned in this counter. A
>> large number "
>> + "of attributes (or unaligned vertices) can hurt vertex
>> cache "
>> + "performance and reduce the overall vertex processing "
>> + "capabilities of the pipeline.",
>> + .display = NV50_HW_PM_EVENT_DISPLAY_RAW,
>> + .count = NV50_HW_PM_EVENT_COUNT_SIMPLE,
>> + .domain = 1,
>> +};
>> +
>> +static const struct nv50_hw_pm_query_cfg
>> +nv50_vertex_attr_count_query =
>> +{
>> + .event = &nv50_vertex_attr_count_event,
>> + .ctr[0] = CTR(0xf888, SIG("pc01_vfetch_18",
>> + SRC("pgraph_vfetch_unk0c_unk0", 0x1)),
>> + SIG("pc01_vfetch_17"),
>> + SIG("pc01_vfetch_03"),
>> + SIG("pc01_vfetch_02")),
>> +};
>> +
>> +static const struct nv50_hw_pm_query_cfg
>> +nva0_vertex_attr_count_query =
>> +{
>> + .event = &nv50_vertex_attr_count_event,
>> + .ctr[0] = CTR(0xf888, SIG("pc01_vfetch_15",
>> + SRC("pgraph_vfetch_unk0c_unk0", 0x1)),
>> + SIG("pc01_vfetch_14"),
>> + SIG("pc01_vfetch_03"),
>> + SIG("pc01_vfetch_02")),
>> +};
>> +
>> +/*
>> + * GEOM
>> + */
>> +/* geom_vertex_in_count */
>> +static const struct nv50_hw_pm_event_cfg
>> +nv50_geom_vertex_in_count_event =
>> +{
>> + .name = "geom_vertex_in_count",
>> + .desc = "The number of vertices input to the geom unit.",
>> + .display = NV50_HW_PM_EVENT_DISPLAY_RAW,
>> + .count = NV50_HW_PM_EVENT_COUNT_B4,
>> + .domain = 1,
>> +};
>> +
>> +static const struct nv50_hw_pm_query_cfg
>> +nv50_geom_vertex_in_count_query =
>> +{
>> + .event = &nv50_geom_vertex_in_count_event,
>> + .ctr[1] = CTR(0xffff, SIG("pc01_vfetch_0e",
>> + SRC("pgraph_vfetch_unk0c_unk0", 0x0)),
>> + SIG("pc01_vfetch_0f"),
>> + SIG("pc01_vfetch_10"),
>> + SIG("pc01_trailer")),
>> + .ctr[2] = CTR(0x5555, SIG("pc01_trailer"),
>> + SIG("pc01_trailer"),
>> + SIG("pc01_trailer"),
>> + SIG("pc01_trailer")),
>> +};
>> +
>> +/* geom_vertex_out_count */
>> +static const struct nv50_hw_pm_event_cfg
>> +nv50_geom_vertex_out_count_event =
>> +{
>> + .name = "geom_vertex_out_count",
>> + .desc = "The number of vertices coming out of the geom unit
>> after any "
>> + "geometry shader expansion.",
>> + .display = NV50_HW_PM_EVENT_DISPLAY_RAW,
>> + .count = NV50_HW_PM_EVENT_COUNT_SIMPLE,
>> + .domain = 1,
>> +};
>> +
>> +static const struct nv50_hw_pm_query_cfg
>> +nv50_geom_vertex_out_count_query =
>> +{
>> + .event = &nv50_geom_vertex_out_count_event,
>> + .ctr[0] = CTR(0xaaaa, SIG("pc01_vattr_01")),
>> +};
>> +
>> +/* geom_primitive_in_count */
>> +static const struct nv50_hw_pm_event_cfg
>> +nv50_geom_primitive_in_count_event =
>> +{
>> + .name = "geom_primitive_in_count",
>> + .desc = "The number of primitives input to the geom unit.",
>> + .display = NV50_HW_PM_EVENT_DISPLAY_RAW,
>> + .count = NV50_HW_PM_EVENT_COUNT_SIMPLE,
>> + .domain = 1,
>> +};
>> +
>> +static const struct nv50_hw_pm_query_cfg
>> +nv50_geom_primitive_in_count_query =
>> +{
>> + .event = &nv50_geom_primitive_in_count_event,
>> + .ctr[0] = CTR(0xaaaa, SIG("pc01_vfetch_08",
>> + SRC("pgraph_vfetch_unk0c_unk0", 0x0))),
>> +};
>> +
>> +/* geom_primitive_out_count */
>> +static const struct nv50_hw_pm_event_cfg
>> +nv50_geom_primitive_out_count_event =
>> +{
>> + .name = "geom_primitive_out_count",
>> + .desc = "The number of primitives coming out the geom unit
>> after any "
>> + "geometry shader expansion.",
>> + .display = NV50_HW_PM_EVENT_DISPLAY_RAW,
>> + .count = NV50_HW_PM_EVENT_COUNT_SIMPLE,
>> + .domain = 1,
>> +};
>> +
>> +static const struct nv50_hw_pm_query_cfg
>> +nv50_geom_primitive_out_count_query =
>> +{
>> + .event = &nv50_geom_primitive_out_count_event,
>> + .ctr[0] = CTR(0xaaaa, SIG("pc01_vattr_00")),
>> +};
>> +
>> +/*
>> + * STREAM OUT
>> + */
>> +/* stream_out_busy */
>> +static const struct nv50_hw_pm_event_cfg
>> +nv50_so_busy_event =
>> +{
>> + .name = "stream_out_busy",
>> + .desc = "This unit manages the writing of vertices to the
>> frame buffer "
>> + "when using stream out. If a significant number of
>> vertices are "
>> + "written, this can become a bottleneck.",
>> + .display = NV50_HW_PM_EVENT_DISPLAY_RATIO,
>> + .count = NV50_HW_PM_EVENT_COUNT_SIMPLE,
>> + .domain = 1,
>> +};
>> +
>> +static const struct nv50_hw_pm_query_cfg
>> +nv50_so_busy_query =
>> +{
>> + .event = &nv50_so_busy_event,
>> + .ctr[0] = CTR(0x8888, SIG("pc01_strmout_00"),
>> + SIG("pc01_strmout_01")),
>> +};
>> +
>> +/*
>> + * SETUP
>> + */
>> +/* setup_primitive_count */
>> +static const struct nv50_hw_pm_event_cfg
>> +nv50_setup_primitive_count_event =
>> +{
>> + .name = "setup_primitive_count",
>> + .desc = "Returns the number of primitives processed in the
>> geometry "
>> + "subsystem. This experiments counts points, lines and
>> triangles. "
>> + "To count only triangles, use the setup_triangle_count
>> counter. "
>> + "Balance these counts with the number of pixels being
>> drawn to "
>> + "see if you could simplify your geometry and use "
>> + "bump/displacement maps, for example.",
>> + .display = NV50_HW_PM_EVENT_DISPLAY_RAW,
>> + .count = NV50_HW_PM_EVENT_COUNT_SIMPLE,
>> + .domain = 1,
>> +};
>> +
>> +static const struct nv50_hw_pm_query_cfg
>> +nv50_setup_primitive_count_query =
>> +{
>> + .event = &nv50_setup_primitive_count_event,
>> + .ctr[0] = CTR(0xaaaa, SIG("pc01_trast_00")),
>> +};
>> +
>> +/* setup_point_count */
>> +static const struct nv50_hw_pm_event_cfg
>> +nv50_setup_point_count_event =
>> +{
>> + .name = "setup_point_count",
>> + .desc = "The number of points seen by the primitive setup unit
>> (just "
>> + "before rasterization).",
>> + .display = NV50_HW_PM_EVENT_DISPLAY_RAW,
>> + .count = NV50_HW_PM_EVENT_COUNT_SIMPLE,
>> + .domain = 1,
>> +};
>> +
>> +static const struct nv50_hw_pm_query_cfg
>> +nv50_setup_point_count_query =
>> +{
>> + .event = &nv50_setup_point_count_event,
>> + .ctr[0] = CTR(0x8080, SIG("pc01_trast_01"),
>> + SIG("pc01_trast_04"),
>> + SIG("pc01_trast_05")),
>> +};
>> +
>> +/* setup_line_count */
>> +static const struct nv50_hw_pm_event_cfg
>> +nv50_setup_line_count_event =
>> +{
>> + .name = "setup_line_count",
>> + .desc = "The number of lines seen by the primitive setup unit
>> (just "
>> + "before rasterization).",
>> + .display = NV50_HW_PM_EVENT_DISPLAY_RAW,
>> + .count = NV50_HW_PM_EVENT_COUNT_SIMPLE,
>> + .domain = 1,
>> +};
>> +
>> +static const struct nv50_hw_pm_query_cfg
>> +nv50_setup_line_count_query =
>> +{
>> + .event = &nv50_setup_line_count_event,
>> + .ctr[0] = CTR(0x8080, SIG("pc01_trast_02"),
>> + SIG("pc01_trast_04"),
>> + SIG("pc01_trast_05")),
>> +};
>> +
>> +/* setup_triangle_count */
>> +static const struct nv50_hw_pm_event_cfg
>> +nv50_setup_triangle_count_event =
>> +{
>> + .name = "setup_triangle_count",
>> + .desc = "Returns the number of triangles processed in the
>> geometry "
>> + "subsystem.",
>> + .display = NV50_HW_PM_EVENT_DISPLAY_RAW,
>> + .count = NV50_HW_PM_EVENT_COUNT_SIMPLE,
>> + .domain = 1,
>> +};
>> +
>> +static const struct nv50_hw_pm_query_cfg
>> +nv50_setup_triangle_count_query =
>> +{
>> + .event = &nv50_setup_triangle_count_event,
>> + .ctr[0] = CTR(0x8080, SIG("pc01_trast_03"),
>> + SIG("pc01_trast_04"),
>> + SIG("pc01_trast_05")),
>> +};
>> +
>> +/* setup_primitive_culled_count */
>> +static const struct nv50_hw_pm_event_cfg
>> +nv50_setup_primitive_culled_count_event =
>> +{
>> + .name = "setup_primitive_culled_count",
>> + .desc = "Returns the number of primitives culled in primitive
>> setup. If "
>> + "you are performing viewport culling, this gives you an "
>> + "indication of the accuracy of the algorithm being
>> used, and can "
>> + "give you and idea if you need to improves this
>> culling. This "
>> + "includes primitives culled when using backface
>> culling. Drawing "
>> + "a fully visible sphere on the screen should cull half
>> of the "
>> + "triangles if backface culling is turned on and all the "
>> + "triangles are ordered consistently (CW or CCW).",
>> + .display = NV50_HW_PM_EVENT_DISPLAY_RAW,
>> + .count = NV50_HW_PM_EVENT_COUNT_SIMPLE,
>> + .domain = 1,
>> +};
>> +
>> +static const struct nv50_hw_pm_query_cfg
>> +nv50_setup_primitive_culled_count_query =
>> +{
>> + .event = &nv50_setup_primitive_culled_count_event,
>> + .ctr[0] = CTR(0xaaaa, SIG("pc01_unk00")),
>> +};
>> +
>> +/*
>> + * RASTERIZER
>> + */
>> +/* rast_tiles_killed_by_zcull_count */
>> +static const struct nv50_hw_pm_event_cfg
>> +nv50_rast_tiles_killed_by_zcull_event =
>> +{
>> + .name = "rasterizer_tiles_killed_by_zcull_count",
>> + .desc = "The number of pixels killed by the zcull unit in the
>> rasterizer.",
>> + .display = NV50_HW_PM_EVENT_DISPLAY_RAW,
>> + .count = NV50_HW_PM_EVENT_COUNT_B6,
>> + .domain = 1,
>> +};
>> +
>> +static const struct nv50_hw_pm_query_cfg
>> +nv50_rast_tiles_killed_by_zcull_query =
>> +{
>> + .event = &nv50_rast_tiles_killed_by_zcull_event,
>> + .ctr[1] = CTR(0xffff, SIG("pc01_zcull_00",
>> + SRC("pgraph_zcull_pm_unka4_unk0", 0x7)),
>> + SIG("pc01_zcull_01"),
>> + SIG("pc01_zcull_02"),
>> + SIG("pc01_zcull_03")),
>> + .ctr[2] = CTR(0x5555, SIG("pc01_trailer"),
>> + SIG("pc01_trailer"),
>> + SIG("pc01_zcull_04"),
>> + SIG("pc01_zcull_05")),
>> +};
>> +
>> +/* rast_tiles_in_count */
>> +static const struct nv50_hw_pm_event_cfg
>> +nv50_rast_tiles_in_count_event =
>> +{
>> + .name = "rasterizer_tiles_in_count",
>> + .desc = "Count of tiles (each of which contain 1-8 pixels)
>> seen by the "
>> + "rasterizer stage.",
>> + .display = NV50_HW_PM_EVENT_DISPLAY_RAW,
>> + .count = NV50_HW_PM_EVENT_COUNT_B6,
>> + .domain = 1,
>> +};
>> +
>> +static const struct nv50_hw_pm_query_cfg
>> +nv50_rast_tiles_in_count_query =
>> +{
>> + .event = &nv50_rast_tiles_in_count_event,
>> + .ctr[1] = CTR(0xffff, SIG("pc01_zcull_00",
>> + SRC("pgraph_zcull_pm_unka4_unk0", 0x0)),
>> + SIG("pc01_zcull_01"),
>> + SIG("pc01_zcull_02"),
>> + SIG("pc01_zcull_03")),
>> + .ctr[2] = CTR(0x5555, SIG("pc01_trailer"),
>> + SIG("pc01_trailer"),
>> + SIG("pc01_zcull_04"),
>> + SIG("pc01_zcull_05")),
>> +};
>> +
>> +/*
>> + * ROP
>> + */
>> +/* rop_busy */
>> +static const struct nv50_hw_pm_event_cfg
>> +nv50_rop_busy_event =
>> +{
>> + .name = "rop_busy",
>> + .desc = "Percentage of time that the ROP unit is actively
>> doing work. "
>> + "This can be high if alpha blending is turned on, of
>> overdraw "
>> + "is high, etc.",
>> + .display = NV50_HW_PM_EVENT_DISPLAY_RATIO,
>> + .count = NV50_HW_PM_EVENT_COUNT_SIMPLE,
>> + .domain = 2,
>> +};
>> +
>> +static const struct nv50_hw_pm_query_cfg
>> +nv50_rop_busy_query =
>> +{
>> + .event = &nv50_rop_busy_event,
>> + .ctr[0] = CTR(0xf888, SIG("pc02_prop_02",
>> + SRC("pgraph_tpc0_prop_pm_mux_sel", 0x0)),
>> + SIG("pc02_prop_03"),
>> + SIG("pc02_prop_04"),
>> + SIG("pc02_prop_05")),
>> +};
>> +
>> +/* rop_waits_for_fb */
>> +static const struct nv50_hw_pm_event_cfg
>> +nv50_rop_waits_for_fb_event =
>> +{
>> + .name = "rop_waits_for_fb",
>> + .desc = "The amount of time the blending unit spent waiting
>> for data "
>> + "from the frame buffer unit. If blending is enabled
>> and there "
>> + "is a lot of traffic here (since this is a
>> read/modify/write "
>> + "operation) this can become a bottleneck.",
>> + .display = NV50_HW_PM_EVENT_DISPLAY_RATIO,
>> + .count = NV50_HW_PM_EVENT_COUNT_SIMPLE,
>> + .domain = 2,
>> +};
>> +
>> +static const struct nv50_hw_pm_query_cfg
>> +nv50_rop_waits_for_fb_query =
>> +{
>> + .event = &nv50_rop_waits_for_fb_event,
>> + .ctr[0] = CTR(0x22f2, SIG("pc02_crop_03",
>> + SRC("pgraph_rop0_crop_pm_mux_sel0", 0x0)),
>> + SIG("pc02_crop_02"),
>> + SIG("pc02_zrop_03",
>> + SRC("pgraph_rop0_zrop_pm_mux_sel0", 0x0)),
>> + SIG("pc02_zrop_02")),
>> +};
>> +
>> +/* rop_waits_for_shader */
>> +static const struct nv50_hw_pm_event_cfg
>> +nv50_rop_waits_for_shader_event =
>> +{
>> + .name = "rop_waits_for_shader",
>> + .desc = "This is a measurement of how often the blending unit
>> was "
>> + "waiting on new work (fragments to be placed into the
>> render "
>> + "target). If the pixel shaders are particularly
>> expensive, the "
>> + "ROP unit could be starved waiting for results.",
>> + .display = NV50_HW_PM_EVENT_DISPLAY_RATIO,
>> + .count = NV50_HW_PM_EVENT_COUNT_SIMPLE,
>> + .domain = 2,
>> +};
>> +
>> +static const struct nv50_hw_pm_query_cfg
>> +nv50_rop_waits_for_shader_query =
>> +{
>> + .event = &nv50_rop_waits_for_shader_event,
>> + .ctr[0] = CTR(0x2222, SIG("pc02_prop_6",
>> + SRC("pgraph_tpc0_prop_pm_mux_sel", 0x0)),
>> + SIG("pc02_prop_7")),
>> +};
>> +
>> +/* rop_samples_killed_by_earlyz_count */
>> +static const struct nv50_hw_pm_event_cfg
>> +nv50_rop_samples_killed_by_earlyz_event =
>> +{
>> + .name = "rop_samples_killed_by_earlyz_count",
>> + .desc = "This returns the number of pixels that were killed in
>> the "
>> + "earlyZ hardware. This signal will give you an idea
>> of, for "
>> + "instance, a Z only pass was successful in setting up
>> the depth "
>> + "buffer.",
>> + .display = NV50_HW_PM_EVENT_DISPLAY_RAW,
>> + .count = NV50_HW_PM_EVENT_COUNT_B6,
>> + .domain = 2,
>> +};
>> +
>> +static const struct nv50_hw_pm_query_cfg
>> +nv50_rop_samples_killed_by_earlyz_query =
>> +{
>> + .event = &nv50_rop_samples_killed_by_earlyz_event,
>> + .ctr[1] = CTR(0xffff, SIG("pc02_prop_00",
>> + SRC("pgraph_tpc0_prop_pm_mux_sel", 0x1a)),
>> + SIG("pc02_prop_01"),
>> + SIG("pc02_prop_02"),
>> + SIG("pc02_prop_03")),
>> + .ctr[2] = CTR(0x5555, SIG("pc02_prop_07"),
>> + SIG("pc02_trailer"),
>> + SIG("pc02_prop_04"),
>> + SIG("pc02_prop_05")),
>> +};
>> +
>> +/* rop_samples_killed_by_latez_count */
>> +static const struct nv50_hw_pm_event_cfg
>> +nv50_rop_samples_killed_by_latez_event =
>> +{
>> + .name = "rop_samples_killed_by_latez_count",
>> + .desc = "This returns the number of pixels that were killed
>> after the "
>> + "pixel shader ran. This can happen if the early Z is
>> unable to "
>> + "cull the pixel because of an API setup issue like
>> changing the "
>> + "Z direction or modifying Z in the pixel shader.",
>> + .display = NV50_HW_PM_EVENT_DISPLAY_RAW,
>> + .count = NV50_HW_PM_EVENT_COUNT_B6,
>> + .domain = 2,
>> +};
>> +
>> +static const struct nv50_hw_pm_query_cfg
>> +nv50_rop_samples_killed_by_latez_query =
>> +{
>> + .event = &nv50_rop_samples_killed_by_latez_event,
>> + .ctr[1] = CTR(0xffff, SIG("pc02_prop_00",
>> + SRC("pgraph_tpc0_prop_pm_mux_sel", 0x1b)),
>> + SIG("pc02_prop_01"),
>> + SIG("pc02_prop_02"),
>> + SIG("pc02_prop_03")),
>> + .ctr[2] = CTR(0x5555, SIG("pc02_prop_07"),
>> + SIG("pc02_trailer"),
>> + SIG("pc02_prop_04"),
>> + SIG("pc02_prop_05")),
>> +};
>> +
>> +/*
>> + * TEXTURE
>> + */
>> +/* tex_cache_miss */
>> +static const struct nv50_hw_pm_event_cfg
>> +nv50_tex_cache_miss_event =
>> +{
>> + .name = "tex_cache_miss",
>> + .desc = "Number of texture cache misses.",
>> + .display = NV50_HW_PM_EVENT_DISPLAY_RAW,
>> + .count = NV50_HW_PM_EVENT_COUNT_SIMPLE,
>> + .domain = 2,
>> +};
>> +
>> +static const struct nv50_hw_pm_query_cfg
>> +nv50_tex_cache_miss_query =
>> +{
>> + .event = &nv50_tex_cache_miss_event,
>> + .ctr[0] = CTR(0xaaaa, SIG("pc02_tex_04",
>> + SRC("pgraph_tpc0_tex_unk08_unk0",
>> 0x200))),
>> +};
>> +
>> +static const struct nv50_hw_pm_query_cfg
>> +nv84_tex_cache_miss_query =
>> +{
>> + .event = &nv50_tex_cache_miss_event,
>> + .ctr[0] = CTR(0xaaaa, SIG("pc02_tex_04",
>> + SRC("pgraph_tpc0_tex_unk08_unk0",
>> 0x800))),
>> +};
>> +
>> +/* tex_cache_hit */
>> +static const struct nv50_hw_pm_event_cfg
>> +nv50_tex_cache_hit_event =
>> +{
>> + .name = "tex_cache_hit",
>> + .desc = "Number of texture cache hits.",
>> + .display = NV50_HW_PM_EVENT_DISPLAY_RAW,
>> + .count = NV50_HW_PM_EVENT_COUNT_SIMPLE,
>> + .domain = 2,
>> +};
>> +
>> +static const struct nv50_hw_pm_query_cfg
>> +nv50_tex_cache_hit_query =
>> +{
>> + .event = &nv50_tex_cache_hit_event,
>> + .ctr[0] = CTR(0xaaaa, SIG("pc02_tex_05",
>> + SRC("pgraph_tpc0_tex_unk08_unk0",
>> 0x200))),
>> +};
>> +
>> +static const struct nv50_hw_pm_query_cfg
>> +nv84_tex_cache_hit_query =
>> +{
>> + .event = &nv50_tex_cache_hit_event,
>> + .ctr[0] = CTR(0xaaaa, SIG("pc02_tex_05",
>> + SRC("pgraph_tpc0_tex_unk08_unk0",
>> 0x800))),
>> +};
>> +
>> +/* tex_waits_for_fb */
>> +static const struct nv50_hw_pm_event_cfg
>> +nv50_tex_waits_for_fb_event =
>> +{
>> + .name = "tex_waits_for_fb",
>> + .desc = "This is the amount of time the texture unit spent
>> waiting on "
>> + "samples to return from the frame buffer unit. It is a
>> potential "
>> + "indication of poor texture cache utilization.",
>> + .display = NV50_HW_PM_EVENT_DISPLAY_RATIO,
>> + .count = NV50_HW_PM_EVENT_COUNT_SIMPLE,
>> + .domain = 2,
>> +};
>> +
>> +static const struct nv50_hw_pm_query_cfg
>> +nv50_tex_waits_for_fb_query =
>> +{
>> + .event = &nv50_tex_waits_for_fb_event,
>> + .ctr[0] = CTR(0xaaaa, SIG("pc02_tex_06",
>> + SRC("pgraph_tpc0_tex_unk08_unk0",
>> 0x200))),
>> +};
>> +
>> +static const struct nv50_hw_pm_query_cfg
>> +nv84_tex_waits_for_fb_query =
>> +{
>> + .event = &nv50_tex_waits_for_fb_event,
>> + .ctr[0] = CTR(0xaaaa, SIG("pc02_tex_06",
>> + SRC("pgraph_tpc0_tex_unk08_unk0",
>> 0x800))),
>> +};
>> +
>> +static const struct nv50_hw_pm_query_cfg
>> *nv50_hw_pm_queries[NV50_HW_PM_QUERY_COUNT];
>> +
>> +#define _Q(n, q) nv50_hw_pm_queries[NV50_HW_PM_QUERY_##n] = &q;
>> +
>> +static void
>> +nv50_identify_events(struct nv50_screen *screen)
>> +{
>> + _Q(GPU_IDLE, nv50_gpu_idle_query);
>> + _Q(IA_BUSY, nv50_ia_busy_query);
>> + _Q(IA_WAITS_FOR_FB, nv50_ia_waits_for_fb_query);
>> + _Q(VERTEX_ATTR_COUNT, nv50_vertex_attr_count_query);
>> + _Q(GEOM_VERTEX_IN_COUNT, nv50_geom_vertex_in_count_query);
>> + _Q(GEOM_VERTEX_OUT_COUNT, nv50_geom_vertex_out_count_query);
>> + _Q(GEOM_PRIMITIVE_IN_COUNT, nv50_geom_primitive_in_count_query);
>> + _Q(GEOM_PRIMITIVE_OUT_COUNT, nv50_geom_primitive_out_count_query);
>> + _Q(SO_BUSY, nv50_so_busy_query);
>> + _Q(SETUP_PRIMITIVE_COUNT, nv50_setup_primitive_count_query);
>> + _Q(SETUP_POINT_COUNT, nv50_setup_point_count_query);
>> + _Q(SETUP_LINE_COUNT, nv50_setup_line_count_query);
>> + _Q(SETUP_TRIANGLE_COUNT, nv50_setup_triangle_count_query);
>> + _Q(SETUP_PRIMITIVE_CULLED_COUNT,
>> nv50_setup_primitive_culled_count_query);
>> + _Q(RAST_TILES_KILLED_BY_ZCULL,
>> nv50_rast_tiles_killed_by_zcull_query);
>> + _Q(RAST_TILES_IN_COUNT, nv50_rast_tiles_in_count_query);
>> + _Q(ROP_BUSY, nv50_rop_busy_query);
>> + _Q(ROP_WAITS_FOR_FB, nv50_rop_waits_for_fb_query);
>> + _Q(ROP_WAITS_FOR_SHADER, nv50_rop_waits_for_shader_query);
>> + _Q(ROP_SAMPLES_KILLED_BY_EARLYZ,
>> nv50_rop_samples_killed_by_earlyz_query);
>> + _Q(ROP_SAMPLES_KILLED_BY_LATEZ,
>> nv50_rop_samples_killed_by_latez_query );
>> + _Q(TEX_CACHE_MISS, nv50_tex_cache_miss_query);
>> + _Q(TEX_CACHE_HIT, nv50_tex_cache_hit_query);
>> + _Q(TEX_WAITS_FOR_FB, nv50_tex_waits_for_fb_query);
>> +
>> + if (screen->base.class_3d >= NV84_3D_CLASS) {
>> + /* Variants for NV84+ */
>> + _Q(TEX_CACHE_MISS, nv84_tex_cache_miss_query);
>> + _Q(TEX_CACHE_HIT, nv84_tex_cache_hit_query);
>> + _Q(TEX_WAITS_FOR_FB, nv84_tex_waits_for_fb_query);
>> + }
>> +
>> + if (screen->base.class_3d >= NVA0_3D_CLASS) {
>> + /* Variants for NVA0+ */
>> + _Q(IA_BUSY, nva0_ia_busy_query);
>> + _Q(IA_WAITS_FOR_FB, nva0_ia_waits_for_fb_query);
>> + _Q(VERTEX_ATTR_COUNT, nva0_vertex_attr_count_query);
>> + }
>> +}
>> +
>> +#undef _Q
>> +
>> +#ifdef DEBUG
> Same as above, get rid of this test.
>> +static void
>> +nv50_hw_pm_dump_perfdom(struct nvif_perfdom_v0 *args)
>> +{
>> + int i, j, k;
>> +
>> + debug_printf("PERFDOM CONFIGURATION:\n");
>> + debug_printf("domaine: 0x%02x\n", args->domain);
>> + debug_printf("mode: 0x%02x\n", args->mode);
>> + for (i = 0; i < 4; i++) {
>> + uint32_t signal = 0;
>> + for (j = 0; j < 4; j++)
>> + signal |= args->ctr[i].signal[j] << (j * 8);
>> +
>> + debug_printf("ctr[%d]: func = 0x%04x, signal=0x%08x\n",
>> + i, args->ctr[i].logic_op, signal);
>> +
>> + for (j = 0; j < 4; j++) {
>> + for (k = 0; k < 8; k++) {
>> + uint32_t source, value;
>> + if (!args->ctr[i].source[j][k])
>> + continue;
>> +
>> + source = args->ctr[i].source[j][k];
>> + value = args->ctr[i].source[j][k] >> 32;
>> + debug_printf(" src[%d][%d]: source = 0x%08x, value =
>> 0x%08x\n",
>> + j, k, source, value);
>> + }
>> + }
>> + }
>> +}
>> +#endif
>> +
>> +static const struct nv50_hw_pm_query_cfg *
>> +nv50_hw_pm_query_get_cfg(struct nv50_screen *screen, uint32_t
>> query_type)
>> +{
>> + return nv50_hw_pm_queries[query_type - NV50_HW_PM_QUERY(0)];
> No check that you have a valid query_type? As in, query_type -
> NV50_HW_PM_QUERY(0) <= NV50_HW_PM_QUERY_LAST).
This should never happen because we don't allow creating a query with
an invalid type.
The behaviour is the same on nvc0, btw.
>> +}
>> +
>> +static boolean
>> +nv50_hw_pm_query_create(struct nv50_context *nv50, struct nv50_query
>> *q)
>> +{
>> + struct nv50_screen *screen = nv50->screen;
>> + struct nouveau_perfmon *perfmon = screen->base.perfmon;
>> + static const struct nv50_hw_pm_query_cfg *cfg;
>> + struct nvif_perfdom_v0 args = {};
>> + struct nouveau_perfmon_dom *dom;
>> + int i, j, k;
>> + int ret;
>> +
>> + if (!screen->pm.num_active) {
>> + /* TODO: Currently, only one query type can be monitored
>> simultaneously
>> + * because the Gallium's HUD doesn't fit well with the perfdom
>> interface.
>> + *
>> + * With two different query types, the current scenario is as
>> follows:
>> + * CREATE Q1, BEGIN Q1, CREATE Q2, BEGIN Q2, END Q1, RESULT
>> Q1, BEGIN Q1,
>> + * END Q2, RESULT Q2, BEGIN Q2, END Q1, and so on.
>> + *
>> + * This behaviour doesn't allow to schedule multiple counters
>> because
>> + * we have to do that at query creation (ie. when a perfdom is
>> created).
>> + *
>> + * To get rid of this limitation, a better scenario would be:
>> + * CREATE Q1, CREATE Q2, BEGIN Q1, BEGIN Q2, END Q1, END Q2,
>> RESULT Q1,
>> + * RESULT Q2, BEGIN Q1, BEGIN Q2, END Q1, and so on.
>> + *
>> + * With this kind of behaviour, we could introduce
>> + * {create,begin,end}_all_queries() functions to be able to
>> configure
>> + * all queries in one shot.
>> + */
>> + screen->pm.query_type = q->type;
>> + }
>> + screen->pm.num_active++;
>> +
>> + if (screen->pm.query_type != q->type) {
>> + NOUVEAU_ERR("Only one query type can be monitored at the same
>> time!");
>> + return FALSE;
>> + }
>> +
>> + cfg = nv50_hw_pm_query_get_cfg(nv50->screen, q->type);
>> +
>> + dom = nouveau_perfmon_get_dom_by_id(perfmon, cfg->event->domain);
>> + if (!dom) {
>> + NOUVEAU_ERR("Failed to find domain %d\n", cfg->event->domain);
>> + return FALSE;
>> + }
>> +
>> + /* configure domain and counting mode */
>> + args.domain = dom->id;
>> + args.mode = cfg->event->count;
>> +
>> + /* configure counters for this hardware event */
>> + for (i = 0; i < ARRAY_SIZE(cfg->ctr); i++) {
>> + const struct nv50_hw_pm_counter_cfg *sctr = &cfg->ctr[i];
>> +
>> + if (!sctr->logic_op)
>> + continue;
>> + args.ctr[i].logic_op = sctr->logic_op;
>> +
>> + /* configure signals for this counter */
>> + for (j = 0; j < ARRAY_SIZE(sctr->sig); j++) {
>> + const struct nv50_hw_pm_signal_cfg *ssig = &sctr->sig[j];
>> + struct nouveau_perfmon_sig *sig;
>> +
>> + if (!ssig->name)
>> + continue;
>> +
>> + sig = nouveau_perfmon_get_sig_by_name(dom, ssig->name);
>> + if (!sig) {
>> + NOUVEAU_ERR("Failed to find signal %s\n", ssig->name);
>> + return FALSE;
>> + }
>> + args.ctr[i].signal[j] = sig->signal;
>> +
>> + /* configure sources for this signal */
>> + for (k = 0; k < ARRAY_SIZE(ssig->src); k++) {
>> + const struct nv50_hw_pm_source_cfg *ssrc = &ssig->src[k];
>> + struct nouveau_perfmon_src *src;
>> +
>> + if (!ssrc->name)
>> + continue;
>> +
>> + src = nouveau_perfmon_get_src_by_name(sig, ssrc->name);
>> + if (!src) {
>> + NOUVEAU_ERR("Failed to find source %s\n", ssrc->name);
>> + return FALSE;
>> + }
>> + args.ctr[i].source[j][k] = (ssrc->value << 32) | src->id;
>> + }
>> + }
>> + }
>> +
>> +#ifdef DEBUG
>> + if (debug_get_num_option("NV50_PM_DEBUG", 0))
>> + nv50_hw_pm_dump_perfdom(&args);
>> +#endif
>> +
>> + ret = nouveau_object_new(perfmon->object, perfmon->handle++,
>> + NVIF_IOCTL_NEW_V0_PERFDOM,
>> + &args, sizeof(args), &q->perfdom);
>> + if (ret) {
>> + NOUVEAU_ERR("Failed to create perfdom object: %d\n", ret);
>> + return FALSE;
>> + }
>> +
>> + return TRUE;
>> +}
>> +
>> +static void
>> +nv50_hw_pm_query_destroy(struct nv50_context *nv50, struct
>> nv50_query *q)
>> +{
>> + struct nv50_screen *screen = nv50->screen;
>> +
>> + nouveau_object_del(&q->perfdom);
>> + screen->pm.num_active--;
>> +}
>> +
>> +static boolean
>> +nv50_hw_pm_query_begin(struct nv50_context *nv50, struct nv50_query *q)
>> +{
>> + struct nouveau_pushbuf *push = nv50->base.pushbuf;
>> +
>> + /* start the next batch of counters */
>> + PUSH_SPACE(push, 2);
>> + BEGIN_NV04(push, SUBC_SW(0x0608), 1);
> Put this sw method in libdrm?
>> + PUSH_DATA (push, q->perfdom->handle);
>> +
>> + return TRUE;
>> +}
>> +
>> +static void
>> +nv50_hw_pm_query_end(struct nv50_context *nv50, struct nv50_query *q)
>> +{
>> + struct nouveau_pushbuf *push = nv50->base.pushbuf;
>> + struct nv50_screen *screen = nv50->screen;
>> +
>> + /* set sequence field (used to check if result is available) */
>> + q->sequence = ++screen->pm.sequence;
>> +
>> + /* sample the previous batch of counters */
>> + PUSH_SPACE(push, 4);
>> + BEGIN_NV04(push, SUBC_SW(0x060c), 1);
>> + PUSH_DATA (push, q->perfdom->handle);
>> +
>> + /* read back counters values */
>> + BEGIN_NV04(push, SUBC_SW(0x0700), 1);
>> + PUSH_DATA (push, screen->pm.sequence);
>> +}
>> +
>> +static volatile void *
>> +nv50_ntfy(struct nv50_screen *screen)
>> +{
>> + struct nv04_notify *query = screen->query->data;
>> + struct nouveau_bo *notify = screen->notify_bo;
>> +
>> + return (char *)notify->map + query->offset;
>> +}
>> +
>> +static INLINE uint32_t
>> +nv50_hw_pm_query_get_offset(struct nv50_query *q)
>> +{
>> + return (1 + (q->sequence % NV50_HW_PM_RING_BUFFER_MAX_QUERIES) *
>> + NV50_HW_PM_RING_BUFFER_NUM_DOMAINS * 6);
>> +}
>> +
>> +static INLINE boolean
>> +nv50_hw_pm_query_read_data(struct nv50_context *nv50, struct
>> nv50_query *q,
>> + boolean wait, uint32_t ctr[4], uint32_t
>> *clk)
>> +{
>> + volatile uint32_t *ntfy = nv50_ntfy(nv50->screen);
>> + uint32_t offset = nv50_hw_pm_query_get_offset(q);
>> + boolean found = FALSE;
>> + int i;
>> +
>> + if (q->state != NV50_QUERY_STATE_READY) {
>> + if (!wait)
>> + return FALSE;
>> + if (!nouveau_fence_wait(q->fence))
>> + return FALSE;
>> + }
>> +
>> + if (ntfy[0] > q->sequence + NV50_HW_PM_RING_BUFFER_MAX_QUERIES -
>> 1) {
>> + /* Results in the ring buffer are too old, throw away that
>> query. */
>> + return FALSE;
>> + }
>> +
>> + for (i = 0; i < NV50_HW_PM_RING_BUFFER_NUM_DOMAINS; i++) {
>> + if (ntfy[offset + i * 6] == q->perfdom->handle) {
>> + found = TRUE;
>> + break;
>> + }
>> + }
>> +
>> + if (!found) {
>> + NOUVEAU_ERR("Failed to find perfdom object %" PRIu64 "!\n",
>> + q->perfdom->handle);
>> + return FALSE;
>> + }
>> +
>> + for (i = 0; i < 4; i++)
>> + ctr[i] = ntfy[offset + i + 1];
>> + *clk = ntfy[offset + 5];
>> +
>> + return TRUE;
>> +}
>> +
>> +static boolean
>> +nv50_hw_pm_query_result(struct nv50_context *nv50, struct nv50_query
>> *q,
>> + boolean wait, void *result)
>> +{
>> + struct nv50_screen *screen = nv50->screen;
>> + const struct nv50_hw_pm_query_cfg *cfg;
>> + uint32_t ctr[4], clk;
>> + uint64_t value = 0;
>> + int ret;
>> +
>> + ret = nv50_hw_pm_query_read_data(nv50, q, wait, ctr, &clk);
>> + if (!ret)
>> + return FALSE;
>> +
>> + cfg = nv50_hw_pm_query_get_cfg(screen, q->type);
>> + if (cfg->event->count == NV50_HW_PM_EVENT_COUNT_SIMPLE) {
>> + /* SIMPLE hardware events are sampled on PRE_CTR. */
>> + value = ctr[0];
>> + } else {
>> + /* EVENT_B4/EVENT_B6 hardware events are sampled on EVENT_CTR. */
>> + value = ctr[2];
>> + }
>> +
>> + if (cfg->event->display == NV50_HW_PM_EVENT_DISPLAY_RATIO) {
>> + if (clk)
>> + value = (value * 100) / (float)clk;
>> + }
>> +
>> + fprintf(stderr, "ctr[0]=%d, ctr[1]=%d, ctr[2]=%d, ctr[3]=%d,
>> clk=%d, val=%d\n",
>> + ctr[0], ctr[1], ctr[2], ctr[3], clk, value);
> The above is likely to be a leftover, right? :p
For debugging purposes, I'll remove it for the final version. ;)
>> +
>> + *(uint64_t *)result = value;
>> + return TRUE;
>> +}
>> +
>> void
>> nv50_init_query_functions(struct nv50_context *nv50)
>> {
>> diff --git a/src/gallium/drivers/nouveau/nv50/nv50_screen.h
>> b/src/gallium/drivers/nouveau/nv50/nv50_screen.h
>> index 71a5247..0449659 100644
>> --- a/src/gallium/drivers/nouveau/nv50/nv50_screen.h
>> +++ b/src/gallium/drivers/nouveau/nv50/nv50_screen.h
>> @@ -89,6 +89,12 @@ struct nv50_screen {
>> struct nouveau_bo *bo;
>> } fence;
>> + struct {
>> + uint32_t sequence;
>> + uint32_t query_type;
>> + uint32_t num_active;
>> + } pm;
>> +
>> struct nouveau_object *sync;
>> struct nouveau_object *query;
>> @@ -108,6 +114,35 @@ nv50_screen(struct pipe_screen *screen)
>> return (struct nv50_screen *)screen;
>> }
>> +/* Hardware global performance counters. */
>> +#define NV50_HW_PM_QUERY_COUNT 24
>> +#define NV50_HW_PM_QUERY(i) (PIPE_QUERY_DRIVER_SPECIFIC + (i))
>> +#define NV50_HW_PM_QUERY_LAST
>> NV50_HW_PM_QUERY(NV50_HW_PM_QUERY_COUNT - 1)
>> +#define NV50_HW_PM_QUERY_GPU_IDLE 0
>> +#define NV50_HW_PM_QUERY_IA_BUSY 1
>> +#define NV50_HW_PM_QUERY_IA_WAITS_FOR_FB 2
>> +#define NV50_HW_PM_QUERY_VERTEX_ATTR_COUNT 3
>> +#define NV50_HW_PM_QUERY_GEOM_VERTEX_IN_COUNT 4
>> +#define NV50_HW_PM_QUERY_GEOM_VERTEX_OUT_COUNT 5
>> +#define NV50_HW_PM_QUERY_GEOM_PRIMITIVE_IN_COUNT 6
>> +#define NV50_HW_PM_QUERY_GEOM_PRIMITIVE_OUT_COUNT 7
>> +#define NV50_HW_PM_QUERY_SO_BUSY 8
>> +#define NV50_HW_PM_QUERY_SETUP_PRIMITIVE_COUNT 9
>> +#define NV50_HW_PM_QUERY_SETUP_POINT_COUNT 10
>> +#define NV50_HW_PM_QUERY_SETUP_LINE_COUNT 11
>> +#define NV50_HW_PM_QUERY_SETUP_TRIANGLE_COUNT 12
>> +#define NV50_HW_PM_QUERY_SETUP_PRIMITIVE_CULLED_COUNT 13
>> +#define NV50_HW_PM_QUERY_RAST_TILES_KILLED_BY_ZCULL 14
>> +#define NV50_HW_PM_QUERY_RAST_TILES_IN_COUNT 15
>> +#define NV50_HW_PM_QUERY_ROP_BUSY 16
>> +#define NV50_HW_PM_QUERY_ROP_WAITS_FOR_FB 17
>> +#define NV50_HW_PM_QUERY_ROP_WAITS_FOR_SHADER 18
>> +#define NV50_HW_PM_QUERY_ROP_SAMPLES_KILLED_BY_EARLYZ 19
>> +#define NV50_HW_PM_QUERY_ROP_SAMPLES_KILLED_BY_LATEZ 20
>> +#define NV50_HW_PM_QUERY_TEX_CACHE_MISS 21
>> +#define NV50_HW_PM_QUERY_TEX_CACHE_HIT 22
>> +#define NV50_HW_PM_QUERY_TEX_WAITS_FOR_FB 23
>> +
>> boolean nv50_blitter_create(struct nv50_screen *);
>> void nv50_blitter_destroy(struct nv50_screen *);
> Congrats, it looks really clean! With the above fixed, this patch is
> Reviewed-by: Martin Peres <martin.peres at free.fr>
> _______________________________________________
> mesa-dev mailing list
> mesa-dev at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/mesa-dev
More information about the mesa-dev
mailing list