[Mesa-dev] [PATCH] nvc0: add support for performance monitoring metrics on Fermi

Ilia Mirkin imirkin at alum.mit.edu
Fri Oct 16 14:57:02 PDT 2015


On Fri, Oct 16, 2015 at 5:35 PM, Samuel Pitoiset
<samuel.pitoiset at gmail.com> wrote:
>
>
> On 10/16/2015 11:22 PM, Ilia Mirkin wrote:
>>
>> On Fri, Oct 16, 2015 at 5:29 PM, Samuel Pitoiset
>> <samuel.pitoiset at gmail.com> wrote:
>>>
>>> As explained in the CUDA toolkit documentation, "a metric is a
>>> characteristic of an application that is calculated from one or more
>>> event values."
>>>
>>> Signed-off-by: Samuel Pitoiset <samuel.pitoiset at gmail.com>
>>> ---
>>>   src/gallium/drivers/nouveau/Makefile.sources       |   2 +
>>>   src/gallium/drivers/nouveau/nvc0/nvc0_query_hw.c   |  19 +-
>>>   .../drivers/nouveau/nvc0/nvc0_query_hw_metric.c    | 444
>>> +++++++++++++++++++++
>>>   .../drivers/nouveau/nvc0/nvc0_query_hw_metric.h    |  42 ++
>>>   4 files changed, 504 insertions(+), 3 deletions(-)
>>>   create mode 100644
>>> src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_metric.c
>>>   create mode 100644
>>> src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_metric.h
>>>
>>> diff --git a/src/gallium/drivers/nouveau/Makefile.sources
>>> b/src/gallium/drivers/nouveau/Makefile.sources
>>> index edc6cf4..c18e9f5 100644
>>> --- a/src/gallium/drivers/nouveau/Makefile.sources
>>> +++ b/src/gallium/drivers/nouveau/Makefile.sources
>>> @@ -154,6 +154,8 @@ NVC0_C_SOURCES := \
>>>          nvc0/nvc0_query.h \
>>>          nvc0/nvc0_query_hw.c \
>>>          nvc0/nvc0_query_hw.h \
>>> +       nvc0/nvc0_query_hw_metric.c \
>>> +       nvc0/nvc0_query_hw_metric.h \
>>>          nvc0/nvc0_query_hw_sm.c \
>>>          nvc0/nvc0_query_hw_sm.h \
>>>          nvc0/nvc0_query_sw.c \
>>> diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw.c
>>> b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw.c
>>> index 91254be..90ee82f 100644
>>> --- a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw.c
>>> +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw.c
>>> @@ -25,6 +25,7 @@
>>>
>>>   #include "nvc0/nvc0_context.h"
>>>   #include "nvc0/nvc0_query_hw.h"
>>> +#include "nvc0/nvc0_query_hw_metric.h"
>>>   #include "nvc0/nvc0_query_hw_sm.h"
>>>
>>>   #define NVC0_HW_QUERY_STATE_READY   0
>>> @@ -371,6 +372,12 @@ nvc0_hw_create_query(struct nvc0_context *nvc0,
>>> unsigned type, unsigned index)
>>>         return (struct nvc0_query *)hq;
>>>      }
>>>
>>> +   hq = nvc0_hw_metric_create_query(nvc0, type);
>>> +   if (hq) {
>>> +      hq->base.funcs = &hw_query_funcs;
>>> +      return (struct nvc0_query *)hq;
>>> +   }
>>> +
>>>      hq = CALLOC_STRUCT(nvc0_hw_query);
>>>      if (!hq)
>>>         return NULL;
>>> @@ -435,14 +442,20 @@ int
>>>   nvc0_hw_get_driver_query_info(struct nvc0_screen *screen, unsigned id,
>>>                                 struct pipe_driver_query_info *info)
>>>   {
>>> -   int num_hw_sm_queries = 0;
>>> +   int num_hw_sm_queries = 0, num_hw_metric_queries = 0;
>>>
>>>      num_hw_sm_queries = nvc0_hw_sm_get_driver_query_info(screen, 0,
>>> NULL);
>>> +   num_hw_metric_queries =
>>> +      nvc0_hw_metric_get_driver_query_info(screen, 0, NULL);
>>>
>>>      if (!info)
>>> -      return num_hw_sm_queries;
>>> +      return num_hw_sm_queries + num_hw_metric_queries;
>>> +
>>> +   if (id < num_hw_sm_queries)
>>> +      return nvc0_hw_sm_get_driver_query_info(screen, id, info);
>>>
>>> -   return nvc0_hw_sm_get_driver_query_info(screen, id, info);
>>> +   return nvc0_hw_metric_get_driver_query_info(screen,
>>> +                                               id - num_hw_sm_queries,
>>> info);
>>>   }
>>>
>>>   void
>>> diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_metric.c
>>> b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_metric.c
>>> new file mode 100644
>>> index 0000000..dbe350a
>>> --- /dev/null
>>> +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_metric.c
>>> @@ -0,0 +1,444 @@
>>> +/*
>>> + * Copyright 2015 Samuel Pitoiset
>>> + *
>>> + * Permission is hereby granted, free of charge, to any person obtaining
>>> a
>>> + * copy of this software and associated documentation files (the
>>> "Software"),
>>> + * to deal in the Software without restriction, including without
>>> limitation
>>> + * the rights to use, copy, modify, merge, publish, distribute,
>>> sublicense,
>>> + * and/or sell copies of the Software, and to permit persons to whom the
>>> + * Software is furnished to do so, subject to the following conditions:
>>> + *
>>> + * The above copyright notice and this permission notice shall be
>>> included in
>>> + * all copies or substantial portions of the Software.
>>> + *
>>> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
>>> EXPRESS OR
>>> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
>>> MERCHANTABILITY,
>>> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT
>>> SHALL
>>> + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
>>> + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
>>> + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
>>> + * OTHER DEALINGS IN THE SOFTWARE.
>>> + */
>>> +
>>> +#include "nvc0/nvc0_context.h"
>>> +#include "nvc0/nvc0_query_hw_metric.h"
>>> +#include "nvc0/nvc0_query_hw_sm.h"
>>> +
>>> +/* === PERFORMANCE MONITORING METRICS for NVC0:NVE4 === */
>>> +static const char *nvc0_hw_metric_names[] =
>>> +{
>>> +   "metric-achieved_occupancy",
>>> +   "metric-branch_efficiency",
>>> +   "metric-inst_issued",
>>> +   "metric-inst_per_wrap",
>>> +   "metric-inst_replay_overhead",
>>> +   "metric-issued_ipc",
>>> +   "metric-issue_slots",
>>> +   "metric-issue_slot_utilization",
>>> +   "metric-ipc",
>>> +};
>>> +
>>> +struct nvc0_hw_metric_query_cfg {
>>> +   uint32_t queries[8];
>>> +   uint32_t num_queries;
>>> +};
>>> +
>>> +#define _SM(n) NVC0_HW_SM_QUERY(NVC0_HW_SM_QUERY_ ##n)
>>> +#define _M(n, c) [NVC0_HW_METRIC_QUERY_##n] = c
>>> +
>>> +/* ==== Compute capability 2.0 (GF100/GF110) ==== */
>>> +static const struct nvc0_hw_metric_query_cfg
>>> +sm20_achieved_occupancy =
>>> +{
>>> +   .queries[0]  = _SM(ACTIVE_WARPS),
>>> +   .queries[1]  = _SM(ACTIVE_CYCLES),
>>> +   .num_queries = 2,
>>> +};
>>> +
>>> +static const struct nvc0_hw_metric_query_cfg
>>> +sm20_branch_efficiency =
>>> +{
>>> +   .queries[0]  = _SM(BRANCH),
>>> +   .queries[1]  = _SM(DIVERGENT_BRANCH),
>>> +   .num_queries = 2,
>>> +};
>>> +
>>> +static const struct nvc0_hw_metric_query_cfg
>>> +sm20_inst_per_wrap =
>>> +{
>>> +   .queries[0]  = _SM(INST_EXECUTED),
>>> +   .queries[1]  = _SM(WARPS_LAUNCHED),
>>> +   .num_queries = 2,
>>> +};
>>> +
>>> +static const struct nvc0_hw_metric_query_cfg
>>> +sm20_inst_replay_overhead =
>>> +{
>>> +   .queries[0]  = _SM(INST_ISSUED),
>>> +   .queries[1]  = _SM(INST_EXECUTED),
>>> +   .num_queries = 2,
>>> +};
>>> +
>>> +static const struct nvc0_hw_metric_query_cfg
>>> +sm20_issued_ipc =
>>> +{
>>> +   .queries[0]  = _SM(INST_ISSUED),
>>> +   .queries[1]  = _SM(ACTIVE_CYCLES),
>>> +   .num_queries = 2,
>>> +};
>>> +
>>> +static const struct nvc0_hw_metric_query_cfg
>>> +sm20_ipc =
>>> +{
>>> +   .queries[0]  = _SM(INST_EXECUTED),
>>> +   .queries[1]  = _SM(ACTIVE_CYCLES),
>>> +   .num_queries = 2,
>>> +};
>>> +
>>> +static const struct nvc0_hw_metric_query_cfg *sm20_hw_metric_queries[] =
>>> +{
>>> +   _M(ACHIEVED_OCCUPANCY,     &sm20_achieved_occupancy),
>>> +   _M(BRANCH_EFFICIENCY,      &sm20_branch_efficiency),
>>> +   _M(INST_ISSUED,            NULL),
>>> +   _M(INST_PER_WRAP,          &sm20_inst_per_wrap),
>>> +   _M(INST_REPLAY_OVERHEAD,   &sm20_inst_replay_overhead),
>>> +   _M(ISSUED_IPC,             &sm20_issued_ipc),
>>> +   _M(ISSUE_SLOTS,            NULL),
>>> +   _M(ISSUE_SLOT_UTILIZATION, &sm20_issued_ipc),
>>> +   _M(IPC,                    &sm20_ipc),
>>> +};
>>> +
>>> +/* ==== Compute capability 2.1 (GF108+ except GF110) ==== */
>>> +static const struct nvc0_hw_metric_query_cfg
>>> +sm21_inst_issued =
>>> +{
>>> +   .queries[0]  = _SM(INST_ISSUED1_0),
>>> +   .queries[1]  = _SM(INST_ISSUED1_1),
>>> +   .queries[2]  = _SM(INST_ISSUED2_0),
>>> +   .queries[3]  = _SM(INST_ISSUED2_1),
>>> +   .num_queries = 4,
>>> +};
>>> +
>>> +static const struct nvc0_hw_metric_query_cfg
>>> +sm21_inst_replay_overhead =
>>> +{
>>> +   .queries[0]  = _SM(INST_ISSUED1_0),
>>> +   .queries[1]  = _SM(INST_ISSUED1_1),
>>> +   .queries[2]  = _SM(INST_ISSUED2_0),
>>> +   .queries[3]  = _SM(INST_ISSUED2_1),
>>> +   .queries[4]  = _SM(INST_EXECUTED),
>>> +   .num_queries = 5,
>>> +};
>>> +
>>> +static const struct nvc0_hw_metric_query_cfg
>>> +sm21_issued_ipc =
>>> +{
>>> +   .queries[0]  = _SM(INST_ISSUED1_0),
>>> +   .queries[1]  = _SM(INST_ISSUED1_1),
>>> +   .queries[2]  = _SM(INST_ISSUED2_0),
>>> +   .queries[3]  = _SM(INST_ISSUED2_1),
>>> +   .queries[4]  = _SM(ACTIVE_CYCLES),
>>> +   .num_queries = 5,
>>> +};
>>> +
>>> +static const struct nvc0_hw_metric_query_cfg *sm21_hw_metric_queries[] =
>>> +{
>>> +   _M(ACHIEVED_OCCUPANCY,     &sm20_achieved_occupancy),
>>> +   _M(BRANCH_EFFICIENCY,      &sm20_branch_efficiency),
>>> +   _M(INST_ISSUED,            &sm21_inst_issued),
>>> +   _M(INST_PER_WRAP,          &sm20_inst_per_wrap),
>>> +   _M(INST_REPLAY_OVERHEAD,   &sm21_inst_replay_overhead),
>>> +   _M(ISSUED_IPC,             &sm21_issued_ipc),
>>> +   _M(ISSUE_SLOTS,            &sm21_inst_issued),
>>> +   _M(ISSUE_SLOT_UTILIZATION, &sm21_issued_ipc),
>>> +   _M(IPC,                    &sm20_ipc),
>>> +};
>>> +
>>> +#undef _SM
>>> +#undef _M
>>> +
>>> +static inline const struct nvc0_hw_metric_query_cfg **
>>> +nvc0_hw_metric_get_queries(struct nvc0_screen *screen)
>>> +{
>>> +   struct nouveau_device *dev = screen->base.device;
>>> +
>>> +   if (dev->chipset == 0xc0 || dev->chipset == 0xc8)
>>> +      return sm20_hw_metric_queries;
>>> +   return sm21_hw_metric_queries;
>>> +}
>>> +
>>> +static const struct nvc0_hw_metric_query_cfg *
>>> +nvc0_hw_metric_query_get_cfg(struct nvc0_context *nvc0,
>>> +                             struct nvc0_hw_query *hq)
>>> +{
>>> +   const struct nvc0_hw_metric_query_cfg **queries;
>>> +   struct nvc0_screen *screen = nvc0->screen;
>>> +   struct nvc0_query *q = &hq->base;
>>> +
>>> +   queries = nvc0_hw_metric_get_queries(screen);
>>> +   return queries[q->type - NVC0_HW_METRIC_QUERY(0)];
>>> +}
>>> +
>>> +static void
>>> +nvc0_hw_metric_destroy_query(struct nvc0_context *nvc0,
>>> +                             struct nvc0_hw_query *hq)
>>> +{
>>> +   struct nvc0_hw_metric_query *hmq = nvc0_hw_metric_query(hq);
>>> +   unsigned i;
>>> +
>>> +   for (i = 0; i < hmq->num_queries; i++)
>>> +      hmq->queries[i]->funcs->destroy_query(nvc0, hmq->queries[i]);
>>> +   FREE(hmq);
>>> +}
>>> +
>>> +static boolean
>>> +nvc0_hw_metric_begin_query(struct nvc0_context *nvc0, struct
>>> nvc0_hw_query *hq)
>>> +{
>>> +   struct nvc0_hw_metric_query *hmq = nvc0_hw_metric_query(hq);
>>> +   boolean ret = false;
>>> +   unsigned i;
>>> +
>>> +   for (i = 0; i < hmq->num_queries; i++) {
>>> +      ret = hmq->queries[i]->funcs->begin_query(nvc0, hmq->queries[i]);
>>> +      if (!ret)
>>> +         return ret;
>>> +   }
>>> +   return ret;
>>> +}
>>> +
>>> +static void
>>> +nvc0_hw_metric_end_query(struct nvc0_context *nvc0, struct nvc0_hw_query
>>> *hq)
>>> +{
>>> +   struct nvc0_hw_metric_query *hmq = nvc0_hw_metric_query(hq);
>>> +   unsigned i;
>>> +
>>> +   for (i = 0; i < hmq->num_queries; i++)
>>> +      hmq->queries[i]->funcs->end_query(nvc0, hmq->queries[i]);
>>> +}
>>> +
>>> +static uint64_t
>>> +sm20_hw_metric_calc_result(struct nvc0_hw_query *hq, uint64_t res64[8])
>>> +{
>>> +   uint64_t value = 0;
>>> +
>>> +   switch (hq->base.type - NVC0_HW_METRIC_QUERY(0)) {
>>> +   case NVC0_HW_METRIC_QUERY_ACHIEVED_OCCUPANCY:
>>> +      /* (active_warps / active_cycles) / max. number of warps on a MP
>>> */
>>> +      if (res64[1])
>>> +         value = (res64[0] / (float)res64[1]) / 48;
>>
>> Why isn't this all just "return ...." and then a return 0 at the end?
>> i.e. why do you have the value variable?
>
>
> Oh yes! it's better indeed.
>
>>
>> Also I don't know how big these values get, but you might want to use
>> doubles instead of floats.
>
>
> The HUD currently only supports 64-bit integers, and float will be enough in
> my opinion.
>

OK. One issue is that float only exactly represents integers up to
1<<24 (it has a 24-bit significand), so if you do
(float)res64[0] / (float)res64[1], and either operand is outside that
range, you'll lose accuracy on the division.


More information about the mesa-dev mailing list