[Mesa-dev] [PATCH] nvc0: add descriptions for hardware perf counters/metrics
Samuel Pitoiset
samuel.pitoiset at gmail.com
Wed May 25 18:53:41 UTC 2016
On 05/25/2016 08:52 PM, Ilia Mirkin wrote:
> On Wed, May 25, 2016 at 2:49 PM, Samuel Pitoiset
> <samuel.pitoiset at gmail.com> wrote:
>> The GALLIUM_HUD does not yet expose a description for each event, but
>> this might be useful for developers who want to have a long description
>> of hw perf counters directly in the source code.
>>
>> Signed-off-by: Samuel Pitoiset <samuel.pitoiset at gmail.com>
>> ---
>> .../drivers/nouveau/nvc0/nvc0_query_hw_metric.c | 65 +++-
>> .../drivers/nouveau/nvc0/nvc0_query_hw_sm.c | 354 +++++++++++++++++----
>> 2 files changed, 351 insertions(+), 68 deletions(-)
>>
>> diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_metric.c b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_metric.c
>> index 7ea9871..cd4ccce 100644
>> --- a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_metric.c
>> +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_metric.c
>> @@ -24,22 +24,65 @@
>> #include "nvc0/nvc0_query_hw_metric.h"
>> #include "nvc0/nvc0_query_hw_sm.h"
>>
>> -#define _Q(i,n,t) { NVC0_HW_METRIC_QUERY_##i, n, PIPE_DRIVER_QUERY_TYPE_##t }
>> +#define _Q(i,n,t,d) { NVC0_HW_METRIC_QUERY_##i, n, PIPE_DRIVER_QUERY_TYPE_##t, d }
>> struct nvc0_hw_metric_cfg {
>> unsigned id;
>> const char *name;
>> enum pipe_driver_query_type type;
>> + const char *desc;
>> } nvc0_hw_metric_queries[] = {
>> - _Q(ACHIEVED_OCCUPANCY, "metric-achieved_occupancy", PERCENTAGE ),
>> - _Q(BRANCH_EFFICIENCY, "metric-branch_efficiency", PERCENTAGE ),
>> - _Q(INST_ISSUED, "metric-inst_issued", UINT64 ),
>> - _Q(INST_PER_WRAP, "metric-inst_per_wrap", UINT64 ),
>> - _Q(INST_REPLAY_OVERHEAD, "metric-inst_replay_overhead", UINT64 ),
>> - _Q(ISSUED_IPC, "metric-issued_ipc", UINT64 ),
>> - _Q(ISSUE_SLOTS, "metric-issue_slots", UINT64 ),
>> - _Q(ISSUE_SLOT_UTILIZATION, "metric-issue_slot_utilization", PERCENTAGE ),
>> - _Q(IPC, "metric-ipc", UINT64 ),
>> - _Q(SHARED_REPLAY_OVERHEAD, "metric-shared_replay_overhead", UINT64 ),
>> + _Q(ACHIEVED_OCCUPANCY,
>> + "metric-achieved_occupancy",
>> + PERCENTAGE,
>> + "Ratio of the average active warps per active cycle to the maximum number"
>> + "of warps supported on a multiprocessor"),
>> +
>> + _Q(BRANCH_EFFICIENCY,
>> + "metric-branch_efficiency",
>> + PERCENTAGE,
>> + "Ratio of non-divergent branches to total branches"),
>> +
>> + _Q(INST_ISSUED,
>> + "metric-inst_issued",
>> + UINT64,
>> + "The number of instructions issued"),
>> +
>> + _Q(INST_PER_WRAP,
>> + "metric-inst_per_wrap",
>> + UINT64,
>> + "Average number of instructions executed by each warp"),
>> +
>> + _Q(INST_REPLAY_OVERHEAD,
>> + "metric-inst_replay_overhead",
>> + UINT64,
>> + "Average number of replays for each instruction executed"),
>> +
>> + _Q(ISSUED_IPC,
>> + "metric-issued_ipc",
>> + UINT64,
>> + "Instructions issued per cycle"),
>> +
>> + _Q(ISSUE_SLOTS,
>> + "metric-issue_slots",
>> + UINT64,
>> + "The number of issue slots used"),
>> +
>> + _Q(ISSUE_SLOT_UTILIZATION,
>> + "metric-issue_slot_utilization",
>> + PERCENTAGE,
>> + "Percentage of issue slots that issued at least one instruction, averaged"
>> + "across all cycles"),
>> +
>> + _Q(IPC,
>> + "metric-ipc",
>> + UINT64,
>> + "Instructions executed per cycle"),
>> +
>> + _Q(SHARED_REPLAY_OVERHEAD,
>> + "metric-shared_replay_overhead",
>> + UINT64,
>> + "Average number of replays due to shared memory conflicts for each"
>> + "instruction executed"),
>> };
>>
>> #undef _Q
>> diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.c b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.c
>> index d7659e2..9e9ecd1 100644
>> --- a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.c
>> +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.c
>> @@ -31,67 +31,307 @@
>> #include "nvc0/nvc0_compute.xml.h"
>>
>> /* NOTE: intentionally using the same names as NV */
>> -#define _Q(t, n) { NVC0_HW_SM_QUERY_##t, n }
>> +#define _Q(t, n, d) { NVC0_HW_SM_QUERY_##t, n, d }
>> struct {
>
> Can you make this static const? Or does it need to be used elsewhere?
I can do that.
>
>> unsigned type;
>> const char *name;
>> + const char *desc;
>> } nvc0_hw_sm_queries[] = {
>> - _Q(ACTIVE_CYCLES, "active_cycles" ),
>> - _Q(ACTIVE_WARPS, "active_warps" ),
>> - _Q(ATOM_CAS_COUNT, "atom_cas_count" ),
>> - _Q(ATOM_COUNT, "atom_count" ),
>> - _Q(BRANCH, "branch" ),
>> - _Q(DIVERGENT_BRANCH, "divergent_branch" ),
>> - _Q(GLD_REQUEST, "gld_request" ),
>> - _Q(GLD_MEM_DIV_REPLAY, "global_ld_mem_divergence_replays" ),
>> - _Q(GST_TRANSACTIONS, "global_store_transaction" ),
>> - _Q(GST_MEM_DIV_REPLAY, "global_st_mem_divergence_replays" ),
>> - _Q(GRED_COUNT, "gred_count" ),
>> - _Q(GST_REQUEST, "gst_request" ),
>> - _Q(INST_EXECUTED, "inst_executed" ),
>> - _Q(INST_ISSUED, "inst_issued" ),
>> - _Q(INST_ISSUED1, "inst_issued1" ),
>> - _Q(INST_ISSUED2, "inst_issued2" ),
>> - _Q(INST_ISSUED1_0, "inst_issued1_0" ),
>> - _Q(INST_ISSUED1_1, "inst_issued1_1" ),
>> - _Q(INST_ISSUED2_0, "inst_issued2_0" ),
>> - _Q(INST_ISSUED2_1, "inst_issued2_1" ),
>> - _Q(L1_GLD_HIT, "l1_global_load_hit" ),
>> - _Q(L1_GLD_MISS, "l1_global_load_miss" ),
>> - _Q(L1_GLD_TRANSACTIONS, "__l1_global_load_transactions" ),
>> - _Q(L1_GST_TRANSACTIONS, "__l1_global_store_transactions" ),
>> - _Q(L1_LOCAL_LD_HIT, "l1_local_load_hit" ),
>> - _Q(L1_LOCAL_LD_MISS, "l1_local_load_miss" ),
>> - _Q(L1_LOCAL_ST_HIT, "l1_local_store_hit" ),
>> - _Q(L1_LOCAL_ST_MISS, "l1_local_store_miss" ),
>> - _Q(L1_SHARED_LD_TRANSACTIONS, "l1_shared_load_transactions" ),
>> - _Q(L1_SHARED_ST_TRANSACTIONS, "l1_shared_store_transactions" ),
>> - _Q(LOCAL_LD, "local_load" ),
>> - _Q(LOCAL_LD_TRANSACTIONS, "local_load_transactions" ),
>> - _Q(LOCAL_ST, "local_store" ),
>> - _Q(LOCAL_ST_TRANSACTIONS, "local_store_transactions" ),
>> - _Q(NOT_PRED_OFF_INST_EXECUTED, "not_predicated_off_thread_inst_executed" ),
>> - _Q(PROF_TRIGGER_0, "prof_trigger_00" ),
>> - _Q(PROF_TRIGGER_1, "prof_trigger_01" ),
>> - _Q(PROF_TRIGGER_2, "prof_trigger_02" ),
>> - _Q(PROF_TRIGGER_3, "prof_trigger_03" ),
>> - _Q(PROF_TRIGGER_4, "prof_trigger_04" ),
>> - _Q(PROF_TRIGGER_5, "prof_trigger_05" ),
>> - _Q(PROF_TRIGGER_6, "prof_trigger_06" ),
>> - _Q(PROF_TRIGGER_7, "prof_trigger_07" ),
>> - _Q(SHARED_LD, "shared_load" ),
>> - _Q(SHARED_LD_REPLAY, "shared_load_replay" ),
>> - _Q(SHARED_ST, "shared_store" ),
>> - _Q(SHARED_ST_REPLAY, "shared_store_replay" ),
>> - _Q(SM_CTA_LAUNCHED, "sm_cta_launched" ),
>> - _Q(THREADS_LAUNCHED, "threads_launched" ),
>> - _Q(TH_INST_EXECUTED, "thread_inst_executed" ),
>> - _Q(TH_INST_EXECUTED_0, "thread_inst_executed_0" ),
>> - _Q(TH_INST_EXECUTED_1, "thread_inst_executed_1" ),
>> - _Q(TH_INST_EXECUTED_2, "thread_inst_executed_2" ),
>> - _Q(TH_INST_EXECUTED_3, "thread_inst_executed_3" ),
>> - _Q(UNCACHED_GLD_TRANSACTIONS, "uncached_global_load_transaction" ),
>> - _Q(WARPS_LAUNCHED, "warps_launched" ),
>> + _Q(ACTIVE_CYCLES,
>> + "active_cycles",
>> + "Number of cycles a multiprocessor has at least one active warp"),
>> +
>> + _Q(ACTIVE_WARPS,
>> + "active_warps",
>> + "Accumulated number of active warps per cycle. For every cycle it"
>> + "increments by the number of active warps in the cycle which can be in"
>
> I think you want spaces before the end quote. Otherwise words will run
> together. Here and below.
Yep, but the descriptions are not going to be exposed for now.
Anyway, I will fix it.
>
>> + "the range 0 to 64"),
>> +
>> + _Q(ATOM_CAS_COUNT,
>> + "atom_cas_count",
>> + "Number of warps executing atomic compare and swap operations. Increments"
>> + "by one if at least one thread in a warp executes the instruction."),
>> +
>> + _Q(ATOM_COUNT,
>> + "atom_count",
>> + "Number of warps executing atomic reduction operations. Increments by one"
>> + "if at least one thread in a warp executes the instruction"),
>> +
>> + _Q(BRANCH,
>> + "branch",
>> + "Number of branch instructions executed per warp on a multiprocessor"),
>> +
>> + _Q(DIVERGENT_BRANCH,
>> + "divergent_branch",
>> + "Number of divergent branches within a warp. This counter will be"
>> + "incremented by one if at least one thread in a warp diverges (that is,"
>> + "follows a different execution path) via a conditional branch"),
>> +
>> + _Q(GLD_REQUEST,
>> + "gld_request",
>> + "Number of executed load instructions where the state space is not"
>> + "specified and hence generic addressing is used, increments per warp on a"
>> + "multiprocessor. It can include the load operations from global,local and"
>> + "shared state space"),
>> +
>> + _Q(GLD_MEM_DIV_REPLAY,
>> + "global_ld_mem_divergence_replays",
>> + "Number of instruction replays for global memory loads. Instruction is"
>> + "replayed if the instruction is accessing more than one cache line of"
>> + "128 bytes. For each extra cache line access the counter is incremented"
>> + "by 1"),
>> +
>> + _Q(GST_TRANSACTIONS,
>> + "global_store_transaction",
>> + "Number of global store transactions. Increments by 1 per transaction."
>> + "Transaction can be 32/64/96/128B"),
>> +
>> + _Q(GST_MEM_DIV_REPLAY,
>> + "global_st_mem_divergence_replays",
>> + "Number of instruction replays for global memory stores. Instruction is"
>> + "replayed if the instruction is accessing more than one cache line of"
>> + "128 bytes. For each extra cache line access the counter is incremented"
>> + "by 1"),
>> +
>> + _Q(GRED_COUNT,
>> + "gred_count",
>> + "Number of warps executing reduction operations on global memory."
>> + "Increments by one if at least one thread in a warp executes the"
>> + "instruction"),
>> +
>> + _Q(GST_REQUEST,
>> + "gst_request",
>> + "Number of executed store instructions where the state space is not"
>> + "specified and hence generic addressing is used, increments per warp on a"
>> + "multiprocessor. It can include the store operations to global,local and"
>> + "shared state space"),
>> +
>> + _Q(INST_EXECUTED,
>> + "inst_executed",
>> + "Number of instructions executed, do not include replays"),
>> +
>> + _Q(INST_ISSUED,
>> + "inst_issued",
>> + "Number of instructions issued including replays"),
>> +
>> + _Q(INST_ISSUED1,
>> + "inst_issued1",
>> + "Number of single instruction issued per cycle"),
>> +
>> + _Q(INST_ISSUED2,
>> + "inst_issued2",
>> + "Number of dual instructions issued per cycle"),
>> +
>> + _Q(INST_ISSUED1_0,
>> + "inst_issued1_0",
>> + "Number of single instruction issued per cycle in pipeline 0"),
>> +
>> + _Q(INST_ISSUED1_1,
>> + "inst_issued1_1",
>> + "Number of single instruction issued per cycle in pipeline 1"),
>> +
>> + _Q(INST_ISSUED2_0,
>> + "inst_issued2_0",
>> + "Number of dual instructions issued per cycle in pipeline 0"),
>> +
>> + _Q(INST_ISSUED2_1,
>> + "inst_issued2_1",
>> + "Number of dual instructions issued per cycle in pipeline 1"),
>> +
>> + _Q(L1_GLD_HIT,
>> + "l1_global_load_hit",
>> + "Number of cache lines that hit in L1 cache for global memory load"
>> + "accesses. In case of perfect coalescing this increments by 1,2, and 4 for"
>> + "32, 64 and 128 bit accesses by a warp respectively"),
>> +
>> + _Q(L1_GLD_MISS,
>> + "l1_global_load_miss",
>> + "Number of cache lines that miss in L1 cache for global memory load"
>> + "accesses. In case of perfect coalescing this increments by 1,2, and 4 for"
>> + "32, 64 and 128 bit accesses by a warp respectively"),
>> +
>> + _Q(L1_GLD_TRANSACTIONS,
>> + "__l1_global_load_transactions",
>> + "Number of global load transactions from L1 cache. Increments by 1 per"
>> + "transaction. Transaction can be 32/64/96/128B"),
>> +
>> + _Q(L1_GST_TRANSACTIONS,
>> + "__l1_global_store_transactions",
>> + "Number of global store transactions from L1 cache. Increments by 1 per"
>> + "transaction. Transaction can be 32/64/96/128B"),
>> +
>> + _Q(L1_LOCAL_LD_HIT,
>> + "l1_local_load_hit",
>> + "Number of cache lines that hit in L1 cache for local memory load"
>> + "accesses. In case of perfect coalescing this increments by 1,2, and 4 for"
>> + "32, 64 and 128 bit accesses by a warp respectively"),
>> +
>> + _Q(L1_LOCAL_LD_MISS,
>> + "l1_local_load_miss",
>> + "Number of cache lines that miss in L1 cache for local memory load"
>> + "accesses. In case of perfect coalescing this increments by 1,2, and 4 for"
>> + "32, 64 and 128 bit accesses by a warp respectively"),
>> +
>> + _Q(L1_LOCAL_ST_HIT,
>> + "l1_local_store_hit",
>> + "Number of cache lines that hit in L1 cache for local memory store"
>> + "accesses. In case of perfect coalescing this increments by 1,2, and 4 for"
>> + "32, 64 and 128 bit accesses by a warp respectively"),
>> +
>> + _Q(L1_LOCAL_ST_MISS,
>> + "l1_local_store_miss",
>> + "Number of cache lines that miss in L1 cache for local memory store"
>> + "accesses. In case of perfect coalescing this increments by 1,2, and 4 for"
>> + "32,64 and 128 bit accesses by a warp respectively"),
>> +
>> + _Q(L1_SHARED_LD_TRANSACTIONS,
>> + "l1_shared_load_transactions",
>> + "Number of shared load transactions. Increments by 1 per transaction."
>> + "Transaction can be 32/64/96/128B"),
>> +
>> + _Q(L1_SHARED_ST_TRANSACTIONS,
>> + "l1_shared_store_transactions",
>> + "Number of shared store transactions. Increments by 1 per transaction."
>> + "Transaction can be 32/64/96/128B"),
>> +
>> + _Q(LOCAL_LD,
>> + "local_load",
>> + "Number of executed load instructions where state space is specified as"
>> + "local, increments per warp on a multiprocessor"),
>> +
>> + _Q(LOCAL_LD_TRANSACTIONS,
>> + "local_load_transactions",
>> + "Number of local load transactions from L1 cache. Increments by 1 per"
>> + "transaction. Transaction can be 32/64/96/128B"),
>> +
>> + _Q(LOCAL_ST,
>> + "local_store",
>> + "Number of executed store instructions where state space is specified as"
>> + "local, increments per warp on a multiprocessor"),
>> +
>> + _Q(LOCAL_ST_TRANSACTIONS,
>> + "local_store_transactions",
>> + "Number of local store transactions to L1 cache. Increments by 1 per"
>> + "transaction. Transaction can be 32/64/96/128B."),
>> +
>> + _Q(NOT_PRED_OFF_INST_EXECUTED,
>> + "not_predicated_off_thread_inst_executed",
>> + "Number of not predicated off instructions executed by all threads, does"
>> + "not include replays. For each instruction it increments by the number of"
>> + "threads that execute this instruction"),
>> +
>> + _Q(PROF_TRIGGER_0,
>> + "prof_trigger_00",
>> + "User profiled generic trigger that can be inserted in any place of the"
>> + "code to collect the related information. Increments per warp."),
>> +
>> + _Q(PROF_TRIGGER_1,
>> + "prof_trigger_01",
>> + "User profiled generic trigger that can be inserted in any place of the"
>> + "code to collect the related information. Increments per warp."),
>> +
>> + _Q(PROF_TRIGGER_2,
>> + "prof_trigger_02",
>> + "User profiled generic trigger that can be inserted in any place of the"
>> + "code to collect the related information. Increments per warp."),
>> +
>> + _Q(PROF_TRIGGER_3,
>> + "prof_trigger_03",
>> + "User profiled generic trigger that can be inserted in any place of the"
>> + "code to collect the related information. Increments per warp."),
>> +
>> + _Q(PROF_TRIGGER_4,
>> + "prof_trigger_04",
>> + "User profiled generic trigger that can be inserted in any place of the"
>> + "code to collect the related information. Increments per warp."),
>> +
>> + _Q(PROF_TRIGGER_5,
>> + "prof_trigger_05",
>> + "User profiled generic trigger that can be inserted in any place of the"
>> + "code to collect the related information. Increments per warp."),
>> +
>> + _Q(PROF_TRIGGER_6,
>> + "prof_trigger_06",
>> + "User profiled generic trigger that can be inserted in any place of the"
>> + "code to collect the related information. Increments per warp."),
>> +
>> + _Q(PROF_TRIGGER_7,
>> + "prof_trigger_07",
>> + "User profiled generic trigger that can be inserted in any place of the"
>> + "code to collect the related information. Increments per warp."),
>> +
>> + _Q(SHARED_LD,
>> + "shared_load",
>> + "Number of executed load instructions where state space is specified as"
>> + "shared, increments per warp on a multiprocessor"),
>> +
>> + _Q(SHARED_LD_REPLAY,
>> + "shared_load_replay",
>> + "Replays caused due to shared load bank conflict (when the addresses for"
>> + "two or more shared memory load requests fall in the same memory bank) or"
>> + "when there is no conflict but the total number of words accessed by all"
>> + "threads in the warp executing that instruction exceed the number of words"
>> + "that can be loaded in one cycle (256 bytes)"),
>> +
>> + _Q(SHARED_ST,
>> + "shared_store",
>> + "Number of executed store instructions where state space is specified as"
>> + "shared, increments per warp on a multiprocessor"),
>> +
>> + _Q(SHARED_ST_REPLAY,
>> + "shared_store_replay",
>> + "Replays caused due to shared store bank conflict (when the addresses for"
>> + "two or more shared memory store requests fall in the same memory bank) or"
>> + "when there is no conflict but the total number of words accessed by all"
>> + "threads in the warp executing that instruction exceed the number of words"
>> + "that can be stored in one cycle"),
>> +
>> + _Q(SM_CTA_LAUNCHED,
>> + "sm_cta_launched",
>> + "Number of thread blocks launched on a multiprocessor"),
>> +
>> + _Q(THREADS_LAUNCHED,
>> + "threads_launched",
>> + "Number of threads launched on a multiprocessor"),
>> +
>> + _Q(TH_INST_EXECUTED,
>> + "thread_inst_executed",
>> + "Number of instructions executed by all threads, does not include"
>> + "replays. For each instruction it increments by the number of threads in"
>> + "the warp that execute the instruction"),
>> +
>> + _Q(TH_INST_EXECUTED_0,
>> + "thread_inst_executed_0",
>> + "Number of instructions executed by all threads, does not include"
>> + "replays. For each instruction it increments by the number of threads in"
>> + "the warp that execute the instruction in pipeline 0"),
>> +
>> + _Q(TH_INST_EXECUTED_1,
>> + "thread_inst_executed_1",
>> + "Number of instructions executed by all threads, does not include"
>> + "replays. For each instruction it increments by the number of threads in"
>> + "the warp that execute the instruction in pipeline 1"),
>> +
>> + _Q(TH_INST_EXECUTED_2,
>> + "thread_inst_executed_2",
>> + "Number of instructions executed by all threads, does not include"
>> + "replays. For each instruction it increments by the number of threads in"
>> + "the warp that execute the instruction in pipeline 2"),
>> +
>> + _Q(TH_INST_EXECUTED_3,
>> + "thread_inst_executed_3",
>> + "Number of instructions executed by all threads, does not include"
>> + "replays. For each instruction it increments by the number of threads in"
>> + "the warp that execute the instruction in pipeline 3"),
>> +
>> + _Q(UNCACHED_GLD_TRANSACTIONS,
>> + "uncached_global_load_transaction",
>> + "Number of uncached global load transactions. Increments by 1 per"
>> + "transaction. Transaction can be 32/64/96/128B."),
>> +
>> + _Q(WARPS_LAUNCHED,
>> + "warps_launched",
>> + "Number of warps launched on a multiprocessor"),
>> };
>>
>> #undef _Q
>> --
>> 2.8.3
>>
>> _______________________________________________
>> mesa-dev mailing list
>> mesa-dev at lists.freedesktop.org
>> https://lists.freedesktop.org/mailman/listinfo/mesa-dev
More information about the mesa-dev
mailing list