[Nouveau] [PATCH 3/4] subdev/pmu/fuc: implement perf
Martin Peres
martin.peres at free.fr
Sun Feb 14 22:02:22 UTC 2016
On 26/10/15 20:13, Karol Herbst wrote:
> From: Karol Herbst <git at karolherbst.de>
>
> ---
> drm/nouveau/nvkm/subdev/pmu/fuc/gf100.fuc3.h | 788 +++++++++++++++------------
> drm/nouveau/nvkm/subdev/pmu/fuc/gf119.fuc4.h | 740 ++++++++++++++-----------
> drm/nouveau/nvkm/subdev/pmu/fuc/gk104.fuc4.h | 740 ++++++++++++++-----------
> drm/nouveau/nvkm/subdev/pmu/fuc/gk208.fuc5.h | 710 ++++++++++++++----------
> drm/nouveau/nvkm/subdev/pmu/fuc/gt215.fuc3.h | 755 ++++++++++++++-----------
> drm/nouveau/nvkm/subdev/pmu/fuc/os.h | 4 +
> drm/nouveau/nvkm/subdev/pmu/fuc/perf.fuc | 148 +++++
> 7 files changed, 2267 insertions(+), 1618 deletions(-)
>
>
> diff --git a/drm/nouveau/nvkm/subdev/pmu/fuc/os.h b/drm/nouveau/nvkm/subdev/pmu/fuc/os.h
> index c8b06cb..53508d9 100644
> --- a/drm/nouveau/nvkm/subdev/pmu/fuc/os.h
> +++ b/drm/nouveau/nvkm/subdev/pmu/fuc/os.h
> @@ -49,4 +49,8 @@
> #define I2C__MSG_DATA0_WR08_REG 0:7
> #define I2C__MSG_DATA1_WR08_VAL 0:7
>
> +
> +/* PERF: message identifiers */
> +#define PERF_MSG_LOAD 1
Could you document the expected inputs and outputs of this message as a
comment?
// IN: don't care
// OUT:
// - byte 3: PCIe load (GF100+)
// - byte 2: Memory load
// - byte 1: Video decoding engines' load
// - byte 0: GR load
> +
> #endif
> diff --git a/drm/nouveau/nvkm/subdev/pmu/fuc/perf.fuc b/drm/nouveau/nvkm/subdev/pmu/fuc/perf.fuc
> index 38eadf7..69a8f8d 100644
> --- a/drm/nouveau/nvkm/subdev/pmu/fuc/perf.fuc
> +++ b/drm/nouveau/nvkm/subdev/pmu/fuc/perf.fuc
> @@ -30,6 +30,18 @@ process(PROC_PERF, #perf_init, #perf_recv)
> * PERF data segment
> *****************************************************************************/
> #ifdef INCLUDE_DATA
> +perf_attr_start:
> +// parameters
> +perf_polling_period_us: .b32 100000
> +
> +// engine usage percentage
It is a tiny bit misleading to say percentage when you mean per-255. How
about:
// engine usage (0-255)
> +perf_eng_gr: .b8 0
> +perf_eng_vdec: .b8 0
> +perf_eng_mc: .b8 0
> +#if NVKM_PPWR_CHIPSET >= GF100
> +perf_eng_pcie: .b8 0
> +#endif
> +.align 4
> #endif
>
> /******************************************************************************
> @@ -46,6 +58,78 @@ process(PROC_PERF, #perf_init, #perf_recv)
> // $r11 - data1
> // $r0 - zero
> perf_recv:
> + push $r1
> +
> + imm32($r10, PROC_HOST)
> + cmp b32 $r14 $r10
> + bra ne #perf_recv_not_host
> + cmp b32 $r13 PERF_MSG_LOAD
Thanks for doing this and not assuming straight away that this is going
to be the only type of messages you will receive from the host :)
> + bra e #perf_load
> + bra #perf_recv_exit
> +
> +perf_load:
> + clear b32 $r11
> + clear b32 $r12
> +#if NVKM_PPWR_CHIPSET >= GF100
> + ld(b8, $r12, #perf_eng_pcie)
> + shl b32 $r12 8
> +#endif
> + ld(b8, $r12, #perf_eng_mc)
> + shl b32 $r12 8
> + ld(b8, $r12, #perf_eng_vdec)
> + shl b32 $r12 8
> + ld(b8, $r12, #perf_eng_gr)
Simple and effective, that's nice :)
> + call(send)
> + bra #perf_recv_exit
> +
> +perf_recv_not_host:
> + call(perf_counter_readout)
> +
> + ld(b32, $r14, #perf_polling_period_us)
> + call #ticks_from_us
> + call(timer)
> +
> +perf_recv_exit:
> + pop $r1
> + ret
> +
> +
> +// description
> +//
> +// $r15 - current (perf)
> +// $r0 - zero
> +perf_counter_readout:
> + nv_iord($r14, NV_PPWR_COUNTER_COUNT(0))
> + div $r14 $r14 0xff
Not a big fan of the loss in precision. Why do you do that instead of
what I did here:
https://cgit.freedesktop.org/~mperes/nouveau/tree/nvkm/subdev/pwr/fuc/perf.fuc?h=ppwr_rework&id=27310fa8fdc39e54a3f4383fada96a3562c5a022#n134
With my solution, you get no loss in precision. And you also poll the
counters faster so it means a greater accuracy. I also miss less cycles.
All in all, could you please reuse my polling function please? :D
> +
> + nv_iord($r13, NV_PPWR_COUNTER_COUNT(1))
> + div $r13 $r13 $r14
> + st(b8, #perf_eng_gr, $r13)
> +
> + nv_iord($r13, NV_PPWR_COUNTER_COUNT(2))
> + div $r13 $r13 $r14
> + st(b8, #perf_eng_vdec, $r13)
> +
> + nv_iord($r13, NV_PPWR_COUNTER_COUNT(3))
> + div $r13 $r13 $r14
> + st(b8, #perf_eng_mc, $r13)
> +
> +#if NVKM_PPWR_CHIPSET >= GF100
> + nv_iord($r13, NV_PPWR_COUNTER_COUNT(4))
> + div $r13 $r13 $r14
> + st(b8, #perf_eng_pcie, $r13)
> +#endif
> +
> + // reset the counters
> + imm32($r14, NV_PPWR_COUNTER_COUNT_RESET)
> + nv_iowr(NV_PPWR_COUNTER_COUNT(0), $r14)
> + nv_iowr(NV_PPWR_COUNTER_COUNT(1), $r14)
> + nv_iowr(NV_PPWR_COUNTER_COUNT(2), $r14)
> + nv_iowr(NV_PPWR_COUNTER_COUNT(3), $r14)
> +#if NVKM_PPWR_CHIPSET >= GF100
> + nv_iowr(NV_PPWR_COUNTER_COUNT(4), $r14)
> +#endif
> +
> ret
>
> // description
> @@ -53,5 +137,69 @@ perf_recv:
> // $r15 - current (perf)
> // $r0 - zero
> perf_init:
> + // set up the total ticks counter first
> + imm32($r14, NV_PPWR_COUNTER_MODE_ALWAYS)
> + nv_iowr(NV_PPWR_COUNTER_MODE(0), $r14)
> +
> + // set up the other counters, with fermi there are more
> + imm32($r14, NV_PPWR_COUNTER_MODE_IF_NOT_ALL)
> + nv_iowr(NV_PPWR_COUNTER_MODE(1), $r14)
> + nv_iowr(NV_PPWR_COUNTER_MODE(2), $r14)
> + nv_iowr(NV_PPWR_COUNTER_MODE(3), $r14)
> +#if NVKM_PPWR_CHIPSET >= GF100
> + nv_iowr(NV_PPWR_COUNTER_MODE(4), $r14)
> +#endif
> +
> + // core load counter
> + imm32($r14,
> + NV_PPWR_COUNTER_SIG_GR_IDLE
> + | NV_PPWR_COUNTER_SIG_GR_GPC_IDLE
> + | NV_PPWR_COUNTER_SIG_GR_ROP_IDLE
> +#if NVKM_PPWR_CHIPSET >= GF100
> + | NV_PPWR_COUNTER_SIG_GR_HUB_IDLE
> + | NV_PPWR_COUNTER_SIG_PCOPY0_IDLE
> + | NV_PPWR_COUNTER_SIG_PCOPY1_IDLE
> +#if NVKM_PPWR_CHIPSET >= GK104
> + | NV_PPWR_COUNTER_SIG_PCOPY2_IDLE
> +#endif
> +#endif
> + )
> + nv_iowr(NV_PPWR_COUNTER_MASK(1), $r14)
> +
> + // video load counter
> + imm32($r14,
> + NV_PPWR_COUNTER_SIG_PVLD_IDLE
> + | NV_PPWR_COUNTER_SIG_PPDEC_IDLE
> + | NV_PPWR_COUNTER_SIG_PPPP_IDLE
> +#if NVKM_PPWR_CHIPSET >= GK104
> + | NV_PPWR_COUNTER_SIG_PVENC
> +#endif
> + )
> + nv_iowr(NV_PPWR_COUNTER_MASK(2), $r14)
> +
> + // memory load counter
> + imm32($r14,
> +#if NVKM_PPWR_CHIPSET >= GF100
> + NV_PPWR_COUNTER_SIG_BFB_PART0_REQ
> +#else
> + NV_PPWR_COUNTER_SIG_FB_PART0_REQ
> +#endif
> + )
> + nv_iowr(NV_PPWR_COUNTER_MASK(3), $r14)
> +
> + // pcie load counter
> +#if NVKM_PPWR_CHIPSET >= GF100
> + imm32($r14, NV_PPWR_COUNTER_SIG_PCIE)
> + nv_iowr(NV_PPWR_COUNTER_MASK(4), $r14)
> +#endif
> +
> + // initial read out
> + call(perf_counter_readout)
> +
> + // schedule the next read out
> + ld(b32, $r14, #perf_polling_period_us)
> + call #ticks_from_us
> + call(timer)
> +
> ret
> #endif
Looks about good, except the polling function!
More information about the Nouveau
mailing list